53997da5d vs 03a1b695e - NVFuser codegen diff
53997da5d
53997da5d Use set allocation domains if used by TMA or MmaOp (#4234)
[browse]
Naoya Maruyama <naoyam@users.noreply.github.com>
Thu Apr 10 18:40:23 2025 -0700
03a1b695e
03a1b695e temp
[browse]
Naoya Maruyama <nmaruyama@nvidia.com>
Fri Apr 11 08:59:59 2025 -0700
Command: build/test_view
GPUs:
['NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n']
nvcc --version
matches between runs
> nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
Env
matches between runs
BIBINPUTS=.//:/home/nmaruyama/tex/bib//:
BSTINPUTS=.//:/home/nmaruyama/tex/bst//:
CLASSPATH=.
CONDA_DEFAULT_ENV=/opt/conda/pytorch
CONDA_EXE=/opt/conda/miniconda/bin/conda
CONDA_PREFIX=/opt/conda/pytorch
CONDA_PROMPT_MODIFIER=(/opt/conda/pytorch)
CONDA_PYTHON_EXE=/opt/conda/miniconda/bin/python
CONDA_SHLVL=1
CUDA_CACHE_MAX_SIZE=4294967296
CVSROOT=smghome:/home/naoya/cvs
CVS_RSH=ssh
DEBUG_SERDE=disable
ECLIPSE_HOME=/home/nmaruyama/apps/eclipse
EDITOR=vim
GREP_COLOR=33
GTK_IM_MODULE=scim
HOME=/home/nmaruyama
HOSTNAME=debug3.cuda128_clang19_ubuntu24.04_python3.10_pyt
I_MPI_CC=icc
I_MPI_CXX=icpc
I_MPI_F77=ifort
I_MPI_F90=ifort
LANG=C
LC_ALL=en_US.UTF-8
LC_TERMINAL=iTerm2
LC_TERMINAL_VERSION=3.5.11
LD_LIBRARY_PATH=/usr/lib64
LESSGLOBALTAGS=global
LOGNAME=nmaruyama
MOZILLA_FIVE_HOME=/usr/lib/mozilla
NVFUSER_DISABLE=parallel_compile
NVFUSER_DUMP=cuda_to_file,ptxas_verbose,ptx
NVFUSER_ENABLE=static_fusion_count
NVFUSER_TEST_RANDOM_SEED=0
OLDPWD=/home/nmaruyama
PATH=/opt/conda/pytorch/bin:/opt/conda/miniconda/condabin:/home/nmaruyama/perl5/bin:/home/nmaruyama/miniconda3/bin:/home/nmaruyama/scratch/nvfuser/utils:/home/nmaruyama/bin:/home/nmaruyama/apps/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin:/home/nmaruyama/projects/tools/tau/x86_64/bin:/home/nmaruyama/projects/tools/pdtoolkit/x86_64/bin:/home/nmaruyama/projects/tools/papi/bin:/home/nmaruyama/projects/tools/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/nmaruyama/apps/eclipse
PERL5LIB=/home/nmaruyama/perl5/lib/perl5
PERL_LOCAL_LIB_ROOT=/home/nmaruyama/perl5
PERL_MB_OPT=--install_base "/home/nmaruyama/perl5"
PERL_MM_OPT=INSTALL_BASE=/home/nmaruyama/perl5
PROMPT=%m:%1~$
PWD=/home/nmaruyama/scratch/nvfuser/debug3
QT_IM_MODULE=scim
R_LIBS_USER=/home/nmaruyama/apps/r-lib
SHELL=/bin/zsh
SHLVL=3
SSH_CLIENT=192.168.128.1 56024 22
SSH_CONNECTION=192.168.128.1 56024 192.168.128.3 22
SSH_TTY=/dev/pts/0
SVN_EDITOR=vim
TERM=xterm-256color
TEXINPUTS=.//:/home/nmaruyama/tex/sty//:
TZ=America/Los_Angeles
USER=nmaruyama
_=/usr/bin/printenv
WORDCHARS=*?[]~=&;!#$%^(){}
NVFuser preamble
matches between runs
#ifdef __NVCC__
#include <complex>
#endif // __NVCC__
namespace {
using int8_t = signed char;
using uint8_t = unsigned char;
using int16_t = short int;
using uint16_t = unsigned short int;
using int32_t = int;
using uint32_t = unsigned int;
using int64_t = long long int;
using uint64_t = unsigned long long int;
// Modified from cuda.h
// Opaque 128-byte blob with 64-byte alignment. NOTE(review): per the
// provenance comment above this mirrors the size/alignment contract of the
// driver's CUtensorMap descriptor; kernels never inspect the contents,
// they only take its address -- confirm against the cuda.h definition.
struct TensorMap {
alignas(64)
uint64_t opaque[16];
};
typedef int nvfuser_index_t; // NOTE: index type hard-coded as int for display only
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#ifdef __NVCC__
#include <type_traits>
#else
// The following namespace std is modified from LLVM, see the following
// copyright information
//
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// copy-pasted from some llvm files:
// - https://github.com/llvm/llvm-project/blob/main/libcxx/include/type_traits
// -
// https://github.com/llvm/llvm-project/blob/main/clang/test/Headers/Inputs/include/type_traits
// Minimal <type_traits> shim used when compiling with a non-NVCC front end
// (the real header is included under __NVCC__ above). Copied/adapted from
// LLVM libc++ per the copyright block above; only the pieces the generated
// kernels need are provided.
namespace std {
// declval: the int overload (rvalue-reference return) wins overload
// resolution; the long overload is the fallback. Declarations only --
// usable exclusively in unevaluated contexts (decltype).
template <class _Tp>
_Tp&& __declval(int);
template <class _Tp>
_Tp __declval(long);
template <class _Tp>
decltype(__declval<_Tp>(0)) declval() noexcept;
// Compile-time constant wrapper (subset of std::integral_constant: no
// operator() or conversion operator).
template <class _Tp, _Tp __v>
struct integral_constant {
static const _Tp value = __v;
typedef _Tp value_type;
typedef integral_constant type;
};
typedef integral_constant<bool, true> true_type;
typedef integral_constant<bool, false> false_type;
// is_same, functional
template <class _Tp, class _Up>
struct is_same : public false_type {};
template <class _Tp>
struct is_same<_Tp, _Tp> : public true_type {};
template <class T, class U>
constexpr bool is_same_v = is_same<T, U>::value;
// is_integral, for some types.
// NOTE(review): only the signed/plain types below are covered; unsigned
// types (and signed char) report false here, unlike the standard trait --
// confirm the codegen never queries those.
template <class _Tp>
struct is_integral : public integral_constant<bool, false> {};
template <>
struct is_integral<bool> : public integral_constant<bool, true> {};
template <>
struct is_integral<char> : public integral_constant<bool, true> {};
template <>
struct is_integral<short> : public integral_constant<bool, true> {};
template <>
struct is_integral<int> : public integral_constant<bool, true> {};
template <>
struct is_integral<long> : public integral_constant<bool, true> {};
template <>
struct is_integral<long long> : public integral_constant<bool, true> {};
// enable_if, functional
template <bool _C, typename _Tp>
struct enable_if {};
template <typename _Tp>
struct enable_if<true, _Tp> {
using type = _Tp;
};
template <bool b, class T = void>
using enable_if_t = typename enable_if<b, T>::type;
// remove_const / remove_volatile / remove_cv plus their _t aliases.
template <class _Tp>
struct remove_const {
typedef _Tp type;
};
template <class _Tp>
struct remove_const<const _Tp> {
typedef _Tp type;
};
template <class _Tp>
using remove_const_t = typename remove_const<_Tp>::type;
template <class _Tp>
struct remove_volatile {
typedef _Tp type;
};
template <class _Tp>
struct remove_volatile<volatile _Tp> {
typedef _Tp type;
};
template <class _Tp>
using remove_volatile_t = typename remove_volatile<_Tp>::type;
template <class _Tp>
struct remove_cv {
typedef typename remove_volatile<typename remove_const<_Tp>::type>::type type;
};
template <class _Tp>
using remove_cv_t = typename remove_cv<_Tp>::type;
// is_floating_point: float/double/long double only, with cv stripped.
template <class _Tp>
struct __libcpp_is_floating_point : public false_type {};
template <>
struct __libcpp_is_floating_point<float> : public true_type {};
template <>
struct __libcpp_is_floating_point<double> : public true_type {};
template <>
struct __libcpp_is_floating_point<long double> : public true_type {};
template <class _Tp>
struct is_floating_point
: public __libcpp_is_floating_point<typename remove_cv<_Tp>::type> {};
// is_arithmetic = integral || floating point. NOTE(review): unlike the
// standard trait, cv-qualifiers are not stripped on the integral side.
template <class _Tp>
struct is_arithmetic
: public integral_constant<
bool,
is_integral<_Tp>::value || is_floating_point<_Tp>::value> {};
template <class _Tp>
inline constexpr bool is_arithmetic_v = is_arithmetic<_Tp>::value;
// __numeric_type<_Tp>::type is the arithmetic type _Tp promotes to
// (float stays float; integral types go to double), selected by overload
// resolution on __test.
template <class _Tp>
struct __numeric_type {
static void __test(...);
static float __test(float);
static double __test(char);
static double __test(int);
static double __test(unsigned);
static double __test(long);
static double __test(unsigned long);
static double __test(long long);
static double __test(unsigned long long);
static double __test(double);
static long double __test(long double);
typedef decltype(__test(declval<_Tp>())) type;
static const bool value = !is_same<type, void>::value;
};
template <>
struct __numeric_type<void> {
static const bool value = true;
};
// __promote
// __promote<_A1[,_A2[,_A3]]>::type is the common arithmetic type of the
// one-to-three arguments; mirrors libc++'s helper used by math overloads.
template <
class _A1,
class _A2 = void,
class _A3 = void,
bool = __numeric_type<_A1>::value && __numeric_type<_A2>::value &&
__numeric_type<_A3>::value>
class __promote_imp {
public:
static const bool value = false;
};
template <class _A1, class _A2, class _A3>
class __promote_imp<_A1, _A2, _A3, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
typedef typename __promote_imp<_A3>::type __type3;
public:
// Usual arithmetic conversions via the + expression's type.
typedef decltype(__type1() + __type2() + __type3()) type;
static const bool value = true;
};
template <class _A1, class _A2>
class __promote_imp<_A1, _A2, void, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
public:
typedef decltype(__type1() + __type2()) type;
static const bool value = true;
};
template <class _A1>
class __promote_imp<_A1, void, void, true> {
public:
typedef typename __numeric_type<_A1>::type type;
static const bool value = true;
};
template <class _A1, class _A2 = void, class _A3 = void>
class __promote : public __promote_imp<_A1, _A2, _A3> {};
} // namespace std
#endif
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#ifdef __NVCC__
#include <bit>
#else
// Fallback std::bit_cast for the non-NVCC path (<bit> is included above
// when __NVCC__ is defined). SFINAE-restricted to same-size types.
namespace std {
template <class To, class From>
std::enable_if_t<sizeof(To) == sizeof(From), To> bit_cast(
const From& src) noexcept {
// NOTE(review): implemented with reinterpret_cast rather than memcpy;
// strictly a type-aliasing violation in ISO C++, but the conventional,
// compiler-tolerated idiom in CUDA device code -- confirm the targeted
// front ends keep accepting it.
return *reinterpret_cast<const To*>(&src);
}
} // namespace std
// Intentionally not supporting signed integers to stay consistent with
// https://en.cppreference.com/w/cpp/numeric/bit_ceil
// Smallest power of two not less than x; bit_ceil(0) == 1, matching
// std::bit_ceil. As with the standard function, the result is undefined
// when it is not representable in the type.
__device__ __forceinline__ unsigned int bit_ceil(unsigned int x) {
return x <= 1u ? 1u : 1u << (32 - __clz(x - 1));
}
// 64-bit overload of bit_ceil; bit_ceil(0) == 1, result undefined when
// not representable (same contract as std::bit_ceil).
__device__ __forceinline__ unsigned long long bit_ceil(unsigned long long x) {
return x <= 1ull ? 1ull : 1ull << (64 - __clzll(x - 1));
}
#endif
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#ifndef __NVCC__
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)
//===----------------------------------------------------------------------===//
// The following namespace std is modified from LLVM, see the following
// copyright information
//
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// copy-pasted from the following llvm file:
// https://github.com/llvm/llvm-project/blob/main/libcxx/include/complex
namespace std {
template <class _Tp>
class complex;
template <class _Tp>
complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& __w);
template <class _Tp>
complex<_Tp> operator/(const complex<_Tp>& __x, const complex<_Tp>& __y);
template <class _Tp>
class complex {
public:
typedef _Tp value_type;
private:
value_type __re_;
value_type __im_;
public:
constexpr complex(
const value_type& __re = value_type(),
const value_type& __im = value_type())
: __re_(__re), __im_(__im) {}
template <class _Xp>
constexpr complex(const complex<_Xp>& __c)
: __re_(__c.real()), __im_(__c.imag()) {}
constexpr value_type real() const {
return __re_;
}
constexpr value_type imag() const {
return __im_;
}
void real(value_type __re) {
__re_ = __re;
}
void imag(value_type __im) {
__im_ = __im;
}
constexpr operator bool() const {
return real() || imag();
}
complex& operator=(const value_type& __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
complex& operator+=(const value_type& __re) {
__re_ += __re;
return *this;
}
complex& operator-=(const value_type& __re) {
__re_ -= __re;
return *this;
}
complex& operator*=(const value_type& __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
complex& operator/=(const value_type& __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
complex& operator=(const complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
complex& operator+=(const complex<_Xp>& __c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
complex& operator-=(const complex<_Xp>& __c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
complex& operator*=(const complex<_Xp>& __c) {
*this = *this * complex(__c.real(), __c.imag());
return *this;
}
template <class _Xp>
complex& operator/=(const complex<_Xp>& __c) {
*this = *this / complex(__c.real(), __c.imag());
return *this;
}
};
template <>
class complex<double>;
template <>
class complex<float> {
float __re_;
float __im_;
public:
typedef float value_type;
constexpr complex(float __re = 0.0f, float __im = 0.0f)
: __re_(__re), __im_(__im) {}
explicit constexpr complex(const complex<double>& __c);
// copy volatile to non-volatile
constexpr complex(const volatile complex<float>& other)
: __re_(other.__re_), __im_(other.__im_) {}
constexpr complex(const complex<float>& other)
: __re_(other.__re_), __im_(other.__im_) {}
constexpr float real() const {
return __re_;
}
constexpr float imag() const {
return __im_;
}
void real(value_type __re) {
__re_ = __re;
}
void imag(value_type __im) {
__im_ = __im;
}
constexpr operator bool() const {
return real() || imag();
}
complex& operator=(float __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
complex& operator+=(float __re) {
__re_ += __re;
return *this;
}
complex& operator-=(float __re) {
__re_ -= __re;
return *this;
}
complex& operator*=(float __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
complex& operator/=(float __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
complex& operator=(const complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// non-volatile to volatile
template <class _Xp>
volatile complex& operator=(const complex<_Xp>& __c) volatile {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// volatile to non-volatile
template <class _Xp>
complex& operator=(const volatile complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// volatile to volatile
template <class _Xp>
volatile complex& operator=(const volatile complex<_Xp>& __c) volatile {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
complex& operator+=(const complex<_Xp>& __c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
complex& operator-=(const complex<_Xp>& __c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
complex& operator*=(const complex<_Xp>& __c) {
*this = *this * complex(__c.real(), __c.imag());
return *this;
}
template <class _Xp>
complex& operator/=(const complex<_Xp>& __c) {
*this = *this / complex(__c.real(), __c.imag());
return *this;
}
};
template <>
class complex<double> {
double __re_;
double __im_;
public:
typedef double value_type;
constexpr complex(double __re = 0.0, double __im = 0.0)
: __re_(__re), __im_(__im) {}
constexpr complex(const complex<float>& __c);
// copy volatile to non-volatile
constexpr complex(const volatile complex<double>& other)
: __re_(other.__re_), __im_(other.__im_) {}
constexpr complex(const complex<double>& other)
: __re_(other.__re_), __im_(other.__im_) {}
constexpr double real() const {
return __re_;
}
constexpr double imag() const {
return __im_;
}
void real(value_type __re) {
__re_ = __re;
}
void imag(value_type __im) {
__im_ = __im;
}
constexpr operator bool() const {
return real() || imag();
}
complex& operator=(double __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
complex& operator+=(double __re) {
__re_ += __re;
return *this;
}
complex& operator-=(double __re) {
__re_ -= __re;
return *this;
}
complex& operator*=(double __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
complex& operator/=(double __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
complex& operator=(const complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// non-volatile to volatile
template <class _Xp>
volatile complex& operator=(const complex<_Xp>& __c) volatile {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// volatile to non-volatile
template <class _Xp>
complex& operator=(const volatile complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// volatile to volatile
template <class _Xp>
volatile complex& operator=(const volatile complex<_Xp>& __c) volatile {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
complex& operator+=(const complex<_Xp>& __c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
complex& operator-=(const complex<_Xp>& __c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
complex& operator*=(const complex<_Xp>& __c) {
*this = *this * complex(__c.real(), __c.imag());
return *this;
}
template <class _Xp>
complex& operator/=(const complex<_Xp>& __c) {
*this = *this / complex(__c.real(), __c.imag());
return *this;
}
};
inline constexpr complex<float>::complex(const complex<double>& __c)
: __re_(__c.real()), __im_(__c.imag()) {}
inline constexpr complex<double>::complex(const complex<float>& __c)
: __re_(__c.real()), __im_(__c.imag()) {}
// 26.3.6 operators:
template <class _Tp>
inline complex<_Tp> operator+(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
// complex + real scalar: copy the complex operand, then add the scalar
// to its real part via operator+=.
template <class _Tp>
inline complex<_Tp> operator+(const complex<_Tp>& __x, const _Tp& __y) {
complex<_Tp> __sum(__x);
__sum += __y;
return __sum;
}
template <class _Tp>
inline complex<_Tp> operator+(const _Tp& __x, const complex<_Tp>& __y) {
complex<_Tp> __t(__y);
__t += __x;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator-(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator-(const complex<_Tp>& __x, const _Tp& __y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator-(const _Tp& __x, const complex<_Tp>& __y) {
complex<_Tp> __t(-__y);
__t += __x;
return __t;
}
// Full complex multiply. Happy path is the textbook
// (a+bi)(c+di) = (ac-bd) + (ad+bc)i; the recovery branch produces a
// correctly signed infinite result when naive arithmetic yielded NaN from
// infinite/overflowing operands (same scheme as C Annex G / libc++).
template <class _Tp>
complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) {
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __ac = __a * __c;
_Tp __bd = __b * __d;
_Tp __ad = __a * __d;
_Tp __bc = __b * __c;
_Tp __x = __ac - __bd;
_Tp __y = __ad + __bc;
if (isnan(__x) && isnan(__y)) {
bool __recalc = false;
// First factor has an infinite component: project it onto (+/-1, +/-0)
// and zero out NaN components of the other factor.
if (isinf(__a) || isinf(__b)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
// Same treatment when the second factor is infinite.
if (isinf(__c) || isinf(__d)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
__recalc = true;
}
// An infinite partial product from finite inputs (overflow) also
// triggers the recovery path.
if (!__recalc &&
(isinf(__ac) || isinf(__bd) || isinf(__ad) || isinf(__bc))) {
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (__recalc) {
// Recompute with sanitized operands, scaled back up to infinity.
__x = _Tp(INFINITY) * (__a * __c - __b * __d);
__y = _Tp(INFINITY) * (__a * __d + __b * __c);
}
}
return complex<_Tp>(__x, __y);
}
template <class _Tp>
inline complex<_Tp> operator*(const complex<_Tp>& __x, const _Tp& __y) {
complex<_Tp> __t(__x);
__t *= __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator*(const _Tp& __x, const complex<_Tp>& __y) {
complex<_Tp> __t(__y);
__t *= __x;
return __t;
}
// Complex division with Smith-style scaling: the divisor is scaled by
// 2^-ilogb(max(|c|,|d|)) so c*c + d*d cannot overflow/underflow, and the
// quotient is rescaled via scalbn. The NaN clean-up branches follow
// C Annex G / libc++: nonzero/zero, inf/finite and finite/inf each get a
// well-defined, correctly signed result instead of NaN.
template <class _Tp>
complex<_Tp> operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) {
int __ilogbw = 0;
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __logbw = logb(fmax(fabs(__c), fabs(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbn(__c, -__ilogbw);
__d = scalbn(__d, -__ilogbw);
}
_Tp __denom = __c * __c + __d * __d;
_Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
_Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
// (non-NaN)/0 -> infinity signed by the divisor's real part.
if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) {
__x = copysign(_Tp(INFINITY), __c) * __a;
__y = copysign(_Tp(INFINITY), __c) * __b;
// infinite / finite -> infinity with recovered signs.
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
__x = _Tp(INFINITY) * (__a * __c + __b * __d);
__y = _Tp(INFINITY) * (__b * __c - __a * __d);
// finite / infinite -> correctly signed zero.
} else if (
isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) && isfinite(__b)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
__x = _Tp(0) * (__a * __c + __b * __d);
__y = _Tp(0) * (__b * __c - __a * __d);
}
}
return complex<_Tp>(__x, __y);
}
template <class _Tp>
inline complex<_Tp> operator/(const complex<_Tp>& __x, const _Tp& __y) {
return complex<_Tp>(__x.real() / __y, __x.imag() / __y);
}
template <class _Tp>
inline complex<_Tp> operator/(const _Tp& __x, const complex<_Tp>& __y) {
complex<_Tp> __t(__x);
__t /= __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator+(const complex<_Tp>& __x) {
return __x;
}
template <class _Tp>
inline complex<_Tp> operator-(const complex<_Tp>& __x) {
return complex<_Tp>(-__x.real(), -__x.imag());
}
template <class _Tp>
inline constexpr bool operator==(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
return __x.real() == __y.real() && __x.imag() == __y.imag();
}
template <class _Tp>
inline constexpr bool operator==(const complex<_Tp>& __x, const _Tp& __y) {
return __x.real() == __y && __x.imag() == 0;
}
template <class _Tp>
inline constexpr bool operator==(const _Tp& __x, const complex<_Tp>& __y) {
return __x == __y.real() && 0 == __y.imag();
}
template <class _Tp>
inline constexpr bool operator!=(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
return !(__x == __y);
}
template <class _Tp>
inline constexpr bool operator!=(const complex<_Tp>& __x, const _Tp& __y) {
return !(__x == __y);
}
template <class _Tp>
inline constexpr bool operator!=(const _Tp& __x, const complex<_Tp>& __y) {
return !(__x == __y);
}
template <class _Tp>
inline constexpr bool operator&&(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
return bool(__x) && bool(__y);
}
template <class _Tp>
inline constexpr bool isnan(const complex<_Tp>& __x) {
return isnan(__x.real()) || isnan(__x.imag());
}
template <class _Tp>
inline constexpr bool operator||(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
return bool(__x) || bool(__y);
}
// 26.3.7 values:
template <
class _Tp,
bool = is_integral<_Tp>::value,
bool = is_floating_point<_Tp>::value>
struct __libcpp_complex_overload_traits {};
// Integral Types
template <class _Tp>
struct __libcpp_complex_overload_traits<_Tp, true, false> {
typedef double _ValueType;
typedef complex<double> _ComplexType;
};
// Floating point types
template <class _Tp>
struct __libcpp_complex_overload_traits<_Tp, false, true> {
typedef _Tp _ValueType;
typedef complex<_Tp> _ComplexType;
};
// real
template <class _Tp>
inline constexpr _Tp real(const complex<_Tp>& __c) {
return __c.real();
}
template <class _Tp>
inline constexpr typename __libcpp_complex_overload_traits<_Tp>::_ValueType real(
_Tp __re) {
return __re;
}
// imag
template <class _Tp>
inline constexpr _Tp imag(const complex<_Tp>& __c) {
return __c.imag();
}
template <class _Tp>
inline constexpr typename __libcpp_complex_overload_traits<_Tp>::_ValueType imag(
_Tp) {
return 0;
}
// abs
template <class _Tp>
inline _Tp abs(const complex<_Tp>& __c) {
return hypot(__c.real(), __c.imag());
}
// arg
template <class _Tp>
inline _Tp arg(const complex<_Tp>& __c) {
return atan2(__c.imag(), __c.real());
}
template <class _Tp>
inline typename enable_if<
is_integral<_Tp>::value || is_same<_Tp, double>::value,
double>::type
arg(_Tp __re) {
return atan2(0., __re);
}
template <class _Tp>
inline typename enable_if<is_same<_Tp, float>::value, float>::type arg(
_Tp __re) {
return atan2f(0.F, __re);
}
} // namespace std
namespace std {
using ::isfinite;
using ::isinf;
using ::isnan;
using ::signbit;
using ::abs;
using ::acos;
using ::acosf;
using ::asin;
using ::asinf;
using ::atan;
using ::atan2;
using ::atan2f;
using ::atanf;
using ::ceil;
using ::ceilf;
using ::cos;
using ::cosf;
using ::cosh;
using ::coshf;
using ::exp;
using ::expf;
using ::fabs;
using ::fabsf;
using ::floor;
using ::floorf;
using ::fmod;
using ::fmodf;
using ::frexp;
using ::frexpf;
using ::ldexp;
using ::ldexpf;
using ::log;
using ::logf;
using ::log10;
using ::log10f;
using ::modf;
using ::modff;
using ::pow;
using ::powf;
using ::sin;
using ::sinf;
using ::sinh;
using ::sinhf;
using ::sqrt;
using ::sqrtf;
using ::tan;
using ::tanf;
using ::tanh;
using ::tanhf;
using ::acosh;
using ::acoshf;
using ::asinh;
using ::asinhf;
using ::atanh;
using ::atanhf;
using ::cbrt;
using ::cbrtf;
using ::copysign;
using ::copysignf;
using ::erf;
using ::erfc;
using ::erfcf;
using ::erff;
using ::exp2;
using ::exp2f;
using ::expm1;
using ::expm1f;
using ::fdim;
using ::fdimf;
using ::fma;
using ::fmaf;
using ::fmax;
using ::fmaxf;
using ::fmin;
using ::fminf;
using ::hypot;
using ::hypotf;
using ::ilogb;
using ::ilogbf;
using ::lgamma;
using ::lgammaf;
using ::llrint;
using ::llrintf;
using ::llround;
using ::llroundf;
using ::log1p;
using ::log1pf;
using ::log2;
using ::log2f;
using ::logb;
using ::logbf;
using ::lrint;
using ::lrintf;
using ::lround;
using ::lroundf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nearbyintf;
using ::nextafter;
using ::nextafterf;
using ::remainder;
using ::remainderf;
using ::remquo;
using ::remquof;
using ::rint;
using ::rintf;
using ::round;
using ::roundf;
using ::scalbln;
using ::scalblnf;
using ::scalbn;
using ::scalbnf;
using ::tgamma;
using ::tgammaf;
using ::trunc;
using ::truncf;
} // namespace std
namespace std {
// norm
// norm(c) = |c|^2. When either component is infinite, return +inf via
// abs() directly rather than squaring -- inf*inf plus a NaN partner
// component would otherwise contaminate the result (libc++ behavior).
template <class _Tp>
inline _Tp norm(const complex<_Tp>& __c) {
if (isinf(__c.real()))
return abs(__c.real());
if (isinf(__c.imag()))
return abs(__c.imag());
return __c.real() * __c.real() + __c.imag() * __c.imag();
}
template <class _Tp>
inline typename __libcpp_complex_overload_traits<_Tp>::_ValueType norm(
_Tp __re) {
typedef typename __libcpp_complex_overload_traits<_Tp>::_ValueType _ValueType;
return static_cast<_ValueType>(__re) * __re;
}
// conj
template <class _Tp>
inline complex<_Tp> conj(const complex<_Tp>& __c) {
return complex<_Tp>(__c.real(), -__c.imag());
}
template <class _Tp>
inline typename __libcpp_complex_overload_traits<_Tp>::_ComplexType conj(
_Tp __re) {
typedef
typename __libcpp_complex_overload_traits<_Tp>::_ComplexType _ComplexType;
return _ComplexType(__re);
}
// proj
template <class _Tp>
inline complex<_Tp> proj(const complex<_Tp>& __c) {
complex<_Tp> __r = __c;
if (isinf(__c.real()) || isinf(__c.imag()))
__r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
return __r;
}
template <class _Tp>
inline typename enable_if<
is_floating_point<_Tp>::value,
typename __libcpp_complex_overload_traits<_Tp>::_ComplexType>::type
proj(_Tp __re) {
if (isinf(__re))
__re = abs(__re);
return complex<_Tp>(__re);
}
template <class _Tp>
inline typename enable_if<
is_integral<_Tp>::value,
typename __libcpp_complex_overload_traits<_Tp>::_ComplexType>::type
proj(_Tp __re) {
typedef
typename __libcpp_complex_overload_traits<_Tp>::_ComplexType _ComplexType;
return _ComplexType(__re);
}
// polar
// polar(rho, theta) = rho * e^{i*theta}, with libc++'s special cases for
// NaN/inf magnitude and angle.
template <class _Tp>
complex<_Tp> polar(const _Tp& __rho, const _Tp& __theta = _Tp()) {
// Negative or NaN magnitude has no principal value -> (NaN, NaN).
if (isnan(__rho) || signbit(__rho))
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
if (isnan(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, __theta);
return complex<_Tp>(__theta, __theta);
}
if (isinf(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, _Tp(NAN));
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
}
// rho*cos/sin may produce NaN (e.g. 0*inf); flush such components to 0.
_Tp __x = __rho * cos(__theta);
if (isnan(__x))
__x = 0;
_Tp __y = __rho * sin(__theta);
if (isnan(__y))
__y = 0;
return complex<_Tp>(__x, __y);
}
// log
template <class _Tp>
inline complex<_Tp> log(const complex<_Tp>& __x) {
return complex<_Tp>(log(abs(__x)), arg(__x));
}
// log10
template <class _Tp>
inline complex<_Tp> log10(const complex<_Tp>& __x) {
return log(__x) / log(_Tp(10));
}
// log2
template <class _Tp>
inline complex<_Tp> log2(const complex<_Tp>& __x) {
return log(__x) / log(_Tp(2));
}
// sqrt
// Principal complex square root. An infinite imaginary part dominates
// (result inf + i*imag); an infinite real part gets the Annex-G-style
// special cases; otherwise compute in polar form:
// sqrt(|x|) * e^{i*arg(x)/2}.
template <class _Tp>
complex<_Tp> sqrt(const complex<_Tp>& __x) {
if (isinf(__x.imag()))
return complex<_Tp>(_Tp(INFINITY), __x.imag());
if (isinf(__x.real())) {
// +inf real: result is (+inf, +/-0) unless imag is NaN (propagated).
if (__x.real() > _Tp(0))
return complex<_Tp>(
__x.real(),
isnan(__x.imag()) ? __x.imag() : copysign(_Tp(0), __x.imag()));
// -inf real: result is (0, +/-inf) with imag's sign, NaN propagated.
return complex<_Tp>(
isnan(__x.imag()) ? __x.imag() : _Tp(0),
copysign(__x.real(), __x.imag()));
}
return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
}
// exp
template <class _Tp>
complex<_Tp> exp(const complex<_Tp>& __x) {
_Tp __i = __x.imag();
if (__i == 0) {
return complex<_Tp>(exp(__x.real()), copysign(_Tp(0), __x.imag()));
}
if (isinf(__x.real())) {
if (__x.real() < _Tp(0)) {
if (!isfinite(__i))
__i = _Tp(1);
} else if (__i == 0 || !isfinite(__i)) {
if (isinf(__i))
__i = _Tp(NAN);
return complex<_Tp>(__x.real(), __i);
}
}
_Tp __e = exp(__x.real());
return complex<_Tp>(__e * cos(__i), __e * sin(__i));
}
// pow
template <class _Tp>
inline complex<_Tp> pow(const complex<_Tp>& __x, const complex<_Tp>& __y) {
return exp(__y * log(__x));
}
template <class _Tp, class _Up>
inline complex<typename __promote<_Tp, _Up>::type> pow(
const complex<_Tp>& __x,
const complex<_Up>& __y) {
typedef complex<typename __promote<_Tp, _Up>::type> result_type;
return std::pow(result_type(__x), result_type(__y));
}
template <class _Tp, class _Up>
inline typename enable_if<
is_arithmetic<_Up>::value,
complex<typename __promote<_Tp, _Up>::type>>::type
pow(const complex<_Tp>& __x, const _Up& __y) {
typedef complex<typename __promote<_Tp, _Up>::type> result_type;
return std::pow(result_type(__x), result_type(__y));
}
template <class _Tp, class _Up>
inline typename enable_if<
is_arithmetic<_Tp>::value,
complex<typename __promote<_Tp, _Up>::type>>::type
pow(const _Tp& __x, const complex<_Up>& __y) {
typedef complex<typename __promote<_Tp, _Up>::type> result_type;
return std::pow(result_type(__x), result_type(__y));
}
// __sqr, computes pow(x, 2)
template <class _Tp>
inline complex<_Tp> __sqr(const complex<_Tp>& __x) {
return complex<_Tp>(
(__x.real() - __x.imag()) * (__x.real() + __x.imag()),
_Tp(2) * __x.real() * __x.imag());
}
// asinh
template <class _Tp>
complex<_Tp> asinh(const complex<_Tp>& __x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return __x;
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (__x.imag() == 0)
return __x;
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(
copysign(__x.imag(), __x.real()), copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(__sqr(__x) + _Tp(1)));
return complex<_Tp>(
copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag()));
}
// acosh
// Complex inverse hyperbolic cosine. Special cases for infinite/NaN
// components are resolved before the log-based formula.
template <class _Tp>
complex<_Tp> acosh(const complex<_Tp>& __x) {
  // atan2(+0., -0.) evaluates to pi.
  const _Tp __pi(atan2(+0., -0.));
  if (isinf(__x.real())) {
    if (isnan(__x.imag()))
      return complex<_Tp>(abs(__x.real()), __x.imag());
    if (isinf(__x.imag())) {
      if (__x.real() > 0)
        return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
      else
        return complex<_Tp>(
            -__x.real(), copysign(__pi * _Tp(0.75), __x.imag()));
    }
    if (__x.real() < 0)
      return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
    return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
  }
  if (isnan(__x.real())) {
    if (isinf(__x.imag()))
      return complex<_Tp>(abs(__x.imag()), __x.real());
    return complex<_Tp>(__x.real(), __x.real());
  }
  if (isinf(__x.imag()))
    return complex<_Tp>(abs(__x.imag()), copysign(__pi / _Tp(2), __x.imag()));
  // Generic case: acosh(x) = log(x + sqrt(x^2 - 1)); real part forced
  // non-negative (principal branch).
  complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
  return complex<_Tp>(
      copysign(__z.real(), _Tp(0)), copysign(__z.imag(), __x.imag()));
}
// atanh
// Complex inverse hyperbolic tangent. Handles non-finite components and the
// real poles at +-1 (which map to +-infinity) before the log formula.
template <class _Tp>
complex<_Tp> atanh(const complex<_Tp>& __x) {
  // atan2(+0., -0.) evaluates to pi.
  const _Tp __pi(atan2(+0., -0.));
  if (isinf(__x.imag())) {
    return complex<_Tp>(
        copysign(_Tp(0), __x.real()), copysign(__pi / _Tp(2), __x.imag()));
  }
  if (isnan(__x.imag())) {
    if (isinf(__x.real()) || __x.real() == 0)
      return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
    return complex<_Tp>(__x.imag(), __x.imag());
  }
  if (isnan(__x.real())) {
    return complex<_Tp>(__x.real(), __x.real());
  }
  if (isinf(__x.real())) {
    return complex<_Tp>(
        copysign(_Tp(0), __x.real()), copysign(__pi / _Tp(2), __x.imag()));
  }
  // Poles on the real axis: atanh(+-1) = +-inf.
  if (abs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
    return complex<_Tp>(
        copysign(_Tp(INFINITY), __x.real()), copysign(_Tp(0), __x.imag()));
  }
  // Generic case: atanh(x) = 0.5 * log((1 + x) / (1 - x)).
  complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
  return complex<_Tp>(
      copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag()));
}
// sinh
// Complex hyperbolic sine: sinh(a+bi) = sinh(a)cos(b) + i*cosh(a)sin(b).
// Non-finite components are special-cased to avoid NaN*0 artifacts.
template <class _Tp>
complex<_Tp> sinh(const complex<_Tp>& __x) {
  if (isinf(__x.real()) && !isfinite(__x.imag()))
    return complex<_Tp>(__x.real(), _Tp(NAN));
  if (__x.real() == 0 && !isfinite(__x.imag()))
    return complex<_Tp>(__x.real(), _Tp(NAN));
  if (__x.imag() == 0 && !isfinite(__x.real()))
    return __x;
  return complex<_Tp>(
      sinh(__x.real()) * cos(__x.imag()), cosh(__x.real()) * sin(__x.imag()));
}
// cosh
// Complex hyperbolic cosine: cosh(a+bi) = cosh(a)cos(b) + i*sinh(a)sin(b).
// Non-finite components are special-cased before the product formula.
template <class _Tp>
complex<_Tp> cosh(const complex<_Tp>& __x) {
  if (isinf(__x.real()) && !isfinite(__x.imag()))
    return complex<_Tp>(abs(__x.real()), _Tp(NAN));
  if (__x.real() == 0 && !isfinite(__x.imag()))
    return complex<_Tp>(_Tp(NAN), __x.real());
  if (__x.real() == 0 && __x.imag() == 0)
    return complex<_Tp>(_Tp(1), __x.imag());
  if (__x.imag() == 0 && !isfinite(__x.real()))
    return complex<_Tp>(abs(__x.real()), __x.imag());
  return complex<_Tp>(
      cosh(__x.real()) * cos(__x.imag()), sinh(__x.real()) * sin(__x.imag()));
}
// tanh
// Complex hyperbolic tangent via the double-angle form:
//   tanh(a+bi) = (sinh(2a) + i*sin(2b)) / (cosh(2a) + cos(2b)).
// Infinite real parts saturate to +-1; an inf/inf ratio is resolved by sign.
template <class _Tp>
complex<_Tp> tanh(const complex<_Tp>& __x) {
  if (isinf(__x.real())) {
    if (!isfinite(__x.imag()))
      return complex<_Tp>(copysign(_Tp(1), __x.real()), _Tp(0));
    return complex<_Tp>(
        copysign(_Tp(1), __x.real()),
        copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
  }
  if (isnan(__x.real()) && __x.imag() == 0)
    return __x;
  _Tp __2r(_Tp(2) * __x.real());
  _Tp __2i(_Tp(2) * __x.imag());
  _Tp __d(cosh(__2r) + cos(__2i));
  _Tp __2rsh(sinh(__2r));
  // Both numerator and denominator overflowed: pick the saturated value.
  if (isinf(__2rsh) && isinf(__d))
    return complex<_Tp>(
        __2rsh > _Tp(0) ? _Tp(1) : _Tp(-1), __2i > _Tp(0) ? _Tp(0) : _Tp(-0.));
  return complex<_Tp>(__2rsh / __d, sin(__2i) / __d);
}
// asin
// asin(x) = -i * asinh(i*x); i*x is built directly from the components.
template <class _Tp>
complex<_Tp> asin(const complex<_Tp>& __x) {
  const complex<_Tp> __ix(-__x.imag(), __x.real());
  const complex<_Tp> __w = asinh(__ix);
  return complex<_Tp>(__w.imag(), -__w.real());
}
// acos
// Complex arccosine. Non-finite components are resolved case by case; the
// generic branch uses acos(x) = -i*log(x + sqrt(x^2 - 1)) with the sign of
// the imaginary input selecting the branch.
template <class _Tp>
complex<_Tp> acos(const complex<_Tp>& __x) {
  // atan2(+0., -0.) evaluates to pi.
  const _Tp __pi(atan2(+0., -0.));
  if (isinf(__x.real())) {
    if (isnan(__x.imag()))
      return complex<_Tp>(__x.imag(), __x.real());
    if (isinf(__x.imag())) {
      if (__x.real() < _Tp(0))
        return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
      return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
    }
    if (__x.real() < _Tp(0))
      return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real());
    return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? __x.real() : -__x.real());
  }
  if (isnan(__x.real())) {
    if (isinf(__x.imag()))
      return complex<_Tp>(__x.real(), -__x.imag());
    return complex<_Tp>(__x.real(), __x.real());
  }
  if (isinf(__x.imag()))
    return complex<_Tp>(__pi / _Tp(2), -__x.imag());
  if (__x.real() == 0 && (__x.imag() == 0 || isnan(__x.imag())))
    return complex<_Tp>(__pi / _Tp(2), -__x.imag());
  complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
  if (signbit(__x.imag()))
    return complex<_Tp>(abs(__z.imag()), abs(__z.real()));
  return complex<_Tp>(abs(__z.imag()), -abs(__z.real()));
}
// atan
// atan(x) = -i * atanh(i*x)
template <class _Tp>
complex<_Tp> atan(const complex<_Tp>& __x) {
  const complex<_Tp> __ix(-__x.imag(), __x.real());
  const complex<_Tp> __w = atanh(__ix);
  return complex<_Tp>(__w.imag(), -__w.real());
}
// sin
// sin(x) = -i * sinh(i*x)
template <class _Tp>
complex<_Tp> sin(const complex<_Tp>& __x) {
  const complex<_Tp> __ix(-__x.imag(), __x.real());
  const complex<_Tp> __w = sinh(__ix);
  return complex<_Tp>(__w.imag(), -__w.real());
}
// cos
// cos(x) = cosh(i*x)
template <class _Tp>
inline complex<_Tp> cos(const complex<_Tp>& __x) {
  const complex<_Tp> __ix(-__x.imag(), __x.real());
  return cosh(__ix);
}
// tan
// tan(x) = -i * tanh(i*x)
template <class _Tp>
complex<_Tp> tan(const complex<_Tp>& __x) {
  const complex<_Tp> __ix(-__x.imag(), __x.real());
  const complex<_Tp> __w = tanh(__ix);
  return complex<_Tp>(__w.imag(), -__w.real());
}
// Literal suffix for complex number literals [complex.literals]
inline namespace literals {
inline namespace complex_literals {
// 2.5i -> complex<double>(0.0, 2.5)
constexpr complex<double> operator""i(long double __im) {
  return {0.0, static_cast<double>(__im)};
}
// 3i -> complex<double>(0.0, 3.0)
constexpr complex<double> operator""i(unsigned long long __im) {
  return {0.0, static_cast<double>(__im)};
}
// 2.5if -> complex<float>(0.0f, 2.5f)
constexpr complex<float> operator""if(long double __im) {
  return {0.0f, static_cast<float>(__im)};
}
// 3if -> complex<float>(0.0f, 3.0f)
constexpr complex<float> operator""if(unsigned long long __im) {
  return {0.0f, static_cast<float>(__im)};
}
} // namespace complex_literals
} // namespace literals
} // namespace std
// Complex linear interpolation start + weight * (end - start). The formula
// is anchored at whichever endpoint |weight| indicates is closer, mirroring
// the guarded form used for real-valued lerp.
__device__ std::complex<double> lerp(
    std::complex<double> start,
    std::complex<double> end,
    std::complex<double> weight) {
  const std::complex<double> delta = end - start;
  return abs(weight) < 0.5 ? start + weight * delta
                           : end - delta * (1.0 - weight);
}
__device__ std::complex<float> lerp(
    std::complex<float> start,
    std::complex<float> end,
    std::complex<float> weight) {
  const std::complex<float> delta = end - start;
  return abs(weight) < 0.5f ? start + weight * delta
                            : end - delta * (1.0f - weight);
}
// Multiplicative inverse 1/x for complex operands.
__device__ std::complex<double> reciprocal(std::complex<double> x) {
  const double unity = 1.0;
  return unity / x;
}
__device__ std::complex<float> reciprocal(std::complex<float> x) {
  const float unity = 1.0f;
  return unity / x;
}
// Logistic function 1 / (1 + exp(-x)), extended to complex arguments.
__device__ std::complex<double> sigmoid(std::complex<double> x) {
  const std::complex<double> denominator = 1.0 + exp(-x);
  return 1.0 / denominator;
}
__device__ std::complex<float> sigmoid(std::complex<float> x) {
  const std::complex<float> denominator = 1.0f + exp(-x);
  return 1.0f / denominator;
}
// The reciprocal of a complex number z is
// 1/z = conj(z)/|z|^2.
// The principal square root of a complex number z can be obtained by [1]
// sqrt(z) = sqrt(|z|) (z + |z|) / |z + |z||.
// Combining these formulas we have
// 1/sqrt(z) = (conj(z) + |z|) / (sqrt(|z|) |z + |z||).
// [1] https://math.stackexchange.com/a/44500
__device__ std::complex<float> rsqrt(std::complex<float> z) {
  auto a = std::real(z);
  auto b = std::imag(z);
  auto absa = ::fabsf(a);
  auto absb = ::fabsf(b);
  // scale to avoid precision loss due to underflow/overflow
  // NOTE(review): z == 0 makes scale == 0 and the divisions below produce
  // NaNs; presumably callers never pass zero -- confirm.
  auto scale = fmax(absa, absb);
  a /= scale;
  b /= scale;
  auto a_sq = a * a;
  auto b_sq = b * b;
  auto modz_sq = a_sq + b_sq;
  auto modz = ::sqrtf(modz_sq);
  auto a_plus_modz = a + modz;
  auto mod_zplusmodz_sq = a_plus_modz * a_plus_modz + b_sq;
  // rsqrtf folds the sqrt(|z|)*|z + |z|| denominator into one instruction;
  // `scale` re-applies the normalization factored out above.
  auto fac = ::rsqrtf(scale * modz * mod_zplusmodz_sq);
  return std::complex<float>(a_plus_modz * fac, -b * fac);
}
__device__ std::complex<double> rsqrt(std::complex<double> z) {
  auto a = std::real(z);
  auto b = std::imag(z);
  auto absa = ::abs(a);
  auto absb = ::abs(b);
  // scale to avoid precision loss due to underflow/overflow
  // NOTE(review): same zero-input caveat as the float overload above.
  auto scale = fmax(absa, absb);
  a /= scale;
  b /= scale;
  auto a_sq = a * a;
  auto b_sq = b * b;
  auto modz_sq = a_sq + b_sq;
  auto modz = ::sqrt(modz_sq);
  auto a_plus_modz = a + modz;
  auto mod_zplusmodz_sq = a_plus_modz * a_plus_modz + b_sq;
  auto fac = ::rsqrt(scale * modz * mod_zplusmodz_sq);
  return std::complex<double>(a_plus_modz * fac, -b * fac);
}
// True when both components of x are finite.
template <typename T>
bool isfinite(std::complex<T> x) {
  const T re = std::real(x);
  const T im = std::imag(x);
  return ::isfinite(re) && ::isfinite(im);
}
// True when either component of x is infinite.
template <typename T>
bool isinf(std::complex<T> x) {
  const T re = std::real(x);
  const T im = std::imag(x);
  return ::isinf(re) || ::isinf(im);
}
// True when the imaginary part is exactly zero.
template <typename T>
bool isreal(std::complex<T> x) {
  return std::imag(x) == 0;
}
#endif // __NVCC__
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
struct __half;
__device__ __inline__ __half __float2half(const float);
// 16-bit floating-point storage type used by generated kernels. Only
// storage, copy, and raw-bit access live here; arithmetic is performed by
// converting through float/double with the helper functions below.
struct __align__(2) __half {
  __half() = default;
  // Copy constructors for every cv/ref combination the codegen emits,
  // including volatile sources (e.g. values living in volatile buffers).
  __half(const __half& other) {
    __x = other.__x;
  }
  __half(const __half&& other) {
    __x = other.__x;
  }
  __half(const volatile __half& other) {
    __x = other.__x;
  }
  __half(const volatile __half&& other) {
    __x = other.__x;
  }
  // Note: not returning reference for `__half::operator=`
  // Doing so would requires us to return `volatile __half&` for the volatile
  // variants, which would trigger a gcc warning `implicit dereference will not
  // access object of type ‘volatile S’ in statement`
  __device__ void operator=(const __half& other) {
    __x = other.__x;
  }
  __device__ void operator=(const __half&& other) {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __half& other) {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __half&& other) {
    __x = other.__x;
  }
  __device__ void operator=(const __half& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const __half&& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __half& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __half&& other) volatile {
    __x = other.__x;
  }
  // Implicit conversion from float, delegating to __float2half below.
  __device__ __half(const float f) {
    __x = __float2half(f).__x;
  }
  // Raw fp16 bit pattern.
  __device__ uint16_t raw() const {
    return __x;
  }
 protected:
  // Underlying 16-bit storage.
  unsigned short __x;
};
// float -> fp16 using PTX cvt with .rn (round-to-nearest-even).
__device__ __inline__ __half __float2half(const float f) {
  __half val;
  asm("{ cvt.rn.f16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "f"(f));
  return val;
}
// double -> fp16, round-to-nearest-even.
__device__ __inline__ __half __double2half(const double d) {
  __half val;
  asm("{ cvt.rn.f16.f64 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "d"(d));
  return val;
}
// Integer -> fp16 conversions (round-to-nearest); the overload set covers
// the signed/unsigned 32/64-bit widths the code generator emits.
__device__ __inline__ __half __int2half(const int i) {
  __half val;
  asm("{ cvt.rn.f16.s32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "r"(i));
  return val;
}
__device__ __inline__ __half __int2half(const int64_t i64) {
  __half val;
  asm("{ cvt.rn.f16.s64 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "l"(i64));
  return val;
}
__device__ __inline__ __half __int2half(const uint32_t i) {
  __half val;
  asm("{ cvt.rn.f16.u32 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "r"(i));
  return val;
}
__device__ __inline__ __half __int2half(const uint64_t i64) {
  __half val;
  asm("{ cvt.rn.f16.u64 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "l"(i64));
  return val;
}
// bool -> fp16: true maps to 1.0, false to 0.0.
__device__ __inline__ __half __bool2half(const bool b) {
  return __int2half((int)b);
}
// fp16 -> float (exact; every fp16 value is representable in float).
__device__ __inline__ float __half2float(const __half h) {
  float val;
  asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}
// fp16 -> double (exact).
__device__ __inline__ double __half2double(const __half h) {
  double val;
  asm("{ cvt.f64.f16 %0, %1;}\n" : "=d"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}
// fp16 -> int32, truncating toward zero (PTX .rzi).
// Fix: added __inline__ for consistency with every sibling conversion helper
// (__half2float, __half2int, ...); as a plain __device__ function in a
// generated header this could otherwise cause multiple-definition errors if
// emitted into more than one translation unit.
__device__ __inline__ int __half2int32(const __half h) {
  int val;
  asm("{ cvt.rzi.s32.f16 %0, %1;}\n"
      : "=r"(val)
      : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}
// fp16 -> int64, truncating toward zero (PTX .rzi).
__device__ __inline__ int64_t __half2int(const __half h) {
  int64_t val;
  asm("{ cvt.rzi.s64.f16 %0, %1;}\n"
      : "=l"(val)
      : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}
// fp16 -> uint32, truncating toward zero (PTX .rzi). The result is returned
// as `int` (matching __bfloat2uint32); callers assign it into uint32_t, so
// the signature is kept as-is for compatibility.
// Fix: added __inline__ for consistency with the other conversion helpers,
// avoiding potential multiple-definition errors for this generated header.
__device__ __inline__ int __half2uint32(const __half h) {
  int val;
  asm("{ cvt.rzi.u32.f16 %0, %1;}\n"
      : "=r"(val)
      : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}
// fp16 -> uint64, truncating toward zero; returned in an int64_t register.
__device__ __inline__ int64_t __half2uint(const __half h) {
  int64_t val;
  asm("{ cvt.rzi.u64.f16 %0, %1;}\n"
      : "=l"(val)
      : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
}
// Output-parameter wrappers: __half2index below selects the conversion
// matching the width/signedness of nvfuser_index_t via overload resolution.
__device__ __inline__ void __half2int(const __half h, int& output) {
  output = __half2int32(h);
}
__device__ __inline__ void __half2int(const __half h, int64_t& output) {
  output = __half2int(h);
}
__device__ __inline__ void __half2int(const __half h, uint32_t& output) {
  output = __half2uint32(h);
}
__device__ __inline__ void __half2int(const __half h, uint64_t& output) {
  output = __half2uint(h);
}
// fp16 -> kernel index type (nvfuser_index_t is configured at codegen time).
__device__ __inline__ nvfuser_index_t __half2index(const __half h) {
  nvfuser_index_t result;
  __half2int(h, result);
  return result;
}
// fp16 -> bool: any value converting to a nonzero float is true.
__device__ __inline__ bool __half2bool(const __half h) {
  return (bool)__half2float(h) != 0;
}
// Take the real component of a complex value, then narrow it to fp16.
__device__ __inline__ __half __real_then_2half(const std::complex<float> c) {
  const float re = std::real(c);
  return __float2half(re);
}
__device__ __inline__ __half __real_then_2half(const std::complex<double> c) {
  const double re = std::real(c);
  return __double2half(re);
}
// fp16 equality comparison done natively in fp16 (no widening).
__device__ __inline__ bool __heq(const __half a, const __half b) {
  // From cuda_fp16.hpp
  unsigned short val;
  asm("{ .reg .pred __$temp3;\n"
      "  setp.eq.f16  __$temp3, %1, %2;\n"
      "  selp.u16 %0, 1, 0, __$temp3;}"
      : "=h"(val)
      : "h"(__NVFUSER_HALF_TO_CUS(a)), "h"(__NVFUSER_HALF_TO_CUS(b)));
  return (val != 0U) ? true : false;
}
// Bitwise OR on the raw fp16 bit patterns (not a numeric operation).
__device__ __inline__ __half operator|(const __half x, const __half y) {
  __half val;
  asm("{ or.b16 %0, %1, %2;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "h"(__NVFUSER_HALF_TO_CUS(x)), "h"(__NVFUSER_HALF_TO_CUS(y)));
  return val;
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
struct __bfloat;
__device__ __inline__ __bfloat __float2bfloat(const float);
// 16-bit bfloat16 storage type used by generated kernels. Mirrors __half:
// storage, copy, and raw-bit access only; arithmetic goes through the
// float/double conversion helpers below.
struct __align__(2) __bfloat {
  __bfloat() = default;
  // Copy constructors for every cv/ref combination the codegen emits.
  __bfloat(const __bfloat& other) {
    __x = other.__x;
  }
  __bfloat(const __bfloat&& other) {
    __x = other.__x;
  }
  __bfloat(const volatile __bfloat& other) {
    __x = other.__x;
  }
  __bfloat(const volatile __bfloat&& other) {
    __x = other.__x;
  }
  // Note: not returning reference for `__bfloat::operator=`
  // Doing so would requires us to return `volatile __bfloat&` for the volatile
  // variants, which would trigger a gcc warning `implicit dereference will not
  // access object of type ‘volatile S’ in statement`
  __device__ void operator=(const __bfloat& other) {
    __x = other.__x;
  }
  __device__ void operator=(const __bfloat&& other) {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __bfloat& other) {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __bfloat&& other) {
    __x = other.__x;
  }
  __device__ void operator=(const __bfloat& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const __bfloat&& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __bfloat& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __bfloat&& other) volatile {
    __x = other.__x;
  }
  // Implicit conversion from float, delegating to __float2bfloat below.
  __device__ __bfloat(const float f) {
    __x = __float2bfloat(f).__x;
  }
  // Raw bf16 bit pattern.
  __device__ uint16_t raw() const {
    return __x;
  }
 protected:
  // Underlying 16-bit storage.
  unsigned short __x;
};
// float -> bf16, round-to-nearest-even (PTX cvt.rn).
__device__ __inline__ __bfloat __float2bfloat(const float f) {
  __bfloat val;
  asm("{ cvt.rn.bf16.f32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "f"(f));
  return val;
}
// double -> bf16. SM90+ has a direct convert; older architectures narrow
// through float first.
__device__ __inline__ __bfloat __double2bfloat(const double d) {
#if __CUDA_ARCH__ >= 900
  __bfloat val;
  asm("{ cvt.rn.bf16.f64 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "d"(d));
  return val;
#else
  return __float2bfloat(static_cast<float>(d));
#endif
}
// Integer -> bf16 conversions; SM90+ converts directly, older architectures
// go through float.
__device__ __inline__ __bfloat __int2bfloat(const int i) {
#if __CUDA_ARCH__ >= 900
  __bfloat val;
  asm("{ cvt.rn.bf16.s32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "r"(i));
  return val;
#else
  return __float2bfloat(static_cast<float>(i));
#endif
}
__device__ __inline__ __bfloat __int2bfloat(const int64_t i64) {
#if __CUDA_ARCH__ >= 900
  __bfloat val;
  asm("{ cvt.rn.bf16.s64 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "l"(i64));
  return val;
#else
  return __float2bfloat(static_cast<float>(i64));
#endif
}
__device__ __inline__ __bfloat __int2bfloat(const uint32_t i) {
#if __CUDA_ARCH__ >= 900
  __bfloat val;
  asm("{ cvt.rn.bf16.u32 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "r"(i));
  return val;
#else
  return __float2bfloat(static_cast<float>(i));
#endif
}
__device__ __inline__ __bfloat __int2bfloat(const uint64_t i64) {
#if __CUDA_ARCH__ >= 900
  __bfloat val;
  asm("{ cvt.rn.bf16.u64 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "l"(i64));
  return val;
#else
  return __float2bfloat(static_cast<float>(i64));
#endif
}
// bool -> bf16: true maps to 1.0, false to 0.0.
__device__ __inline__ __bfloat __bool2bfloat(const bool b) {
  return __int2bfloat((int)b);
}
// bf16 -> float: a bf16 value is the upper 16 bits of the equivalent f32,
// so placing it in the high half of a zero-padded 32-bit word is exact.
__device__ __inline__ float __bfloat2float(const __bfloat h) {
  float val;
  asm("{ mov.b32 %0, {0,%1};}\n"
      : "=f"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
}
// bf16 -> double (exact); SM90+ converts directly.
__device__ __inline__ double __bfloat2double(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
  double val;
  asm("{ cvt.f64.bf16 %0, %1;}\n"
      : "=d"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
#else
  return static_cast<double>(__bfloat2float(h));
#endif
}
// bf16 -> int32, truncating toward zero (.rzi); pre-SM90 widens through
// float and truncates with the C++ cast.
// Fix: added __inline__ for consistency with the other conversion helpers
// (__bfloat2int, __bfloat2float, ...), preventing multiple-definition errors
// if this generated header is emitted into more than one translation unit.
__device__ __inline__ int __bfloat2int32(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
  int val;
  asm("{ cvt.rzi.s32.bf16 %0, %1;}\n"
      : "=r"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
#else
  return static_cast<int>(__bfloat2float(h));
#endif
}
// bf16 -> int64, truncating toward zero; pre-SM90 goes through float.
__device__ __inline__ int64_t __bfloat2int(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
  int64_t val;
  asm("{ cvt.rzi.s64.bf16 %0, %1;}\n"
      : "=l"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
#else
  return static_cast<int64_t>(__bfloat2float(h));
#endif
}
// bf16 -> uint32, truncating toward zero. The result is returned as `int`
// (matching __half2uint32); callers assign it into uint32_t, so the
// signature is kept as-is for compatibility.
// Fix: added __inline__ for consistency with the other conversion helpers,
// avoiding potential multiple-definition errors for this generated header.
__device__ __inline__ int __bfloat2uint32(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
  int val;
  asm("{ cvt.rzi.u32.bf16 %0, %1;}\n"
      : "=r"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
#else
  return static_cast<int>(__bfloat2float(h));
#endif
}
// bf16 -> uint64, truncating toward zero; returned in an int64_t register.
__device__ __inline__ int64_t __bfloat2uint(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
  int64_t val;
  asm("{ cvt.rzi.u64.bf16 %0, %1;}\n"
      : "=l"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
#else
  return static_cast<int64_t>(__bfloat2float(h));
#endif
}
// Output-parameter wrappers: __bfloat2index below selects the conversion
// matching the width/signedness of nvfuser_index_t via overload resolution.
__device__ __inline__ void __bfloat2int(const __bfloat h, int& output) {
  output = __bfloat2int32(h);
}
__device__ __inline__ void __bfloat2int(const __bfloat h, int64_t& output) {
  output = __bfloat2int(h);
}
__device__ __inline__ void __bfloat2int(const __bfloat h, uint32_t& output) {
  output = __bfloat2uint32(h);
}
__device__ __inline__ void __bfloat2int(const __bfloat h, uint64_t& output) {
  output = __bfloat2uint(h);
}
// bf16 -> kernel index type.
// NOTE(review): the `bool& output` parameter is never read or written, and
// the __half counterpart (__half2index) takes no such parameter -- this looks
// vestigial. Signature kept unchanged since generated callers may pass the
// extra argument; confirm against the code generator before removing.
__device__ __inline__ nvfuser_index_t __bfloat2index(
    const __bfloat h,
    bool& output) {
  nvfuser_index_t result;
  __bfloat2int(h, result);
  return result;
}
// bf16 -> bool: any value converting to a nonzero float is true.
__device__ __inline__ bool __bfloat2bool(const __bfloat h) {
  return (bool)__bfloat2float(h) != 0;
}
// fp16 -> bf16 (round-to-nearest); SM90+ converts directly, older
// architectures widen through float.
__device__ __inline__ __bfloat __half2bfloat(const __half h) {
#if __CUDA_ARCH__ >= 900
  __bfloat val;
  asm("{ cvt.rn.bf16.f16 %0, %1;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "h"(__NVFUSER_HALF_TO_CUS(h)));
  return val;
#else
  return __float2bfloat(__half2float(h));
#endif
}
// bf16 -> fp16 (round-to-nearest); SM90+ converts directly.
__device__ __inline__ __half __bfloat2half(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
  __half val;
  asm("{ cvt.rn.f16.bf16 %0, %1;}\n"
      : "=h"(__NVFUSER_HALF_TO_US(val))
      : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
  return val;
#else
  return __float2half(__bfloat2float(h));
#endif
}
// Take the real component of a complex value, then narrow it to bf16.
__device__ __inline__ __bfloat __real_then_2bfloat(
    const std::complex<float> c) {
  const float re = std::real(c);
  return __float2bfloat(re);
}
__device__ __inline__ __bfloat __real_then_2bfloat(
    const std::complex<double> c) {
  const double re = std::real(c);
  return __double2bfloat(re);
}
// bf16 equality comparison; SM90+ compares natively in bf16, older
// architectures widen both operands to f32 (zero-padding the low bits, which
// is exact) and compare there.
__device__ __inline__ bool __heq(const __bfloat a, const __bfloat b) {
// From cuda_bf16.hpp
#if __CUDA_ARCH__ >= 900
  unsigned short val;
  asm("{ .reg .pred __$temp3;\n"
      "  setp.eq.bf16  __$temp3, %1, %2;\n"
      "  selp.u16 %0, 1, 0, __$temp3;}"
      : "=h"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(a)), "h"(__NVFUSER_BFLOAT_TO_CUS(b)));
#else
  unsigned int val;
  asm("{.reg .b32 a,b;\n"
      "  mov.b32 a, {0, %1};\n"
      "  mov.b32 b, {0, %2};\n"
      "  set.eq.f32.f32 %0, a, b;}\n"
      : "=r"(val)
      : "h"(__NVFUSER_BFLOAT_TO_CUS(a)), "h"(__NVFUSER_BFLOAT_TO_CUS(b)));
#endif
  return (val != 0U) ? true : false;
}
// Bitwise OR on the raw bf16 bit patterns (not a numeric operation).
__device__ __inline__ __bfloat operator|(const __bfloat x, const __bfloat y) {
  __bfloat val;
  asm("{ or.b16 %0, %1, %2;}\n"
      : "=h"(__NVFUSER_BFLOAT_TO_US(val))
      : "h"(__NVFUSER_BFLOAT_TO_CUS(x)), "h"(__NVFUSER_BFLOAT_TO_CUS(y)));
  return val;
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
struct __e4m3;
__device__ __inline__ __e4m3 __float2e4m3(const float);
__device__ __inline__ __e4m3 __double2e4m3(const double);
// 8-bit floating-point storage type (e4m3 layout: 4 exponent bits, 3
// mantissa bits). Storage, copy, and raw-bit access only; numeric
// conversions are provided by the helpers below.
struct __align__(1) __e4m3 {
  __e4m3() = default;
  // Copy constructors for every cv/ref combination the codegen emits.
  __e4m3(const __e4m3& other) {
    __x = other.__x;
  }
  __e4m3(const __e4m3&& other) {
    __x = other.__x;
  }
  __e4m3(const volatile __e4m3& other) {
    __x = other.__x;
  }
  __e4m3(const volatile __e4m3&& other) {
    __x = other.__x;
  }
  // Note: not returning reference for `__e4m3::operator=`
  // Doing so would requires us to return `volatile __e4m3&` for the volatile
  // variants, which would trigger a gcc warning `implicit dereference will not
  // access object of type ‘volatile S’ in statement`
  __device__ void operator=(const __e4m3& other) {
    __x = other.__x;
  }
  __device__ void operator=(const __e4m3&& other) {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __e4m3& other) {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __e4m3&& other) {
    __x = other.__x;
  }
  __device__ void operator=(const __e4m3& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const __e4m3&& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __e4m3& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __e4m3&& other) volatile {
    __x = other.__x;
  }
  // Numeric conversions from float/double (saturating, see __float2e4m3).
  __device__ __e4m3(const float f) {
    __x = __float2e4m3(f).__x;
  }
  __device__ __e4m3(const double f) {
    __x = __double2e4m3(f).__x;
  }
  // Bit-pattern constructors: the low 8 bits of x are stored directly,
  // with NO numeric conversion.
  __device__ __e4m3(const int x) : __x(x) {}
  __device__ __e4m3(const long long x) : __x(x) {}
  __device__ __e4m3(const uint8_t x) : __x(x) {}
  __device__ __e4m3(const uint16_t x) : __x(x) {}
  // Raw 8-bit pattern.
  __device__ uint8_t raw() const {
    return __x;
  }
 protected:
  // Underlying 8-bit storage.
  uint8_t __x;
};
// NOTE [ fp8 cast optimization ]
//
// For simplicity, we only provided fp8 <-> fp32 cast implementation, while
// relying on any other fp cast in the form of target_fp <-> fp32 <-> fp8.
// This avoids the complication of handling hardware specific instructions on
// various compute capabilities.
// But this simplicity could come at the cost of performance. In cuda_fp8.hpp,
// 1. bf16 -> fp8 is done via bf16 -> float -> fp8
// 2. fp16 -> fp8 is done with a conditional
// # if (> sm_89)
// fp16 -> fp8
// # else
// fp16 -> fp32 -> fp8
// # endif
// 3. fp64 -> fp8 is handled explicitly as bitwise operations.
// TODO consider cuda_fp8.hpp for performance optimized cast.
// float -> e4m3, round-to-nearest with saturation to the finite range
// (PTX .satfinite). The cvt converts the pair {0.f, f} into two packed e4m3
// bytes; the byte holding f's conversion is then copied out.
__device__ __inline__ __e4m3 __float2e4m3(const float f) {
  constexpr float f_const_zero = 0.f;
  unsigned short _tmp_buffer;
  __e4m3 val;
  asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %1, %2;}"
      : "=h"(_tmp_buffer)
      : "f"(f_const_zero), "f"(f));
  memcpy(&val, &_tmp_buffer, sizeof(uint8_t));
  return val;
}
// e4m3 -> float by widening through fp16 (e4m3x2 -> f16x2 -> f32).
// NOTE(review): "cvt.u16.u32 %1, buf0" writes to %1, which is declared as an
// input-only operand; this relies on the compiler tolerating the clobber.
// Consider a "+h" constraint or a dedicated output register -- confirm.
__device__ __inline__ float __e4m32float(const __e4m3 b) {
  unsigned short _tmp_buffer;
  memcpy(&_tmp_buffer, &b, sizeof(uint8_t));
  float val;
  asm("{\n\t"
      ".reg .b32 buf0;\n\t"
      "cvt.rn.f16x2.e4m3x2 buf0, %1;\n\t"
      "cvt.u16.u32 %1, buf0;\n\t"
      "cvt.f32.f16 %0, %1;\n\t"
      "}"
      : "=f"(val)
      : "h"(_tmp_buffer));
  return val;
}
// e4m3 conversion wrappers routed through the float path (see NOTE above on
// fp8 cast simplicity vs. performance).
__device__ __inline__ __e4m3 __double2e4m3(const double d) {
  return __float2e4m3(static_cast<float>(d));
}
__device__ __inline__ double __e4m32double(const __e4m3 b) {
  return static_cast<double>(__e4m32float(b));
}
__device__ __inline__ __e4m3 __half2e4m3(const __half h) {
  const float widened = __half2float(h);
  return __float2e4m3(widened);
}
__device__ __inline__ __half __e4m32half(const __e4m3 b) {
  const float widened = __e4m32float(b);
  return __float2half(widened);
}
__device__ __inline__ __e4m3 __bfloat2e4m3(const __bfloat h) {
  const float widened = __bfloat2float(h);
  return __float2e4m3(widened);
}
__device__ __inline__ __bfloat __e4m32bfloat(const __e4m3 b) {
  const float widened = __e4m32float(b);
  return __float2bfloat(widened);
}
// Bitwise OR on the raw e4m3 bit patterns (not a numeric operation).
__device__ __inline__ __e4m3 operator|(const __e4m3 x, const __e4m3 y) {
  unsigned short val;
  unsigned short x_val = x.raw();
  unsigned short y_val = y.raw();
  asm("{ or.b16 %0, %1, %2;}\n" : "=h"(val) : "h"(x_val), "h"(y_val));
  return __e4m3(val);
}
struct __e5m2;
__device__ __inline__ __e5m2 __float2e5m2(const float);
__device__ __inline__ __e5m2 __double2e5m2(const double);
// 8-bit floating-point storage type (e5m2 layout: 5 exponent bits, 2
// mantissa bits). Same structure as __e4m3 above.
struct __align__(1) __e5m2 {
  __e5m2() = default;
  // Copy constructors for every cv/ref combination the codegen emits.
  __e5m2(const __e5m2& other) {
    __x = other.__x;
  }
  __e5m2(const __e5m2&& other) {
    __x = other.__x;
  }
  __e5m2(const volatile __e5m2& other) {
    __x = other.__x;
  }
  __e5m2(const volatile __e5m2&& other) {
    __x = other.__x;
  }
  // Note: not returning reference for `__e5m2::operator=`
  // Doing so would requires us to return `volatile __e5m2&` for the volatile
  // variants, which would trigger a gcc warning `implicit dereference will not
  // access object of type ‘volatile S’ in statement`
  __device__ void operator=(const __e5m2& other) {
    __x = other.__x;
  }
  __device__ void operator=(const __e5m2&& other) {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __e5m2& other) {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __e5m2&& other) {
    __x = other.__x;
  }
  __device__ void operator=(const __e5m2& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const __e5m2&& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __e5m2& other) volatile {
    __x = other.__x;
  }
  __device__ void operator=(const volatile __e5m2&& other) volatile {
    __x = other.__x;
  }
  // Numeric conversions from float/double (saturating, see __float2e5m2).
  __device__ __e5m2(const float f) {
    __x = __float2e5m2(f).__x;
  }
  __device__ __e5m2(const double f) {
    __x = __double2e5m2(f).__x;
  }
  // Bit-pattern constructors: the low 8 bits of x are stored directly,
  // with NO numeric conversion.
  __device__ __e5m2(const int x) : __x(x) {}
  __device__ __e5m2(const long long x) : __x(x) {}
  __device__ __e5m2(const uint8_t x) : __x(x) {}
  __device__ __e5m2(const uint16_t x) : __x(x) {}
  // Raw 8-bit pattern.
  __device__ uint8_t raw() const {
    return __x;
  }
 protected:
  // Underlying 8-bit storage.
  uint8_t __x;
};
// see NOTE [ fp8 cast optimization ]
// float -> e5m2, round-to-nearest with saturation to the finite range
// (PTX .satfinite). Converts the pair {0.f, f} into two packed e5m2 bytes
// and copies out the byte holding f's conversion.
__device__ __inline__ __e5m2 __float2e5m2(const float f) {
  constexpr float f_const_zero = 0.f;
  unsigned short _tmp_buffer;
  __e5m2 val;
  asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %1, %2;}"
      : "=h"(_tmp_buffer)
      : "f"(f_const_zero), "f"(f));
  memcpy(&val, &_tmp_buffer, sizeof(uint8_t));
  return val;
}
// e5m2 -> float by widening through fp16.
// NOTE(review): same input-operand clobber of %1 as __e4m32float above;
// consider a "+h" constraint -- confirm.
__device__ __inline__ float __e5m22float(const __e5m2 b) {
  unsigned short _tmp_buffer;
  memcpy(&_tmp_buffer, &b, sizeof(uint8_t));
  float val;
  asm("{\n\t"
      ".reg .b32 buf0;\n\t"
      "cvt.rn.f16x2.e5m2x2 buf0, %1;\n\t"
      "cvt.u16.u32 %1, buf0;\n\t"
      "cvt.f32.f16 %0, %1;\n\t"
      "}"
      : "=f"(val)
      : "h"(_tmp_buffer));
  return val;
}
// e5m2 conversion wrappers routed through the float path (see NOTE above on
// fp8 cast simplicity vs. performance). Parameter renamed f -> d to match
// the __double2e4m3 counterpart.
__device__ __inline__ __e5m2 __double2e5m2(const double d) {
  return __float2e5m2(static_cast<float>(d));
}
__device__ __inline__ double __e5m22double(const __e5m2 b) {
  return static_cast<double>(__e5m22float(b));
}
__device__ __inline__ __e5m2 __half2e5m2(const __half h) {
  const float widened = __half2float(h);
  return __float2e5m2(widened);
}
__device__ __inline__ __half __e5m22half(const __e5m2 b) {
  const float widened = __e5m22float(b);
  return __float2half(widened);
}
__device__ __inline__ __e5m2 __bfloat2e5m2(const __bfloat h) {
  const float widened = __bfloat2float(h);
  return __float2e5m2(widened);
}
__device__ __inline__ __bfloat __e5m22bfloat(const __e5m2 b) {
  const float widened = __e5m22float(b);
  return __float2bfloat(widened);
}
// Bitwise OR on the raw e5m2 bit patterns (not a numeric operation).
__device__ __inline__ __e5m2 operator|(const __e5m2 x, const __e5m2 y) {
  unsigned short val;
  unsigned short x_val = x.raw();
  unsigned short y_val = y.raw();
  asm("{ or.b16 %0, %1, %2;}\n" : "=h"(val) : "h"(x_val), "h"(y_val));
  return __e5m2(val);
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Type trait utils
// MaybeVolatile<T, b>::type is `volatile T` when b is true, plain T
// otherwise.
template <typename Type, bool is_volatile>
struct MaybeVolatile;
template <typename Type>
struct MaybeVolatile<Type, true> {
  using type = volatile Type;
};
template <typename Type>
struct MaybeVolatile<Type, false> {
  using type = Type;
};
// Compile-time list of types (used together with TypeSelector).
template <typename... Types>
struct TypeList {};
// TypeSelector<idx, Ts...>::type is the idx-th type of the pack (0-based),
// found by recursively peeling the head.
template <int idx, typename T, typename... Types>
struct TypeSelector {
  using type = typename TypeSelector<idx - 1, Types...>::type;
};
template <typename T, typename... Types>
struct TypeSelector<0, T, Types...> {
  using type = T;
};
// IsSameType<A, B>::value is true iff A and B are the same type.
template <typename T0, typename T1>
struct IsSameType {
  static constexpr bool value = false;
};
template <typename T0>
struct IsSameType<T0, T0> {
  static constexpr bool value = true;
};
// IsPointerType<T>::value is true iff T is a pointer type.
template <typename T>
struct IsPointerType {
  static constexpr bool value = false;
};
template <typename T>
struct IsPointerType<T*> {
  static constexpr bool value = true;
};
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// aligned register array for vectorized load/store
// Fixed-size array of `size` elements, over-aligned to
// sizeof(scalar_t) * align_size so it can be moved with vector instructions.
template <typename scalar_t, int size, int align_size = 1>
struct alignas(sizeof(scalar_t) * align_size) Array {
  scalar_t array[size];
  // Broadcast v into every element.
  __device__ void set(scalar_t v) {
#pragma unroll
    for (int i = 0; i < size; ++i) {
      array[i] = v;
    }
  }
  __device__ scalar_t& operator[](const unsigned int i) {
    return array[i];
  }
  __device__ const scalar_t& operator[](const unsigned int i) const {
    return array[i];
  }
  // NOTE(review): unlike the members above, operator= carries no __device__
  // qualifier; presumably the compilation mode used treats unannotated
  // functions as device functions (or this is only called where that is
  // valid) -- confirm before relying on it from kernels.
  Array& operator=(const Array& a) {
#pragma unroll
    for (int i = 0; i < size; ++i) {
      array[i] = a[i];
    }
    return *this;
  }
};
// Used for vectorized allocations that are not in registers
// (writes vec_size copies of val to consecutive elements of buff).
template <typename scalar_t, int vec_size>
__device__ void arraySet(scalar_t* buff, scalar_t val) {
#pragma unroll
  for (int elem = 0; elem < vec_size; ++elem) {
    buff[elem] = val;
  }
}
// Copy vec_size elements from `from` to `to` as one aligned vector move,
// dispatching on the total byte count to a CUDA vector type.
// Requires both pointers to be suitably aligned for the chosen vector type.
// NOTE(review): byte counts other than 1/2/4/8/12/16 silently copy nothing;
// presumably the code generator only ever emits the listed sizes -- confirm.
template <typename scalar_t, int vec_size>
__device__ void loadGeneric(scalar_t* to, scalar_t* from) {
  // It would be really nice to use memcpy here, but one example was failing
  // with:
  //
  // memcpy(to, from, vec_size * sizeof(scalar_t));
  //
  // Yet passing with:
  //
  // for(int i = 0; i < vec_size; i++){
  //   to[i] = from[i];
  // }
  switch (sizeof(scalar_t) * vec_size) {
    case 1:
      *reinterpret_cast<uchar1*>(to) = *reinterpret_cast<uchar1*>(from);
      break;
    case 2:
      *reinterpret_cast<uchar2*>(to) = *reinterpret_cast<uchar2*>(from);
      break;
    case 4:
      *reinterpret_cast<uint1*>(to) = *reinterpret_cast<uint1*>(from);
      break;
    case 8:
      *reinterpret_cast<uint2*>(to) = *reinterpret_cast<uint2*>(from);
      break;
    case 12:
      *reinterpret_cast<uint3*>(to) = *reinterpret_cast<uint3*>(from);
      break;
    case 16:
      *reinterpret_cast<uint4*>(to) = *reinterpret_cast<uint4*>(from);
      break;
  }
}
// Volatile version only works with c++ fundamnetal types
// Vector copy where either side may be volatile-qualified; dispatches on
// total byte count to a fundamental type of matching width (only fundamental
// types have an assignment operator defined for volatile operands).
template <
    typename scalar_t,
    int vec_size,
    bool is_volatile_to,
    bool is_volatile_from>
__device__ void loadGenericVolatile(
    typename MaybeVolatile<scalar_t, is_volatile_to>::type* to,
    typename MaybeVolatile<scalar_t, is_volatile_from>::type* from) {
  switch (sizeof(scalar_t) * vec_size) {
    // Reinterpret cast like this with volatile types only works for C++
    // fundamental types otherwise the = operator is not defined
    case 1:
      *reinterpret_cast<
          typename MaybeVolatile<unsigned char, is_volatile_to>::type*>(to) =
          *reinterpret_cast<
              typename MaybeVolatile<unsigned char, is_volatile_from>::type*>(
              from);
      break;
    case 2:
      *reinterpret_cast<typename MaybeVolatile<short, is_volatile_to>::type*>(
          to) =
          *reinterpret_cast<
              typename MaybeVolatile<short, is_volatile_from>::type*>(from);
      break;
    case 4:
      *reinterpret_cast<
          typename MaybeVolatile<unsigned int, is_volatile_to>::type*>(to) =
          *reinterpret_cast<
              typename MaybeVolatile<unsigned int, is_volatile_from>::type*>(
              from);
      break;
    case 8:
      *reinterpret_cast<typename MaybeVolatile<double, is_volatile_to>::type*>(
          to) =
          *reinterpret_cast<
              typename MaybeVolatile<double, is_volatile_from>::type*>(from);
      break;
  }
}
// Vectorized store of vec_size elements from local memory (registers) to
// global memory. For 8- and 16-byte widths, inline PTX is used so the store
// can be issued either as a volatile store (is_volatile == true) or with the
// .cs (cache-streaming, evict-first) cache operator; narrower widths fall
// back to loadGenericVolatile.
template <typename scalar_t, int vec_size, bool is_volatile>
__device__ void loadLocalToGlobal(
    typename MaybeVolatile<scalar_t, is_volatile>::type* to,
    scalar_t* from) {
  switch (sizeof(scalar_t) * vec_size) {
    case 1:
    case 2:
    case 4:
      loadGenericVolatile<scalar_t, vec_size, is_volatile, false>(to, from);
      break;
    case 8: {
      // Reinterpret the source registers as a pair of 32-bit words for the
      // v2 vector store.
      uint2 const& data = *reinterpret_cast<uint2*>(from);
      if (is_volatile) {
        asm volatile(
            "st.volatile.global.v2.s32 [%0], {%1,%2};" ::"l"(
                (typename MaybeVolatile<uint2, is_volatile>::type*)to),
            "r"(data.x),
            "r"(data.y));
      } else {
        asm volatile(
            "st.global.cs.v2.s32 [%0], {%1,%2};" ::"l"(
                (typename MaybeVolatile<uint2, is_volatile>::type*)to),
            "r"(data.x),
            "r"(data.y));
      }
      break;
    }
    case 16: {
      // Same as above but with four 32-bit words (a full 16-byte vector).
      uint4 const& data = *reinterpret_cast<uint4*>(from);
      if (is_volatile) {
        asm volatile(
            "st.volatile.global.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"(
                (typename MaybeVolatile<uint4, is_volatile>::type*)to),
            "r"(data.x),
            "r"(data.y),
            "r"(data.z),
            "r"(data.w));
      } else {
        asm volatile(
            "st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"(
                (typename MaybeVolatile<uint4, is_volatile>::type*)to),
            "r"(data.x),
            "r"(data.y),
            "r"(data.z),
            "r"(data.w));
      }
      break;
    }
  }
}
// Cache operator selecting which cache levels a global load goes through.
// This is copied from csrc/type.h and should be kept consistent.
enum class CacheOp {
  AllLevels, // maps to __ldca below
  Streaming, // maps to __ldcs below
  Global, // maps to __ldcg below
};
// Loads one value of type T from global memory (`from`) into local memory
// (`to`) using the cache-hinted load intrinsic selected by cache_op.
// cache_op is a template parameter, so the switch resolves at compile time.
template <typename T, CacheOp cache_op>
__device__ void loadGlobalToLocalCached(void* to, void* from) {
  T* typed_to = reinterpret_cast<T*>(to);
  T* typed_from = reinterpret_cast<T*>(from);
  switch (cache_op) {
    case CacheOp::AllLevels:
      *typed_to = __ldca(typed_from);
      break;
    case CacheOp::Streaming:
      *typed_to = __ldcs(typed_from);
      break;
    case CacheOp::Global:
      *typed_to = __ldcg(typed_from);
      break;
  }
}
// Vectorized load of vec_size elements from global memory into local memory
// (registers).
// For simplicity, cache_op is only used for non-volatile loads written in
// inline assembly. Other loads are done with the default cache operator --
// cache all levels. ld.volatile doesn't accept cache operator anyway.
template <typename scalar_t, int vec_size, bool is_volatile, CacheOp cache_op>
__device__ void loadGlobalToLocal(
    scalar_t* to,
    typename MaybeVolatile<scalar_t, is_volatile>::type* from) {
  switch (sizeof(scalar_t) * vec_size) {
    case 1:
    case 2:
    case 4:
      // Narrow widths: plain (possibly volatile) assignment; cache_op ignored.
      loadGenericVolatile<scalar_t, vec_size, false, is_volatile>(to, from);
      break;
    case 8: {
      if (is_volatile) {
        uint2& data = *reinterpret_cast<uint2*>(to);
        asm volatile("ld.volatile.global.v2.s32 {%0,%1}, [%2];"
                     : "=r"(data.x), "=r"(data.y)
                     : "l"((uint2*)from));
      } else {
        // const_cast strips the volatile qualifier that MaybeVolatile may
        // have added; this branch is only taken when is_volatile == false.
        loadGlobalToLocalCached<uint2, cache_op>(
            to, const_cast<scalar_t*>(from));
      }
      break;
    }
    case 16: {
      if (is_volatile) {
        uint4& data = *reinterpret_cast<uint4*>(to);
        asm volatile("ld.volatile.global.v4.s32 {%0,%1,%2,%3}, [%4];"
                     : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
                     : "l"((uint4*)from));
      } else {
        loadGlobalToLocalCached<uint4, cache_op>(
            to, const_cast<scalar_t*>(from));
      }
      break;
    }
  }
}
// Vectorized global-to-global copy. Widths up to 8 bytes are copied directly;
// 12- and 16-byte widths are staged through a register intermediate with a
// streaming (evict-first) load followed by a store, since there is no single
// fundamental type wide enough for a direct assignment.
template <
    typename scalar_t,
    int vec_size,
    bool is_volatile_to,
    bool is_volatile_from>
__device__ void loadGlobalToGlobal(
    typename MaybeVolatile<scalar_t, is_volatile_to>::type* to,
    typename MaybeVolatile<scalar_t, is_volatile_from>::type* from) {
  switch (sizeof(scalar_t) * vec_size) {
    // Reinterpret cast like this with volatile types only works for C++
    // fundamental types otherwise the = operator is not defined
    case 1:
    case 2:
    case 4:
    case 8:
      loadGenericVolatile<scalar_t, vec_size, is_volatile_to, is_volatile_from>(
          to, from);
      break;
    case 12: {
      // Stage through registers; Streaming avoids polluting the cache with
      // data that is immediately written back out.
      uint3 local_intermediate;
      loadGlobalToLocal<
          scalar_t,
          vec_size,
          is_volatile_from,
          CacheOp::Streaming>(
          reinterpret_cast<scalar_t*>(&local_intermediate), from);
      loadLocalToGlobal<scalar_t, vec_size, is_volatile_to>(
          to, reinterpret_cast<scalar_t*>(&local_intermediate));
      break;
    }
    case 16: {
      uint4 local_intermediate;
      loadGlobalToLocal<
          scalar_t,
          vec_size,
          is_volatile_from,
          CacheOp::Streaming>(
          reinterpret_cast<scalar_t*>(&local_intermediate), from);
      loadLocalToGlobal<scalar_t, vec_size, is_volatile_to>(
          to, reinterpret_cast<scalar_t*>(&local_intermediate));
      break;
    }
  }
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// TMemTensor is a wrapper around a uint32_t that provides a convenient way to
// manipulate tensor memory addresses. Example usage:
// TMemTensor T0(0x12345678):
// -> address (lane=0x1234, col=0x5678):
// TMemTensor T1 = T0 + {64, 64}:
// -> address (lane=T0.lane+64, col=T0.col+64)
// TMemTensor T2(0x12345678, 32, 32):
// -> address (lane=0x1234+32, col=0x5678+32)
// Wraps a raw 32-bit tensor-memory address of the form (lane << 16) | column
// and provides offset arithmetic on it (see the usage example above).
struct TMemTensor {
  uint32_t raw_address;

 public:
  // Returns `base` with offset[0] added to the lane half (high 16 bits) and
  // offset[1] added to the column half (low 16 bits).
  //
  // Mentally, a TMem address is (lane, column); because the address packs the
  // lane in the high half, the offset pair is combined as
  // (lane_offset << 16) | col_offset before a single 32-bit add.
  //
  // NOTE: the previous implementation swapped the two uint16_t entries and
  // reinterpret_cast the Array as a uint32_t, which relied on little-endian
  // byte order and on the Array being suitably aligned for a 32-bit read.
  // The shift/or form below computes the identical addend without any
  // aliasing, alignment, or endianness assumptions.
  static uint32_t add(uint32_t base, Array<uint16_t, 2> offset) {
    uint32_t packed_offset = (static_cast<uint32_t>(offset[0]) << 16) |
        static_cast<uint32_t>(offset[1]);
    return base + packed_offset;
  }
  TMemTensor(uint32_t raw_address) : raw_address(raw_address) {}
  TMemTensor(uint32_t base_address, uint16_t lane_offset, uint16_t col_offset)
      : raw_address(add(base_address, {lane_offset, col_offset})) {}
  // Implicit conversion back to the raw packed address.
  operator uint32_t() const {
    return raw_address;
  }
  uint32_t operator+(Array<uint16_t, 2> offset) const {
    return add(raw_address, offset);
  }
};
// The type must stay layout-compatible with a bare uint32_t so it can be
// passed where a raw TMem address is expected.
static_assert(
    sizeof(TMemTensor) == sizeof(uint32_t),
    "TMemTensor must be a uint32_t");
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Device-side tensor view: a raw data pointer plus logical sizes and
// allocation strides. Dims is the logical rank; AllocDims is the rank of the
// allocation domain, which may differ (e.g. for set allocation domains).
template <typename T, int Dims, int AllocDims = Dims>
struct Tensor {
  // Flat element access; `ind` is an already-linearized index.
  __device__ T& operator[](nvfuser_index_t ind) {
    return data[ind];
  };

  T* data;
  Array<nvfuser_index_t, Dims, 1> logical_size;
  Array<nvfuser_index_t, AllocDims, 1> alloc_stride;
};

// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
  // The index is ignored; a 0-dim tensor has exactly one element.
  __device__ T& operator[](nvfuser_index_t i) {
    return *data;
  };

  T* data;
};

// Specialization for 0-dim case that's easy to pass in a CPU based tensor.
// Holds the scalar by value instead of by pointer.
template <typename T>
struct CpuScalarTensor {
  // The index is ignored; there is exactly one element.
  __device__ T& operator[](int i) {
    return data;
  };

  T data;
};
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Full 32x32 -> 64-bit multiply: writes the high 32 bits of a * b through
// result_high and returns the low 32 bits.
__device__ unsigned int mulhilo32(
    unsigned int a,
    unsigned int b,
    unsigned int* result_high) {
  const unsigned long long product =
      static_cast<unsigned long long>(a) * static_cast<unsigned long long>(b);
  *result_high = static_cast<unsigned int>(product >> 32);
  return static_cast<unsigned int>(product);
}
// One round of the Philox-4x32 mixing function: multiplies the even counter
// words by the Philox constants and xors the resulting high halves with the
// odd counter words and the round key.
__device__ Array<uint32_t, 4> single_round(
    Array<uint32_t, 4> ctr,
    Array<uint32_t, 2> key) {
  constexpr unsigned long kPhiloxSA = 0xD2511F53;
  constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
  unsigned int hi0;
  unsigned int hi1;
  const unsigned int lo0 = mulhilo32(kPhiloxSA, ctr[0], &hi0);
  const unsigned int lo1 = mulhilo32(kPhiloxSB, ctr[2], &hi1);
  Array<uint32_t, 4> mixed;
  mixed[0] = hi1 ^ ctr[1] ^ key[0];
  mixed[1] = lo1;
  mixed[2] = hi0 ^ ctr[3] ^ key[1];
  mixed[3] = lo0;
  return mixed;
}
// Philox-4x32-10 counter-based RNG: produces four 32-bit random words from a
// 64-bit seed and a 128-bit counter (offset, subsequence).
__device__ Array<uint32_t, 4> philox(
    unsigned long long seed,
    unsigned long long subsequence,
    unsigned long long offset) {
  constexpr unsigned long kPhilox10A = 0x9E3779B9;
  constexpr unsigned long kPhilox10B = 0xBB67AE85;
  // The 64-bit seed supplies the two 32-bit key words.
  Array<uint32_t, 2> key;
  key[0] = (unsigned int)seed;
  key[1] = (unsigned int)(seed >> 32);
  // Counter layout: offset in the low pair, subsequence in the high pair.
  Array<uint32_t, 4> counter;
  counter[0] = (unsigned int)(offset);
  counter[1] = (unsigned int)(offset >> 32);
  counter[2] = (unsigned int)(subsequence);
  counter[3] = (unsigned int)(subsequence >> 32);
  // Nine rounds with the key bumped after each, then a final tenth round.
  for (int round = 0; round < 9; ++round) {
    counter = single_round(counter, key);
    key[0] += kPhilox10A;
    key[1] += kPhilox10B;
  }
  return single_round(counter, key);
}
// Maps two 32-bit random words onto a uniform double in the range (0, 1].
__device__ double raw_uniform_double(unsigned int x, unsigned int y) {
  constexpr double scale = 1.0 / (double)(1ll << 53);
  // Build up to 53 random bits: x fills the low 32, y is shifted into the
  // remaining high 21 bits.
  const unsigned long long bits =
      (unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
  // The extra half-step shifts the result from [0, 1) to (0, 1].
  return (double)bits * scale + 0.5 * scale;
}
// Maps a 32-bit random word onto a uniform float in the range (0, 1].
__device__ float raw_uniform_float(unsigned int x) {
  constexpr float scale = (float)(1.0 / (double)(1ll << 32));
  return (float)x * scale + 0.5f * scale;
}
// Uniform half in [0, 1): raw_uniform_float produces (0, 1], and values that
// round to exactly 1.0 in half precision are folded back to 0.
__device__ __half uniform_half(unsigned int x) {
  __half result = __float2half(raw_uniform_float(x));
  return __heq(result, __float2half(1.0f)) ? __float2half(0.0f) : result;
}
// Same as uniform_half but for bfloat16.
__device__ __bfloat uniform_bfloat(unsigned int x) {
  __bfloat result = __float2bfloat(raw_uniform_float(x));
  return __heq(result, __float2bfloat(1.0f)) ? __float2bfloat(0.0f) : result;
}
// Uniform float in [0, 1): raw_uniform_float produces (0, 1], so the
// endpoint 1.0f is mapped back to 0.0f.
__device__ float uniformf(unsigned int x) {
  const float candidate = raw_uniform_float(x);
  return candidate == 1.0f ? 0.0f : candidate;
}
// Uniform double in [0, 1), built from two 32-bit random words.
__device__ double uniform(unsigned int x, unsigned int y) {
  const double candidate = raw_uniform_double(x, y);
  return candidate == 1.0 ? 0.0 : candidate;
}
// Uniform double in [0, 1) from a Philox result. Each double consumes two of
// the four 32-bit words, so rng_component selects 0 or 1.
__device__ double rng_uniform(
    const Array<uint32_t, 4>& rng_result,
    int rng_component) {
  return uniform(
      rng_result[rng_component * 2], rng_result[rng_component * 2 + 1]);
}
// Uniform float in [0, 1); each float consumes one word (component 0..3).
__device__ float rng_uniformf(
    const Array<uint32_t, 4>& rng_result,
    int rng_component) {
  return uniformf(rng_result[rng_component]);
}
// Uniform half in [0, 1); one word per component.
__device__ __half
rng_uniform_half(const Array<uint32_t, 4>& rng_result, int rng_component) {
  return uniform_half(rng_result[rng_component]);
}
// Uniform bfloat16 in [0, 1); one word per component.
__device__ __bfloat
rng_uniform_bfloat(const Array<uint32_t, 4>& rng_result, int rng_component) {
  return uniform_bfloat(rng_result[rng_component]);
}
// Uniform double in [from, to): affine transform of rng_uniform.
__device__ double rng_uniform_range(
    const Array<uint32_t, 4>& rng_result,
    int rng_component,
    double from,
    double to) {
  auto range = to - from;
  auto uniform01 = rng_uniform(rng_result, rng_component);
  return from + range * uniform01;
}
// Uniform float in [from, to).
__device__ float rng_uniform_rangef(
    const Array<uint32_t, 4>& rng_result,
    int rng_component,
    float from,
    float to) {
  auto range = to - from;
  auto uniform01 = rng_uniformf(rng_result, rng_component);
  return from + range * uniform01;
}
// Uniform half in [from, to). Uses the raw (0, 1] uniform and then folds
// results that round to exactly `to` back to `from`, keeping the half-open
// interval after the precision loss of the half conversion.
__device__ __half rng_uniform_range_half(
    const Array<uint32_t, 4>& rng_result,
    int rng_component,
    float from,
    float to) {
  auto range = to - from;
  float uniform01 = raw_uniform_float(rng_result[rng_component]);
  __half result = __float2half(from + range * uniform01);
  return __heq(result, __float2half(to)) ? __float2half(from) : result;
}
// Same as rng_uniform_range_half but for bfloat16.
__device__ __bfloat rng_uniform_range_bfloat(
    const Array<uint32_t, 4>& rng_result,
    int rng_component,
    float from,
    float to) {
  auto range = to - from;
  float uniform01 = raw_uniform_float(rng_result[rng_component]);
  __bfloat result = __float2bfloat(from + range * uniform01);
  return __heq(result, __float2bfloat(to)) ? __float2bfloat(from) : result;
}
// Box-Muller transform: turns two uniform words into a standard-normal
// float. Even components take the sin branch, odd components the cos branch,
// so one (u, v) pair yields two independent samples.
__device__ float normalf(unsigned int x, unsigned int y, int rng_component) {
  const float u = uniformf(x);
  const float v = uniformf(y) * 6.2831855f;
  // Both branches share the same radial magnitude.
  const float magnitude = sqrtf(-2.0f * logf(u));
  return rng_component % 2 == 0 ? magnitude * sinf(v) : magnitude * cosf(v);
}
// Double-precision Box-Muller; the two 64-bit uniforms are each assembled
// from two 32-bit words.
__device__ double normal(
    unsigned int x0,
    unsigned int x1,
    unsigned int y0,
    unsigned int y1,
    int rng_component) {
  const double u = uniform(x0, x1);
  const double v = uniform(y0, y1) * 6.2831853071795860;
  const double magnitude = sqrt(-2.0 * log(u));
  return rng_component % 2 == 0 ? magnitude * sin(v) : magnitude * cos(v);
}
// Standard-normal double from a Philox result; all four words feed the
// double-precision Box-Muller, rng_component selects sin/cos branch.
__device__ double rng_normal_standard(
    const Array<uint32_t, 4>& rng_result,
    int rng_component) {
  return normal(
      rng_result[0],
      rng_result[1],
      rng_result[2],
      rng_result[3],
      rng_component);
}
// Standard-normal float; each pair of words yields two samples, so the word
// pair is selected by rng_component / 2.
__device__ float rng_normal_standardf(
    const Array<uint32_t, 4>& rng_result,
    int rng_component) {
  return normalf(
      rng_result[rng_component / 2 * 2],
      rng_result[1 + rng_component / 2 * 2],
      rng_component);
}
// Standard-normal half: computed in float, then converted.
__device__ __half rng_normal_standard_half(
    const Array<uint32_t, 4>& rng_result,
    int rng_component) {
  return __float2half(normalf(
      rng_result[rng_component / 2 * 2],
      rng_result[1 + rng_component / 2 * 2],
      rng_component));
}
// Standard-normal bfloat16: computed in float, then converted.
__device__ __bfloat rng_normal_standard_bfloat(
    const Array<uint32_t, 4>& rng_result,
    int rng_component) {
  return __float2bfloat(normalf(
      rng_result[rng_component / 2 * 2],
      rng_result[1 + rng_component / 2 * 2],
      rng_component));
}
// Normal double with the given mean and standard deviation (affine transform
// of the standard normal).
__device__ double rng_normal_general(
    const Array<uint32_t, 4>& rng_result,
    int rng_component,
    double mean,
    double std) {
  auto normal01 = rng_normal_standard(rng_result, rng_component);
  return normal01 * std + mean;
}
// Normal float with the given mean and standard deviation.
__device__ float rng_normal_generalf(
    const Array<uint32_t, 4>& rng_result,
    int rng_component,
    float mean,
    float std) {
  auto normal01 = rng_normal_standardf(rng_result, rng_component);
  return normal01 * std + mean;
}
// Normal half: the standard normal and the affine transform are both done in
// float before the final conversion.
__device__ __half rng_normal_general_half(
    const Array<uint32_t, 4>& rng_result,
    int rng_component,
    float mean,
    float std) {
  auto normal01 = normalf(
      rng_result[rng_component / 2 * 2],
      rng_result[1 + rng_component / 2 * 2],
      rng_component);
  return __float2half(normal01 * std + mean);
}
// Normal bfloat16: same scheme as the half version.
__device__ __bfloat rng_normal_general_bfloat(
    const Array<uint32_t, 4>& rng_result,
    int rng_component,
    float mean,
    float std) {
  auto normal01 = normalf(
      rng_result[rng_component / 2 * 2],
      rng_result[1 + rng_component / 2 * 2],
      rng_component);
  return __float2bfloat(normal01 * std + mean);
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Defines `nvfuser_zero`, an int whose value is always 0 at runtime
// (nvfuser_zero_s is set to 0 and atomicMin with a non-negative threadIdx.x
// keeps it at 0) but which the compiler cannot constant-fold because it is
// produced through shared memory and an atomic.
#define NVFUSER_DEFINE_MAGIC_ZERO          \
  __shared__ int nvfuser_zero_s;           \
  if (threadIdx.x == 0)                    \
    nvfuser_zero_s = 0;                    \
  __syncthreads();                         \
  atomicMin(&nvfuser_zero_s, threadIdx.x); \
  int nvfuser_zero = nvfuser_zero_s;
// Re-obscures nvfuser_zero; shifting 0 left keeps the value at 0 while
// writing the variable so the compiler still cannot treat it as a constant.
#define NVFUSER_UPDATE_MAGIC_ZERO \
  do {                            \
    nvfuser_zero <<= 1;           \
  } while (0);
#ifdef __NVCC__
#include <assert.h>
#endif // __NVCC__
// Integer ceiling division via (a + b - 1) / b; intended for non-negative a
// and positive b (the usual grid/block sizing case).
__device__ constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}
// Mixed-width overloads promote to int64_t before dividing.
__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
  return ceilDiv(a, (int64_t)b);
}
__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
  return ceilDiv((int64_t)a, b);
}
// Floating-point overloads: true division rounded up.
__device__ constexpr double ceilDiv(double a, double b) {
  return std::ceil(a / b);
}
__device__ constexpr double ceilDiv(double a, int64_t b) {
  return std::ceil(a / b);
}
__device__ constexpr double ceilDiv(int64_t a, double b) {
  return std::ceil(a / b);
}
// Monotonic and precise lerp is described here:
// https://math.stackexchange.com/a/1798323
// Switching formulas at weight == 0.5 keeps the interpolation monotonic in
// `weight` and exact at both endpoints, which the naive single-formula lerp
// does not guarantee in floating point.
__device__ double lerp(double start, double end, double weight) {
  if (weight < 0.5) {
    return start + weight * (end - start);
  } else {
    return end - (end - start) * (1.0 - weight);
  }
}
__device__ float lerp(float start, float end, float weight) {
  if (weight < 0.5f) {
    return start + weight * (end - start);
  } else {
    return end - (end - start) * (1.0f - weight);
  }
}
// Mixed-precision overload: the weight is demoted to float first.
__device__ float lerp(float start, float end, double weight) {
  return lerp(start, end, static_cast<float>(weight));
}
// Integer max overloads; the mixed-width versions promote to int64_t so the
// comparison and the result are both done at 64 bits.
__device__ constexpr int max(int a, int b) {
  return a < b ? b : a;
}
__device__ constexpr int64_t max(int64_t a, int b) {
  return a < (int64_t)b ? (int64_t)b : a;
}
__device__ constexpr int64_t max(int a, int64_t b) {
  return (int64_t)a < b ? b : (int64_t)a;
}
__device__ constexpr int64_t max(int64_t a, int64_t b) {
  return a < b ? b : a;
}
// NaN-propagating max (unlike IEEE fmax, which would return the non-NaN
// operand): if either input is NaN, the result is NaN.
__device__ double fmax(double a, double b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else { // If b is nan, it will be returned in the next line
    // a > NaN is false, so the comparison falls through to b.
    return a > b ? a : b;
  }
}
__device__ float fmax(float a, float b) {
  // check and propagate NaN
  if (a != a) {
    return a;
  } else { // If b is nan, it will be returned in the next line
    return a > b ? a : b;
  }
}
// Integer min overloads; the mixed-width versions promote to int64_t so the
// comparison and the result are both done at 64 bits.
__device__ constexpr int min(int a, int b) {
  return b < a ? b : a;
}
__device__ constexpr int64_t min(int64_t a, int b) {
  return b < (int64_t)a ? b : (int64_t)a;
}
__device__ constexpr int64_t min(int a, int64_t b) {
  return (int64_t)b < a ? (int64_t)b : a;
}
__device__ constexpr int64_t min(int64_t a, int64_t b) {
  return b < a ? b : a;
}
// NaN-propagating min (unlike IEEE fmin, which would return the non-NaN
// operand): if either input is NaN, the result is NaN.
__device__ double fmin(double a, double b) {
  // check and propagate NaN
  if (b != b) {
    return b;
  } else { // If a is nan, it will be returned in the next line
    // NaN > b is false, so the comparison falls through to a.
    return a > b ? b : a;
  }
}
__device__ float fmin(float a, float b) {
  // check and propagate NaN
  if (b != b) {
    return b;
  } else { // If a is nan, it will be returned in the next line
    return a > b ? b : a;
  }
}
// Rounds `buffer` up to the next multiple of `size`. The mask trick is only
// correct when `size` is a power of two.
__device__ constexpr int alignBufferSize(int buffer, int size) {
  return (buffer + (size - 1)) & ~(size - 1);
}
// Clamps x to [minv, maxv] using the NaN-propagating fmin/fmax above.
__device__ double clamp(double x, double minv, double maxv) {
  return fmin(fmax(x, minv), maxv);
}
// Note: the float overload takes double bounds and returns a float; the
// intermediate computation is done in double.
__device__ float clamp(float x, double minv, double maxv) {
  return fmin(fmax((double)x, minv), maxv);
}
// Integer clamps compute at 64 bits and narrow back to the input width.
__device__ int clamp(int x, int64_t minv, int64_t maxv) {
  return min(max((int64_t)x, minv), maxv);
}
__device__ int64_t clamp(int64_t x, int64_t minv, int64_t maxv) {
  return min(max(x, minv), maxv);
}
// Fractional part of x, keeping the sign of x (e.g. frac(-1.25) == -0.25).
__device__ double frac(double x) {
  const double whole = trunc(x);
  return x - whole;
}
__device__ float frac(float x) {
  const float whole = trunc(x);
  return x - whole;
}
// Multiplicative inverse, 1 / x.
__device__ double reciprocal(double x) {
  return 1.0 / x;
}
__device__ float reciprocal(float x) {
  return 1.0f / x;
}
// Rectified linear unit: zero for x <= 0, identity otherwise. The guard is
// written as `x <= 0` so that a NaN input (for which the comparison is
// false) passes through unchanged.
__device__ double relu(double x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
__device__ float relu(float x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
// Integer inputs produce a float result.
__device__ float relu(int64_t x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
__device__ float relu(int x) {
  if (x <= 0) {
    return 0;
  }
  return x;
}
// Python-style floating-point modulo: the result has the sign of the
// divisor b (unlike fmod, whose result has the sign of a).
__device__ double remainder(double a, double b) {
  double mod = ::fmod(a, b);
  const bool sign_mismatch = (b < 0) != (mod < 0);
  if (mod != 0 && sign_mismatch) {
    mod += b;
  }
  return mod;
}
__device__ float remainder(float a, float b) {
  float mod = ::fmod(a, b);
  const bool sign_mismatch = (b < 0) != (mod < 0);
  if (mod != 0 && sign_mismatch) {
    mod += b;
  }
  return mod;
}
// Logistic sigmoid: 1 / (1 + e^-x).
__device__ double sigmoid(double x) {
  const double denom = 1.0 + exp(-x);
  return 1.0 / denom;
}
__device__ float sigmoid(float x) {
  const float denom = 1.0f + exp(-x);
  return 1.0f / denom;
}
// SiLU (swish) activation: x * sigmoid(x).
__device__ double silu(double x) {
  return sigmoid(x) * x;
}
__device__ float silu(float x) {
  return sigmoid(x) * x;
}
// Returns v when x <= t, otherwise x. A NaN x fails the comparison and is
// returned unchanged.
__device__ double threshold(double x, double t, double v) {
  if (x <= t) {
    return v;
  }
  return x;
}
// Note: the threshold/value arguments are doubles even for a float x.
__device__ float threshold(float x, double t, double v) {
  if (x <= t) {
    return v;
  }
  return x;
}
// Integer versions compare against 64-bit arguments and narrow the result
// back to the input width.
__device__ int threshold(int x, int64_t t, int64_t v) {
  if (x <= t) {
    return v;
  }
  return x;
}
__device__ int64_t threshold(int64_t x, int64_t t, int64_t v) {
  if (x <= t) {
    return v;
  }
  return x;
}
// Python-style integer modulo: the result has the sign of the divisor b
// (unlike %, whose result has the sign of a).
__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
  int64_t mod = a % b;
  const bool sign_mismatch = (b < 0) != (mod < 0);
  if (mod != 0 && sign_mismatch) {
    mod += b;
  }
  return mod;
}
__device__ constexpr int remainder(int a, int b) {
  int mod = a % b;
  const bool sign_mismatch = (b < 0) != (mod < 0);
  if (mod != 0 && sign_mismatch) {
    mod += b;
  }
  return mod;
}
// fmod overloads: truncated remainder (sign follows a), matching C's fmod.
// Integer versions are plain %.
__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
  return a % b;
}
__device__ constexpr int fmod(int a, int b) {
  return a % b;
}
__device__ constexpr double fmod(double a, double b) {
  return ::fmod(a, b);
}
__device__ constexpr float fmod(float a, float b) {
  return ::fmod(a, b);
}
// Next representable value after a in the direction of b.
__device__ constexpr double nextafter(double a, double b) {
  return ::nextafter(a, b);
}
__device__ constexpr float nextafter(float a, float b) {
  return ::nextafterf(a, b);
}
// Integer exponentiation by squaring. For a negative exponent the true
// result has magnitude <= 1, so only |a| == 1 produces a nonzero value:
// 1^b == 1, (-1)^b alternates with the parity of b, everything else
// truncates to 0.
template <typename T>
__device__ T pow(T a, T b) {
  if (b < 0) {
    if (a == 1) {
      return 1;
    }
    if (a == -1) {
      auto negative = (-b) % static_cast<T>(2);
      return negative ? -1 : 1;
    }
    return 0;
  }
  // Square-and-multiply: consume one exponent bit per iteration.
  T result = 1;
  for (; b; b /= 2) {
    if (b & 1) {
      result *= a;
    }
    a *= a;
  }
  return result;
}
// Explicit instantiations of the integer pow template above.
template __device__ int pow<int>(int a, int b);
template __device__ int64_t pow<int64_t>(int64_t a, int64_t b);
// Floating-point specializations delegate to the math-library pow.
template <>
__device__ float pow<float>(float a, float b) {
  return ::pow(a, b);
}
template <>
__device__ double pow<double>(double a, double b) {
  return ::pow(a, b);
}
// Mixed-type overloads: an integer exponent with a floating base is
// converted to the base's type; mixed integer widths promote to int64_t.
__device__ float pow(float a, int b) {
  return pow(a, (float)b);
}
__device__ double pow(double a, int b) {
  return pow(a, (double)b);
}
__device__ float pow(float a, int64_t b) {
  return pow(a, (float)b);
}
__device__ double pow(double a, int64_t b) {
  return pow(a, (double)b);
}
__device__ int64_t pow(int64_t a, int b) {
  return pow(a, (int64_t)b);
}
__device__ int64_t pow(int a, int64_t b) {
  return pow((int64_t)a, b);
}
// Reciprocal square root, 1 / sqrt(z), via the CUDA math intrinsics.
__device__ double rsqrt(double z) {
  return ::rsqrt(z);
}
__device__ float rsqrt(float z) {
  return ::rsqrtf(z);
}
// Integer overloads compute in floating point and then convert the result
// back to the integer return type (truncating; e.g. any z > 1 yields 0).
__device__ int rsqrt(int z) {
  return ::rsqrtf((float)z);
}
__device__ int64_t rsqrt(int64_t z) {
  return ::rsqrt((double)z);
}
// Sign-bit tests. Note the floating-point overloads return their own type
// (the bool from ::signbit is converted to 0.0/1.0).
__device__ double signbit(double a) {
  return ::signbit(a);
}
__device__ float signbit(float a) {
  return ::signbit(a);
}
// Integer versions: true only for strictly negative values (integers have no
// signed zero).
__device__ int signbit(int a) {
  return a < 0;
}
__device__ int64_t signbit(int64_t a) {
  return a < 0;
}
// Greatest common divisor of |a| and |b| via the Euclidean algorithm;
// gcd(0, 0) == 0.
// Reference:
// https://en.wikipedia.org/wiki/Euclidean_algorithm#Implementations
// https://github.com/pytorch/pytorch/blob/c9f4f01981fd73fcc7c27676cc50230cd1b5bc22/aten/src/ATen/native/Math.h#L1232
template <typename T>
__device__ T gcd(T a, T b) {
  T x = abs(a);
  T y = abs(b);
  while (y != 0) {
    const T r = x % y;
    x = y;
    y = r;
  }
  return x;
}
// Finiteness test; the generic version defers to the math library.
template <typename T>
bool isfinite(T x) {
  return ::isfinite(x);
}
// bfloat16: finite iff the biased exponent field (bits 14..7) is not all
// ones.
// ref:
// https://github.com/NVIDIA/cutlass/blob/6fbc0d33800008d3180d3fefed4e1a653e5f72a0/include/cutlass/bfloat16.h#L213
template <>
bool isfinite<__bfloat>(__bfloat x) {
  const auto exponent_biased = int((x.raw() >> 7) & 0x0ff);
  return exponent_biased != 0x0ff;
}
// half: finite iff the biased exponent field (bits 14..10) is not all ones.
// ref:
// https://github.com/NVIDIA/cutlass/blob/6fbc0d33800008d3180d3fefed4e1a653e5f72a0/include/cutlass/half.h#L511
template <>
bool isfinite<__half>(__half x) {
  const auto exponent_biased = int((x.raw() >> 10) & 0x1f);
  return exponent_biased != 0x1f;
}
// Infinity test; defers to the math library.
template <typename T>
bool isinf(T x) {
  return ::isinf(x);
}
////////////////////////////////////////////////////////////
// TODO: the following overloads are only needed for CUDA //
// 10.2 Please remove when CUDA 10.2 support is dropped //
////////////////////////////////////////////////////////////
// Integral types can never be infinite / are always finite, so these
// overloads return constants.
bool isinf(int64_t x) {
  return false;
}
bool isinf(int x) {
  return false;
}
bool isinf(short x) {
  return false;
}
bool isinf(char x) {
  return false;
}
bool isinf(unsigned char x) {
  return false;
}
bool isinf(bool x) {
  return false;
}
bool isfinite(int64_t x) {
  return true;
}
bool isfinite(int x) {
  return true;
}
bool isfinite(short x) {
  return true;
}
bool isfinite(char x) {
  return true;
}
bool isfinite(unsigned char x) {
  return true;
}
bool isfinite(bool x) {
  return true;
}
////////////////////////////////////////////////////////////
// End TODO //
////////////////////////////////////////////////////////////
// NaN test via the IEEE self-inequality property (NaN != NaN).
template <typename T>
bool isnan(T x) {
  return x != x;
}
// Negative infinity: infinite and below zero.
template <typename T>
bool isneginf(T x) {
  return x < 0 && isinf(x);
}
// Positive infinity: infinite and above zero.
template <typename T>
bool isposinf(T x) {
  return x > 0 && isinf(x);
}
// All supported scalar types here are real, so this is constant true.
template <typename T>
bool isreal(T x) {
  return true;
}
// Return the current value of the cycle counter
__device__ inline int64_t readCycleCounter() {
  // Ensures preceding memory operations are completed. Doing this
  // would make sense for measuring elapsed times enclosed with this
  // function.
  __threadfence();
  return clock64();
}
// Debug helpers used by the print(...) macro below. Each overload prints a
// named value together with the thread/block coordinates that produced it,
// and returns the value unchanged so the call can be spliced into any
// expression without altering results.
__device__ float print_impl(const char* name, float value) {
  printf(
      "%s = %f @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
      name,
      value,
      (int)threadIdx.x,
      (int)threadIdx.y,
      (int)threadIdx.z,
      (int)blockIdx.x,
      (int)blockIdx.y,
      (int)blockIdx.z);
  return value;
}
__device__ double print_impl(const char* name, double value) {
  printf(
      "%s = %lf @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
      name,
      value,
      (int)threadIdx.x,
      (int)threadIdx.y,
      (int)threadIdx.z,
      (int)blockIdx.x,
      (int)blockIdx.y,
      (int)blockIdx.z);
  return value;
}
__device__ int print_impl(const char* name, int value) {
  printf(
      "%s = %d @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
      name,
      value,
      (int)threadIdx.x,
      (int)threadIdx.y,
      (int)threadIdx.z,
      (int)blockIdx.x,
      (int)blockIdx.y,
      (int)blockIdx.z);
  return value;
}
__device__ int64_t print_impl(const char* name, int64_t value) {
  printf(
      "%s = %ld @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
      name,
      value,
      (int)threadIdx.x,
      (int)threadIdx.y,
      (int)threadIdx.z,
      (int)blockIdx.x,
      (int)blockIdx.y,
      (int)blockIdx.z);
  return value;
}
__device__ bool print_impl(const char* name, bool value) {
  printf(
      "%s = %s @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
      name,
      value ? "true" : "false",
      (int)threadIdx.x,
      (int)threadIdx.y,
      (int)threadIdx.z,
      (int)blockIdx.x,
      (int)blockIdx.y,
      (int)blockIdx.z);
  return value;
}
// Reduced-precision values are widened to float for printf.
__device__ __half print_impl(const char* name, __half value) {
  printf(
      "%s = %f @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
      name,
      __half2float(value),
      (int)threadIdx.x,
      (int)threadIdx.y,
      (int)threadIdx.z,
      (int)blockIdx.x,
      (int)blockIdx.y,
      (int)blockIdx.z);
  return value;
}
#if __CUDACC_VER_MAJOR__ >= 11
__device__ __bfloat print_impl(const char* name, __bfloat value) {
  printf(
      "%s = %f @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
      name,
      __bfloat2float(value),
      (int)threadIdx.x,
      (int)threadIdx.y,
      (int)threadIdx.z,
      (int)blockIdx.x,
      (int)blockIdx.y,
      (int)blockIdx.z);
  return value;
}
#endif
// Stringifies the expression so the printed line shows both the source text
// and its value.
#define print(...) print_impl(#__VA_ARGS__, (__VA_ARGS__))
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace index_utils {

// Utility functions
// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
  return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}

// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate. Dimensions are folded in z, y, x order, so x is the
// fastest-varying.
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = 0;
  if (Z)
    offset += idx.z;
  if (Y)
    offset = offset * dim.y + idx.y;
  if (X)
    offset = offset * dim.x + idx.x;
  return offset;
}

// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
  nvfuser_index_t offset = idx.z;
  offset = offset * dim.y + idx.y;
  offset = offset * dim.x + idx.x;
  return offset;
}

// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
  return dim3{
      X ? (unsigned)dim.x : 1U,
      Y ? (unsigned)dim.y : 1U,
      Z ? (unsigned)dim.z : 1U};
}

// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
  return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}

// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == 0;
  if (Y)
    isZero = isZero && idx.y == 0;
  if (Z)
    isZero = isZero && idx.z == 0;
  return isZero;
}

// Checks if provided idx is the last position (dim - 1) on those dims == true.
// (The local variable is named isZero only because this mirrors maskedIsZero.)
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
  bool isZero = true;
  if (X)
    isZero = isZero && idx.x == dim.x - 1;
  if (Y)
    isZero = isZero && idx.y == dim.y - 1;
  if (Z)
    isZero = isZero && idx.z == dim.z - 1;
  return isZero;
}

} // namespace index_utils
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// std::tuple-like type
template <typename... Types>
struct Tuple;

// Advances the idx-th tuple element by `offset`. Only valid when the element
// type is a pointer, which the static_assert enforces.
#define TUPLE_INCREMENT_PTR(idx)                                      \
  do {                                                                \
    static_assert(                                                    \
        IsPointerType<T##idx>::value, "Invalid for non-pointer types"); \
    val##idx += offset;                                               \
  } while (0)
// Fixed-arity specializations of Tuple. Each one stores its elements as
// plain members val0..valN (accessed via the get<> helper below) instead of
// using recursive tuple machinery. operator+= advances every element by
// `offset` and is only valid when all element types are pointers (enforced
// by TUPLE_INCREMENT_PTR). Only arities 1-8 and 16 are provided.
template <typename T0>
struct Tuple<T0> {
  T0 val0;

  Tuple() = default;

  __device__ Tuple(T0 _val0) : val0(_val0) {}

  // Only valid when instantiated for pointer types
  __device__ void operator+=(nvfuser_index_t offset) {
    TUPLE_INCREMENT_PTR(0);
  }
};

template <typename T0, typename T1>
struct Tuple<T0, T1> {
  T0 val0;
  T1 val1;

  Tuple() = default;

  __device__ Tuple(T0 _val0, T1 _val1) : val0(_val0), val1(_val1) {}

  // Only valid when instantiated for pointer types
  __device__ void operator+=(nvfuser_index_t offset) {
    TUPLE_INCREMENT_PTR(0);
    TUPLE_INCREMENT_PTR(1);
  }
};

template <typename T0, typename T1, typename T2>
struct Tuple<T0, T1, T2> {
  T0 val0;
  T1 val1;
  T2 val2;

  Tuple() = default;

  __device__ Tuple(T0 _val0, T1 _val1, T2 _val2)
      : val0(_val0), val1(_val1), val2(_val2) {}

  // Only valid when instantiated for pointer types
  __device__ void operator+=(nvfuser_index_t offset) {
    TUPLE_INCREMENT_PTR(0);
    TUPLE_INCREMENT_PTR(1);
    TUPLE_INCREMENT_PTR(2);
  }
};

template <typename T0, typename T1, typename T2, typename T3>
struct Tuple<T0, T1, T2, T3> {
  T0 val0;
  T1 val1;
  T2 val2;
  T3 val3;

  Tuple() = default;

  __device__ Tuple(T0 _val0, T1 _val1, T2 _val2, T3 _val3)
      : val0(_val0), val1(_val1), val2(_val2), val3(_val3) {}

  // Only valid when instantiated for pointer types
  __device__ void operator+=(nvfuser_index_t offset) {
    TUPLE_INCREMENT_PTR(0);
    TUPLE_INCREMENT_PTR(1);
    TUPLE_INCREMENT_PTR(2);
    TUPLE_INCREMENT_PTR(3);
  }
};

template <typename T0, typename T1, typename T2, typename T3, typename T4>
struct Tuple<T0, T1, T2, T3, T4> {
  T0 val0;
  T1 val1;
  T2 val2;
  T3 val3;
  T4 val4;

  Tuple() = default;

  __device__ Tuple(T0 _val0, T1 _val1, T2 _val2, T3 _val3, T4 _val4)
      : val0(_val0), val1(_val1), val2(_val2), val3(_val3), val4(_val4) {}

  // Only valid when instantiated for pointer types
  __device__ void operator+=(nvfuser_index_t offset) {
    TUPLE_INCREMENT_PTR(0);
    TUPLE_INCREMENT_PTR(1);
    TUPLE_INCREMENT_PTR(2);
    TUPLE_INCREMENT_PTR(3);
    TUPLE_INCREMENT_PTR(4);
  }
};

template <
    typename T0,
    typename T1,
    typename T2,
    typename T3,
    typename T4,
    typename T5>
struct Tuple<T0, T1, T2, T3, T4, T5> {
  T0 val0;
  T1 val1;
  T2 val2;
  T3 val3;
  T4 val4;
  T5 val5;

  Tuple() = default;

  __device__ Tuple(T0 _val0, T1 _val1, T2 _val2, T3 _val3, T4 _val4, T5 _val5)
      : val0(_val0),
        val1(_val1),
        val2(_val2),
        val3(_val3),
        val4(_val4),
        val5(_val5) {}

  // Only valid when instantiated for pointer types
  __device__ void operator+=(nvfuser_index_t offset) {
    TUPLE_INCREMENT_PTR(0);
    TUPLE_INCREMENT_PTR(1);
    TUPLE_INCREMENT_PTR(2);
    TUPLE_INCREMENT_PTR(3);
    TUPLE_INCREMENT_PTR(4);
    TUPLE_INCREMENT_PTR(5);
  }
};

template <
    typename T0,
    typename T1,
    typename T2,
    typename T3,
    typename T4,
    typename T5,
    typename T6>
struct Tuple<T0, T1, T2, T3, T4, T5, T6> {
  T0 val0;
  T1 val1;
  T2 val2;
  T3 val3;
  T4 val4;
  T5 val5;
  T6 val6;

  Tuple() = default;

  __device__ Tuple(
      T0 _val0,
      T1 _val1,
      T2 _val2,
      T3 _val3,
      T4 _val4,
      T5 _val5,
      T6 _val6)
      : val0(_val0),
        val1(_val1),
        val2(_val2),
        val3(_val3),
        val4(_val4),
        val5(_val5),
        val6(_val6) {}

  // Only valid when instantiated for pointer types
  __device__ void operator+=(nvfuser_index_t offset) {
    TUPLE_INCREMENT_PTR(0);
    TUPLE_INCREMENT_PTR(1);
    TUPLE_INCREMENT_PTR(2);
    TUPLE_INCREMENT_PTR(3);
    TUPLE_INCREMENT_PTR(4);
    TUPLE_INCREMENT_PTR(5);
    TUPLE_INCREMENT_PTR(6);
  }
};

template <
    typename T0,
    typename T1,
    typename T2,
    typename T3,
    typename T4,
    typename T5,
    typename T6,
    typename T7>
struct Tuple<T0, T1, T2, T3, T4, T5, T6, T7> {
  T0 val0;
  T1 val1;
  T2 val2;
  T3 val3;
  T4 val4;
  T5 val5;
  T6 val6;
  T7 val7;

  Tuple() = default;

  __device__ Tuple(
      T0 _val0,
      T1 _val1,
      T2 _val2,
      T3 _val3,
      T4 _val4,
      T5 _val5,
      T6 _val6,
      T7 _val7)
      : val0(_val0),
        val1(_val1),
        val2(_val2),
        val3(_val3),
        val4(_val4),
        val5(_val5),
        val6(_val6),
        val7(_val7) {}

  // Only valid when instantiated for pointer types
  __device__ void operator+=(nvfuser_index_t offset) {
    TUPLE_INCREMENT_PTR(0);
    TUPLE_INCREMENT_PTR(1);
    TUPLE_INCREMENT_PTR(2);
    TUPLE_INCREMENT_PTR(3);
    TUPLE_INCREMENT_PTR(4);
    TUPLE_INCREMENT_PTR(5);
    TUPLE_INCREMENT_PTR(6);
    TUPLE_INCREMENT_PTR(7);
  }
};

// Arities 9-15 are intentionally not provided; 16 is the next size used.
template <
    typename T0,
    typename T1,
    typename T2,
    typename T3,
    typename T4,
    typename T5,
    typename T6,
    typename T7,
    typename T8,
    typename T9,
    typename T10,
    typename T11,
    typename T12,
    typename T13,
    typename T14,
    typename T15>
struct Tuple<
    T0,
    T1,
    T2,
    T3,
    T4,
    T5,
    T6,
    T7,
    T8,
    T9,
    T10,
    T11,
    T12,
    T13,
    T14,
    T15> {
  T0 val0;
  T1 val1;
  T2 val2;
  T3 val3;
  T4 val4;
  T5 val5;
  T6 val6;
  T7 val7;
  T8 val8;
  T9 val9;
  T10 val10;
  T11 val11;
  T12 val12;
  T13 val13;
  T14 val14;
  T15 val15;

  Tuple() = default;

  __device__ Tuple(
      T0 _val0,
      T1 _val1,
      T2 _val2,
      T3 _val3,
      T4 _val4,
      T5 _val5,
      T6 _val6,
      T7 _val7,
      T8 _val8,
      T9 _val9,
      T10 _val10,
      T11 _val11,
      T12 _val12,
      T13 _val13,
      T14 _val14,
      T15 _val15)
      : val0(_val0),
        val1(_val1),
        val2(_val2),
        val3(_val3),
        val4(_val4),
        val5(_val5),
        val6(_val6),
        val7(_val7),
        val8(_val8),
        val9(_val9),
        val10(_val10),
        val11(_val11),
        val12(_val12),
        val13(_val13),
        val14(_val14),
        val15(_val15) {}

  // Only valid when instantiated for pointer types
  __device__ void operator+=(nvfuser_index_t offset) {
    TUPLE_INCREMENT_PTR(0);
    TUPLE_INCREMENT_PTR(1);
    TUPLE_INCREMENT_PTR(2);
    TUPLE_INCREMENT_PTR(3);
    TUPLE_INCREMENT_PTR(4);
    TUPLE_INCREMENT_PTR(5);
    TUPLE_INCREMENT_PTR(6);
    TUPLE_INCREMENT_PTR(7);
    TUPLE_INCREMENT_PTR(8);
    TUPLE_INCREMENT_PTR(9);
    TUPLE_INCREMENT_PTR(10);
    TUPLE_INCREMENT_PTR(11);
    TUPLE_INCREMENT_PTR(12);
    TUPLE_INCREMENT_PTR(13);
    TUPLE_INCREMENT_PTR(14);
    TUPLE_INCREMENT_PTR(15);
  }
};

#undef TUPLE_INCREMENT_PTR
// Accessor for Tuple
// get<idx>()(tuple) returns a (const) reference to the tuple's val<idx>
// member. Specializations are generated below for indices 0-15.
template <int idx>
struct get;
#define DEFINE_TUPLE_GET(idx) \
template <> \
struct get<idx> { \
template <typename Tuple> \
__device__ auto& operator()(Tuple& vals) { \
return vals.val##idx; \
} \
template <typename Tuple> \
__device__ const auto& operator()(const Tuple& vals) { \
return vals.val##idx; \
} \
};
DEFINE_TUPLE_GET(0);
DEFINE_TUPLE_GET(1);
DEFINE_TUPLE_GET(2);
DEFINE_TUPLE_GET(3);
DEFINE_TUPLE_GET(4);
DEFINE_TUPLE_GET(5);
DEFINE_TUPLE_GET(6);
DEFINE_TUPLE_GET(7);
DEFINE_TUPLE_GET(8);
DEFINE_TUPLE_GET(9);
DEFINE_TUPLE_GET(10);
DEFINE_TUPLE_GET(11);
DEFINE_TUPLE_GET(12);
DEFINE_TUPLE_GET(13);
DEFINE_TUPLE_GET(14);
DEFINE_TUPLE_GET(15);
#undef DEFINE_TUPLE_GET
// Copy all entries of src (read at src_offset) into dst (written at
// dst_offset). Declared here so the tuple classes below can use it;
// defined after them.
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyTuple(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset = 0);
// Overload with dst_offset fixed to 0.
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyTuple(
DstType& dst,
const SrcType& src,
nvfuser_index_t src_offset = 0);
// Broadcast src into every entry of dst. Requires all value types of
// DstType to equal ValType<0> (enforced by TupleSet's static_assert).
template <typename DstType>
__inline__ __device__ static void setTuple(
DstType& dst,
typename DstType::template ValType<0> src);
// Tuple whose elements are stored by value (thread-local). Convertible
// from and assignable to any other tuple flavor holding the same value
// types via element-wise copyTuple.
template <typename... Types>
class LocalTuple {
 public:
  static constexpr int num_vals = sizeof...(Types);
  using ValTypes = TypeList<Types...>;
  template <int idx>
  using ValType = typename TypeSelector<idx, Types...>::type;

  LocalTuple() = default;

  __device__ explicit LocalTuple(Types... args) : vals_(args...) {}

  __device__ LocalTuple(const LocalTuple& other) : vals_(other.vals_) {}

  // Element-wise copy from any tuple type with matching value types.
  template <template <typename...> typename TupleType>
  __device__ LocalTuple(const TupleType<Types...>& src) {
    copyTuple(*this, src);
  }

  __device__ LocalTuple& operator=(const LocalTuple<Types...>& src) {
    copyTuple(*this, src);
    return *this;
  }

  template <template <typename...> typename TupleType>
  __device__ LocalTuple& operator=(const TupleType<Types...>& src) {
    copyTuple(*this, src);
    return *this;
  }

  // The offset parameter exists only for interface parity with
  // PtrTupleBase::val; it is unused here since values are held directly.
  template <int val_idx>
  __device__ auto& val(nvfuser_index_t /*ptr_offset*/ = 0) {
    static_assert(val_idx < num_vals, "Out-of-range value index");
    return get<val_idx>()(vals_);
  }

  template <int val_idx>
  __device__ const auto& val(nvfuser_index_t /*ptr_offset*/ = 0) const {
    static_assert(val_idx < num_vals, "Out-of-range value index");
    return get<val_idx>()(vals_);
  }

 private:
  Tuple<Types...> vals_;
};
// Tuple of pointers, optionally treated as pointers-to-volatile. Unlike
// LocalTuple, val<i>(ptr_offset) dereferences the i-th pointer at
// ptr_offset elements, so the offset argument is actually used.
template <bool is_volatile, typename... Types>
class PtrTupleBase {
public:
static constexpr int num_vals = sizeof...(Types);
using ValTypes = TypeList<Types...>;
template <int idx>
using ValType = typename TypeSelector<idx, Types...>::type;
// Value type of entry val_idx, volatile-qualified when is_volatile.
template <int val_idx>
using TypeIMaybeVolatile = typename MaybeVolatile<
typename TypeSelector<val_idx, Types...>::type,
is_volatile>::type;
__device__ PtrTupleBase(Types*... args) : vals_(args...) {}
__device__ PtrTupleBase(const PtrTupleBase& other) : vals_(other.vals_) {}
// Note: this is a deep copy
__device__ PtrTupleBase& operator=(
const PtrTupleBase<is_volatile, Types...>& other) {
copyTuple(*this, other);
return *this;
}
template <template <typename...> typename TupleType>
__device__ PtrTupleBase& operator=(const TupleType<Types...>& other) {
copyTuple(*this, other);
return *this;
}
// Reference to the element ptr_offset past the val_idx-th pointer.
template <int val_idx>
__device__ TypeIMaybeVolatile<val_idx>& val(nvfuser_index_t ptr_offset = 0) {
static_assert(val_idx < num_vals, "Out-of-range value index");
return ((TypeIMaybeVolatile<val_idx>*)get<val_idx>()(vals_))[ptr_offset];
}
template <int val_idx>
__device__ const TypeIMaybeVolatile<val_idx>& val(
nvfuser_index_t ptr_offset = 0) const {
static_assert(val_idx < num_vals, "Out-of-range value index");
return ((TypeIMaybeVolatile<val_idx>*)get<val_idx>()(vals_))[ptr_offset];
}
// Advance every pointer by ptr_offset elements (see Tuple::operator+=).
__device__ void operator+=(nvfuser_index_t ptr_offset) {
vals_ += ptr_offset;
}
private:
Tuple<Types*...> vals_;
};
// Tuple of lvalue references to caller-owned values. Assignment writes
// through the references element-wise; the referenced storage itself is
// shared with the caller, not duplicated.
template <typename... Types>
class RefTuple {
 public:
  static constexpr int num_vals = sizeof...(Types);
  using ValTypes = TypeList<Types...>;
  template <int idx>
  using ValType = typename TypeSelector<idx, Types...>::type;

  __device__ RefTuple(Types&... args) : vals_(args...) {}

  __device__ RefTuple(const RefTuple& other) : vals_(other.vals_) {}

  // Element-wise copy from any tuple type with matching value types.
  template <template <typename...> typename TupleType>
  __device__ RefTuple(const TupleType<Types...>& src) {
    copyTuple(*this, src);
  }

  __device__ RefTuple& operator=(const RefTuple<Types...>& src) {
    copyTuple(*this, src);
    return *this;
  }

  template <template <typename...> typename TupleType>
  __device__ RefTuple& operator=(const TupleType<Types...>& src) {
    copyTuple(*this, src);
    return *this;
  }

  // The offset parameter exists only for interface parity with
  // PtrTupleBase::val; it is unused here.
  template <int val_idx>
  __device__ auto& val(nvfuser_index_t /*ptr_offset*/ = 0) {
    static_assert(val_idx < num_vals, "Out-of-range value index");
    return get<val_idx>()(vals_);
  }

  template <int val_idx>
  __device__ const auto& val(nvfuser_index_t /*ptr_offset*/ = 0) const {
    static_assert(val_idx < num_vals, "Out-of-range value index");
    return get<val_idx>()(vals_);
  }

 private:
  Tuple<Types&...> vals_;
};
// Recursive helper for copyTuple: copies entries [0, num_vals) from src
// (read at src_offset) into dst (written at dst_offset), peeling off entry
// num_vals - 1 at each recursion level.
template <typename DstType, typename SrcType, int num_vals>
struct TupleCopy {
__inline__ __device__ static void copy(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset) {
static_assert(
IsSameType<typename DstType::ValTypes, typename SrcType::ValTypes>::
value,
"Invalid value types");
TupleCopy<DstType, SrcType, num_vals - 1>::copy(
dst, dst_offset, src, src_offset);
dst.val<num_vals - 1>(dst_offset) = src.val<num_vals - 1>(src_offset);
}
};
// Recursion base case: nothing left to copy.
template <typename DstType, typename SrcType>
struct TupleCopy<DstType, SrcType, 0> {
__inline__ __device__ static void copy(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset) {}
};
// Copy all entries of src (read at src_offset) into dst (written at
// dst_offset). Dst and src must hold the same value types.
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyTuple(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset) {
static_assert(
IsSameType<typename DstType::ValTypes, typename SrcType::ValTypes>::value,
"Invalid value types");
TupleCopy<DstType, SrcType, DstType::num_vals>::copy(
dst, dst_offset, src, src_offset);
};
// Overload with dst_offset fixed to 0.
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyTuple(
DstType& dst,
const SrcType& src,
nvfuser_index_t src_offset) {
copyTuple<DstType, SrcType>(dst, 0, src, src_offset);
};
// Recursive helper for setTuple: broadcasts src into entries
// [0, num_vals) of dst. All value types of DstType must be identical
// (checked entry-by-entry against ValType<0>).
template <typename DstType, int num_vals>
struct TupleSet {
__inline__ __device__ static void set(
DstType& dst,
nvfuser_index_t dst_offset,
typename DstType::template ValType<0> src) {
static_assert(
IsSameType<
typename DstType::template ValType<num_vals - 1>,
typename DstType::template ValType<0>>::value,
"Invalid value types");
TupleSet<DstType, num_vals - 1>::set(dst, dst_offset, src);
dst.val<num_vals - 1>(dst_offset) = src;
}
};
// Recursion base case: nothing left to set.
template <typename DstType>
struct TupleSet<DstType, 0> {
__inline__ __device__ static void set(
DstType& dst,
nvfuser_index_t dst_offset,
typename DstType::template ValType<0> src) {}
};
// Broadcast src into every entry of dst at dst_offset.
template <typename DstType>
__inline__ __device__ static void setTuple(
DstType& dst,
nvfuser_index_t dst_offset,
typename DstType::template ValType<0> src) {
TupleSet<DstType, DstType::num_vals>::set(dst, dst_offset, src);
};
// Overload with dst_offset fixed to 0.
template <typename DstType>
__inline__ __device__ static void setTuple(
DstType& dst,
typename DstType::template ValType<0> src) {
setTuple(dst, 0, src);
};
// Recursive helper for copyTupleIf: copies entry i from src to dst only
// when pred.val<i> is true. Note the predicate is always read at offset 0,
// regardless of dst_offset/src_offset.
template <typename DstType, typename SrcType, typename PredType, int num_vals>
struct PredicatedTupleCopy {
__inline__ __device__ static void copy(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset,
const PredType& pred) {
static_assert(
IsSameType<typename PredType::template ValType<num_vals - 1>, bool>::
value,
"Invalid predicate type");
PredicatedTupleCopy<DstType, SrcType, PredType, num_vals - 1>::copy(
dst, dst_offset, src, src_offset, pred);
if (pred.val<num_vals - 1>(0)) {
dst.val<num_vals - 1>(dst_offset) = src.val<num_vals - 1>(src_offset);
}
}
};
// Recursion base case: nothing left to copy.
template <typename DstType, typename SrcType, typename PredType>
struct PredicatedTupleCopy<DstType, SrcType, PredType, 0> {
__inline__ __device__ static void copy(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset,
const PredType& pred) {}
};
// Per-entry predicated copy: dst[i] = src[i] only where pred[i] is true.
// pred must be a bool tuple with the same number of entries as dst.
template <typename DstType, typename SrcType, typename PredType>
__inline__ __device__ static void copyTupleIf(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset,
const PredType& pred) {
static_assert(
IsSameType<typename DstType::ValTypes, typename SrcType::ValTypes>::value,
"Invalid value types");
static_assert(
PredType::num_vals == DstType::num_vals, "Invalid predicate type");
PredicatedTupleCopy<DstType, SrcType, PredType, DstType::num_vals>::copy(
dst, dst_offset, src, src_offset, pred);
};
// Overload with dst_offset fixed to 0.
template <typename DstType, typename SrcType, typename PredType>
__inline__ __device__ static void copyTupleIf(
DstType& dst,
const SrcType& src,
nvfuser_index_t src_offset,
const PredType& pred) {
copyTupleIf(dst, 0, src, src_offset, pred);
};
// Overload with both offsets fixed to 0.
template <typename DstType, typename SrcType, typename PredType>
__inline__ __device__ static void copyTupleIf(
DstType& dst,
const SrcType& src,
const PredType& pred) {
copyTupleIf(dst, 0, src, 0, pred);
};
// Can a generic const and non-const RefTuple be defined?
// Tuple of const references: the read-only counterpart of RefTuple.
// Constructible from other tuple flavors but never assignable.
template <typename... Types>
class ConstRefTuple {
public:
static constexpr int num_vals = sizeof...(Types);
using ValTypes = TypeList<Types...>;
__device__ ConstRefTuple(const Types&... args) : vals_(args...) {}
__device__ ConstRefTuple(const ConstRefTuple& other) : vals_(other.vals_) {}
template <template <typename...> typename TupleType>
__device__ ConstRefTuple(const TupleType<Types...>& other) {
copyTuple(*this, other);
}
// ptr_offset exists only for interface parity with PtrTupleBase; unused.
template <int val_idx>
__device__ const auto& val(nvfuser_index_t ptr_offset = 0) const {
static_assert(val_idx < num_vals, "Out-of-range value index");
return get<val_idx>()(vals_);
}
private:
Tuple<const Types&...> vals_;
};
// Tuple of plain pointers.
template <typename... Types>
using PtrTuple = PtrTupleBase<false, Types...>;
// Tuple of pointers whose pointees are accessed as volatile.
template <typename... Types>
using VolatilePtrTuple = PtrTupleBase<true, Types...>;
// Define a LocalTuple of NumVals values of type Type
// Only sizes 1-8 and 16 are provided; other sizes are left undefined.
template <int NumVals, typename Type>
struct MakeLocalTuple;
template <typename Type>
struct MakeLocalTuple<1, Type> {
using type = LocalTuple<Type>;
};
template <typename Type>
struct MakeLocalTuple<2, Type> {
using type = LocalTuple<Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<3, Type> {
using type = LocalTuple<Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<4, Type> {
using type = LocalTuple<Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<5, Type> {
using type = LocalTuple<Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<6, Type> {
using type = LocalTuple<Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<7, Type> {
using type = LocalTuple<Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<8, Type> {
using type = LocalTuple<Type, Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<16, Type> {
using type = LocalTuple<
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type>;
};
// Define a RefTuple of NumVals references of type Type.
// Only sizes 1-8 and 16 are provided; other sizes are left undefined.
template <int NumVals, typename Type>
struct MakeRefTuple;
template <typename Type>
struct MakeRefTuple<1, Type> {
using type = RefTuple<Type>;
};
template <typename Type>
struct MakeRefTuple<2, Type> {
using type = RefTuple<Type, Type>;
};
template <typename Type>
struct MakeRefTuple<3, Type> {
using type = RefTuple<Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<4, Type> {
using type = RefTuple<Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<5, Type> {
using type = RefTuple<Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<6, Type> {
using type = RefTuple<Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<7, Type> {
using type = RefTuple<Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<8, Type> {
using type = RefTuple<Type, Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<16, Type> {
using type = RefTuple<
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type>;
};
// Define a ConstRefTuple of NumVals const references of type Type.
// Only sizes 1-8 and 16 are provided; other sizes are left undefined.
template <int NumVals, typename Type>
struct MakeConstRefTuple;
template <typename Type>
struct MakeConstRefTuple<1, Type> {
using type = ConstRefTuple<Type>;
};
template <typename Type>
struct MakeConstRefTuple<2, Type> {
using type = ConstRefTuple<Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<3, Type> {
using type = ConstRefTuple<Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<4, Type> {
using type = ConstRefTuple<Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<5, Type> {
using type = ConstRefTuple<Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<6, Type> {
using type = ConstRefTuple<Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<7, Type> {
using type = ConstRefTuple<Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<8, Type> {
using type = ConstRefTuple<Type, Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<16, Type> {
using type = ConstRefTuple<
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type>;
};
// Define a VolatilePtrTuple of NumVals pointers of type Type.
// Only sizes 1-8 and 16 are provided; other sizes are left undefined.
template <int NumVals, typename Type>
struct MakeVolatilePtrTuple;
template <typename Type>
struct MakeVolatilePtrTuple<1, Type> {
using type = VolatilePtrTuple<Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<2, Type> {
using type = VolatilePtrTuple<Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<3, Type> {
using type = VolatilePtrTuple<Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<4, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<5, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<6, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<7, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<8, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<16, Type> {
using type = VolatilePtrTuple<
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type>;
};
// Utility definitions. Currently only used with LocalTuple
// Recursive element-wise application of a binary functor:
// result[i] = func(lhs[i], rhs[i]) for i in [0, idx).
template <int idx, typename BinaryFunc, typename... DataTypes>
struct TupleBinaryOp {
static __inline__ __device__ void apply(
BinaryFunc func,
const LocalTuple<DataTypes...>& lhs,
const LocalTuple<DataTypes...>& rhs,
LocalTuple<DataTypes...>& result) {
TupleBinaryOp<idx - 1, BinaryFunc, DataTypes...>::apply(
func, lhs, rhs, result);
result.val<idx - 1>(0) = func(lhs.val<idx - 1>(0), rhs.val<idx - 1>(0));
}
};
// Recursion base case: no elements left to process.
template <typename BinaryFunc, typename... DataTypes>
struct TupleBinaryOp<0, BinaryFunc, DataTypes...> {
static __inline__ __device__ void apply(
BinaryFunc func,
const LocalTuple<DataTypes...>& lhs,
const LocalTuple<DataTypes...>& rhs,
LocalTuple<DataTypes...>& result) {}
};
// Element-wise application of a binary functor over two LocalTuples:
// returns a tuple t with t[i] = func(lhs[i], rhs[i]) for every entry.
template <typename BinaryFunc, typename... DataTypes>
__inline__ __device__ LocalTuple<DataTypes...> apply(
    BinaryFunc func,
    const LocalTuple<DataTypes...>& lhs,
    const LocalTuple<DataTypes...>& rhs) {
  // Every entry of the result is written by TupleBinaryOp, so no
  // initialization of `result` is needed beforehand.
  LocalTuple<DataTypes...> result;
  TupleBinaryOp<sizeof...(DataTypes), BinaryFunc, DataTypes...>::apply(
      func, lhs, rhs, result);
  return result;
}
// Element-wise logical AND of two bool LocalTuples.
template <typename... BoolTypes>
__inline__ __device__ LocalTuple<BoolTypes...> operator&&(
const LocalTuple<BoolTypes...>& lhs,
const LocalTuple<BoolTypes...>& rhs) {
return apply([](bool x, bool y) { return x && y; }, lhs, rhs);
}
// AND of a scalar bool with a bool LocalTuple: broadcasts lhs first.
template <typename... BoolTypes>
__inline__ __device__ LocalTuple<BoolTypes...> operator&&(
bool lhs,
const LocalTuple<BoolTypes...>& rhs) {
LocalTuple<BoolTypes...> lhs_tuple;
setTuple(lhs_tuple, lhs);
return lhs_tuple && rhs;
}
// AND of a bool LocalTuple with a scalar bool: broadcasts rhs first.
template <typename... BoolTypes>
__inline__ __device__ LocalTuple<BoolTypes...> operator&&(
const LocalTuple<BoolTypes...>& lhs,
bool rhs) {
LocalTuple<BoolTypes...> rhs_tuple;
setTuple(rhs_tuple, rhs);
return lhs && rhs_tuple;
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Basically just blockDim, but wrapped as a struct so that we have a mechanism
// to know at compile time whether we are just using blockDim or some custom
// value. For a kernel without warp specialization, we just use blockDim, but
// for a kernel with warp specialization, we use a custom block_dim whose
// dimensions are the dimensions of the compute warps.
struct DefaultBlockDim {
const uint32_t x, y, z;
// Snapshot blockDim at construction time.
__device__ DefaultBlockDim() : x(blockDim.x), y(blockDim.y), z(blockDim.z) {}
// Implicit conversion to dim3 returns the built-in blockDim.
__device__ operator dim3() const {
return blockDim;
}
};
// Default block synchronization. Just use __barrier_sync
namespace block_sync {
// No setup needed for the default barrier-based sync.
__forceinline__ __device__ void init() {}
// Thread-block synchronization
// - aligned: all threads reach this call, so plain __syncthreads() works.
// - Default block_dim: sync the whole block on barrier 0.
// - Custom block_dim (warp specialization): sync only the
//   block_dim.x * block_dim.y * block_dim.z participating threads.
template <bool aligned, typename BlockDimT>
__forceinline__ __device__ void sync(BlockDimT block_dim) {
if constexpr (aligned) {
__syncthreads();
} else if constexpr (std::is_same_v<BlockDimT, DefaultBlockDim>) {
__barrier_sync(0);
} else {
uint32_t num_threads = block_dim.x * block_dim.y * block_dim.z;
asm volatile("bar.sync 0, %0;" : : "r"(num_threads) : "memory");
}
}
} // namespace block_sync
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace grid_sync {
// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))
// Read a value through a volatile reference so the load is re-issued on
// every call instead of being cached in a register (used in polling loops).
template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
return global_val;
}
// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be invoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool PERSISTENT,
bool Aligned,
typename BlockDimT>
__device__ void sync(
int64_t& semaphore,
const uint64_t& segment_size,
const bool last_block,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim) {
// Finish all global memory transactions before synchronizing
__threadfence();
// Synchronize all threads in a block before synchronizing blocks
block_sync::sync<Aligned>(block_dim);
// Only allow linear_tid == 0 to participate in the synchronization
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
// Get increment value, only want a single block to have the large
// increment, doesn't really matter which one, the goal is to flip/flop the
// first bit of a uint64_t value, since our semaphores are actually int64_t
// we will just reinterpret_cast it to act as a uint64_t
uint64_t semaphore_increment = 1;
// Makes the assumption that blocks are in increasing order, this is not
// guaranteed by CUDA but this is the current behavior, and unlikely to
// change.
if (last_block) {
semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
}
uint64_t oldArrive =
atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);
// If for persistent kernels, lock all blocks until the semaphore has been
// reached. Make sure we access semaphore as a volatile address so we get
// the global memory updates.
unsigned int ns = 8;
// Spin until the top bit has flipped relative to the arrival value, i.e.
// all segment_size blocks have arrived.
while ((PERSISTENT || last_block) &&
((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
0) {
// Put a sleep here so we have some breaks in probing the global
// semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
// __nanosleep only available on compute capability 7.0 or higher
__nanosleep(ns); // avoids busy waiting
if (ns < 256) {
ns *= 2;
}
#endif
}
}
// Sync block to make sure all other threads are waiting on the sync
block_sync::sync<Aligned>(block_dim);
}
// Convenience overload: determines whether this block is the last one in
// its sync segment and forwards to the main sync implementation above.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool PERSISTENT,
    bool Aligned,
    typename BlockDimT>
__device__ void sync(
    int64_t& semaphore,
    const uint64_t& segment_size,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  const bool is_last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT, Aligned>(
      semaphore, segment_size, is_last_block, block_dim);
}
// Grid sync that can be called multiple times in the same kernel without all
// blocks being resident on device. This allows grid sync to be called multiple
// times as long as it's not broadcasted on the parallel axis it was reduced on.
//
// n_entrances is how many times every block is expected to enter into this
// function. All blocks must enter n_entrances times. The last block is only
// allowed to proceed once all other blocks have entered n_entrance
// times.
//
// Note that this is not currently used by grid and welford reduction
// as they use a separate sync flag for each grid sync call.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool Aligned,
    typename BlockDimT>
__device__ void sync(
    int64_t& semaphore,
    // Unused in this overload (kept for interface parity with the other
    // sync overloads); the segment size is recomputed from gridDim below.
    const uint64_t& segment_size,
    const nvfuser_index_t n_entrances,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // Finish all global memory transactions before synchronizing
  __threadfence();
  // Synchronize all threads in a block before synchronizing blocks
  block_sync::sync<Aligned>(block_dim);
  // Only allow linear_tid == 0 to participate in the synchronization
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // Makes the assumption that blocks are in increasing order, this is not
    // guaranteed by CUDA but this is the current behavior, and unlikely to
    // change.
    bool last_block =
        index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    if (last_block) {
      // The last block may proceed only after every other block has entered
      // n_entrances times, i.e. the semaphore holds
      // (num_blocks - 1) * n_entrances.
      int64_t finished_val =
          ((int64_t)(index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(
                         gridDim) -
                     1)) *
          ((int64_t)n_entrances);
      unsigned int ns = 8;
      // Last block needs to wait for all other blocks to finish
      while (globalAsVolatile(semaphore) < finished_val) {
#if __CUDA_ARCH__ >= 700
        // __nanosleep only available on compute capability 7.0 or higher
        __nanosleep(ns); // avoids busy waiting
        if (ns < 256) {
          ns *= 2;
        }
#endif
      }
    } else {
      // Non-last blocks just announce their arrival; the return value of
      // atomicAdd is not needed (previously stored into an unused local).
      atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), 1);
    }
  }
  // Sync block to make sure all other threads are waiting on the sync
  block_sync::sync<Aligned>(block_dim);
}
// Non-blocking function to read the semaphore value in each calling thread
// Uses an acquire load at GPU scope so writes released by other blocks
// (see semaphoreRelease) are visible after the read.
__device__ int64_t semaphoreFetch(int64_t* semaphore) {
int64_t state;
// NOTE: acquire/release operations require sm_70 or higher
// https://docs.nvidia.com/cuda/archive/12.3.0/parallel-thread-execution/index.html#scopes-and-applicability
asm volatile("ld.global.acquire.gpu.b64 %0, [%1];\n"
: "=l"(state)
: "l"(semaphore));
return state;
}
// Non-blocking function to set semaphore to new_value
// Only thread (0, 0, 0) performs the store, using a release store at GPU
// scope so prior writes become visible to blocks that acquire the value.
__device__ void semaphoreRelease(int64_t* semaphore, int64_t new_value) {
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
// NOTE: acquire/release operations require sm_70 or higher
// https://docs.nvidia.com/cuda/archive/12.3.0/parallel-thread-execution/index.html#scopes-and-applicability
asm volatile("st.global.release.gpu.b64 [%0], %1;\n"
:
: "l"(semaphore), "l"(new_value));
}
}
// First thread waits until fetched semaphore value matches trigger
// Other threads return immediately; callers are expected to follow with a
// block-wide sync before relying on the trigger (see blockSerializeWait).
__device__ void semaphoreWait(int64_t* semaphore, int64_t trigger_value) {
int64_t status = -1;
// Cutlass uses a loop like this, and has a facility where any thread can
// fetch the semaphore value ahead of waiting. This could reduce the wait
// time potentially but requires placement of the early fetch.
// https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/semaphore.h
// while (__syncthreads_and(status != trigger_value)) {
// As soon as any thread in the block observes the trigger then it is
// safe to proceed
// Instead, we simply use the first thread in the block to do busy waiting.
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
while (status != trigger_value) {
status = semaphoreFetch(semaphore);
}
}
}
// Serialize blocks in segments indicated by the [XYZ]_BLOCK template arguments.
// This should be called at the beginning of the section to be serialized.
// Assumes semaphore is initialized to zero. This function always synchronizes
// the thread block.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
__device__ void blockSerializeWait(int64_t* semaphore) {
  int block_idx_in_segment =
      index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  // Block 0 of the segment proceeds immediately; every other block waits
  // until the preceding block releases the semaphore with this block's
  // index (see blockSerializeRelease).
  if (block_idx_in_segment > 0) {
    semaphoreWait(semaphore, block_idx_in_segment);
  }
  // Hold all threads of the block until thread 0 has observed the trigger.
  __syncthreads();
}
// Serialize blocks in segments indicated by the [XYZ]_BLOCK template arguments.
// This should be called at the end of the section to be serialized.
// This function always cleans up the semaphore; i.e. the last block writes the
// value 0 to the semaphore when complete. This function always synchronizes
// the thread block.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
__device__ void blockSerializeRelease(int64_t* semaphore) {
int segment_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
int block_idx_in_segment =
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
bool last_block = block_idx_in_segment == segment_size - 1;
// Block until writes from all threads in this block are visible to all other
// blocks before releasing semaphore using thread 0.
//
// Consider this simple example using two blocks:
//
// 1. Block 1 acquires lock using blockSerializeWait
// 2. Block 1 writes values to tensor T3
// 3. Block 1 releases lock using blockSerializeRelease
// 4. Block 2 acquires lock using blockSerializeWait
// 5. Block 2 uses values in tensor T3 to compute new values and writes them
// back to T3.
// 6. Block 2 releases lock using blockSerializeRelease
//
// Without a global thread fence, the writes to T3 from Block 1 in step 2
// might not be visible to Block 2 at step 5, meaning Block 2 would compute
// an invalid update.
//
// We use __syncthreads also, which implies a __threadfence_block but that
// only guarantees that all writes are visible to threads _within the same
// block_, so the __threadfence is still needed.
__threadfence();
__syncthreads();
// Hand the lock to the next block in the segment; the last block resets
// the semaphore to 0 so it can be reused.
semaphoreRelease(semaphore, last_block ? 0 : block_idx_in_segment + 1);
}
} // namespace grid_sync
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Reference:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#asynchronous-barrier
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier
// https://github.com/NVIDIA/cutlass/blob/main/include/cute/arch/copy_sm90_desc.hpp
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
namespace mbarrier {
// Initialize the mbarrier at the given shared-memory address with the
// expected arrival count (default 1). smem_barrier_ptr is a 32-bit
// shared-memory address.
__device__ inline void init(
uint32_t smem_barrier_ptr,
uint32_t thread_count = 1) {
asm volatile(
"mbarrier.init.shared.b64 [%0], %1;\n" ::"r"(smem_barrier_ptr),
"r"(thread_count));
}
// Invalidate the mbarrier so its shared-memory storage can be reused.
__device__ inline void inval(uint32_t smem_barrier_ptr) {
asm volatile("mbarrier.inval.shared.b64 [%0];\n" ::"r"(smem_barrier_ptr));
}
// Arrive on the mbarrier; returns the arrival state token consumed by
// wait() below.
__device__ inline uint64_t arrive(uint32_t smem_barrier_ptr) {
volatile uint64_t state;
asm volatile("mbarrier.arrive.shared.b64 %0, [%1];\n"
: "=l"(state)
: "r"(smem_barrier_ptr));
return state;
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
// Arrive and additionally register an expected transaction count
// (expect_tx); requires SM90+.
__device__ inline uint64_t arriveExpectTX(
uint32_t smem_barrier_ptr,
uint32_t tx_count) {
volatile uint64_t state;
asm volatile("mbarrier.arrive.expect_tx.shared.b64 %0, [%1], %2;\n"
: "=l"(state)
: "r"(smem_barrier_ptr), "r"(tx_count));
return state;
}
// Arrive on an mbarrier located in CTA cta_id of the cluster (the local
// shared address is mapped to the remote CTA first); requires SM90+.
__device__ inline void arrive(uint32_t smem_barrier_ptr, uint32_t cta_id) {
asm volatile(
"{.reg .b32 remaddr32;\n"
"mapa.shared::cluster.u32 remaddr32, %0, %1;\n"
"mbarrier.arrive.shared::cluster.b64 _, [remaddr32];\n"
"}"
:
: "r"(smem_barrier_ptr), "r"(cta_id));
}
#endif
// Block until the mbarrier phase associated with `state` completes.
// SM90+ loops on try_wait; older archs poll test_wait with a nanosleep.
__device__ inline void wait(uint32_t smem_barrier_ptr, uint64_t state) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile(
"{\n"
".reg .pred complete;\n"
"waitLoop:\n"
"mbarrier.try_wait.shared.b64 complete, [%0], %1;\n"
"@!complete bra waitLoop;\n"
"}\n" ::"r"(smem_barrier_ptr),
"l"(state));
#else
asm volatile(
"{\n"
".reg .pred P1;\n"
"LAB_WAIT:\n"
"mbarrier.test_wait.shared.b64 P1, [%0], %1;\n"
"@P1 bra.uni DONE;\n"
"nanosleep.u32 20;\n"
"bra.uni LAB_WAIT;\n"
"DONE:\n"
"}\n" ::"r"(smem_barrier_ptr),
"l"(state));
#endif
}
// Same as wait() but keyed on the phase parity bit rather than an arrival
// state token.
__device__ inline void waitParity(uint32_t smem_barrier_ptr, uint32_t parity) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile(
"{\n"
".reg .pred complete;\n"
"waitLoop:\n"
"mbarrier.try_wait.parity.shared.b64 complete, [%0], %1;\n"
"@!complete bra waitLoop;\n"
"}\n" ::"r"(smem_barrier_ptr),
"r"(parity));
#else
asm volatile(
"{\n"
".reg .pred P1;\n"
"LAB_WAIT:\n"
"mbarrier.test_wait.parity.shared.b64 P1, [%0], %1;\n"
"@P1 bra.uni DONE;\n"
"nanosleep.u32 20;\n"
"bra.uni LAB_WAIT;\n"
"DONE:\n"
"}\n" ::"r"(smem_barrier_ptr),
"r"(parity));
#endif
}
} // namespace mbarrier
#endif // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
// EXAMPLE USAGE:
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
// (output[output_index], inputs[input_index],
// [] __device__ (T& a, const T b) { a += b; });
// Intra-block tree reduction of one scalar per thread.
//
// Each thread contributes inp_val when read_pred is true, otherwise
// init_val. The result is ACCUMULATED into "out" via reduction_op (out is
// read first, then combined), and only by threads at offset 0 of every
// reduction dimension whose write_pred is true. shared_mem must provide one
// T per thread of the block (indexed by the thread's linear id). All threads
// of the block must call this function: it contains block_sync::sync calls.
// Aligned is forwarded to block_sync::sync.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    bool Aligned,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(threadIdx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          threadIdx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          threadIdx, block_dim);
  // number of reductions per block
  unsigned int reduction_num =
      index_utils::maskedSize<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(block_dim);
  // smem_offset is the offset into shared memory for the current thread.
  // To ensure coalesced access to shared memory, we need to ensure
  // each transaction is accessing a contiguous block of 128 bytes.
  // For outer reduction where TIDy is in the reduction dimension and TIDx
  // is in the iteration dimension and TIDz is not used. We have
  // reduction_tid = TIDy and reduction_idx = TIDx. If we directly use the
  // offset based on reduction_tid and reduction_idx, we will have stride
  // access to shared memory. For example:
  // offset = reduction_idx * reduction_size + reduction_tid
  //        = TIDx * blockDim.y + TIDy
  // To avoid this, we should always use the offset based on the indexing of
  // threads within a block.
  // Offset into smem for the current thread
  unsigned int smem_offset = threadIdx.x + threadIdx.y * block_dim.x +
      threadIdx.z * block_dim.x * block_dim.y;
  // The peer stride represents the distance between the current element and its
  // nearest reduction peer. It depends on the reduction dimension. A reduction
  // peer refers to elements that belong to the same reduction segment. For
  // example, if the reduction is across TIDy, all the elements in the same
  // column (with the same TIDx) are considered peers of each other. The
  // distance between an element and its nearest peer is block_dim.x.
  constexpr int num_redu_dims = (int)X_REDUCE + (int)Y_REDUCE + (int)Z_REDUCE;
  constexpr bool xz_reduce = (num_redu_dims == 2 && !Y_REDUCE);
  // reduction in 3 dimensions, XYZ, stride is 1
  unsigned int peer_stride = 1;
  if (num_redu_dims == 1) {
    // Reduction only in 1 dimension, X or Y or Z
    // e.g. inner or outer reduction
    // If X_REDUCE, reducing in neighbor cols in smem, peer_stride is 1
    // If Y_REDUCE, reducing in neighbor rows in smem, peer_stride is
    // block_dim.x If Z_REDUCE, reducing in neighbor planes in smem, peer_stride
    // is block_dim.x * block_dim.y
    peer_stride = X_REDUCE ? 1
        : Y_REDUCE         ? block_dim.x
                           : block_dim.x * block_dim.y;
  } else if (num_redu_dims == 2) {
    // Reduction in 2 dimensions, only one dimension is not reduced, !X, !Y, !Z
    // If !Z_REDUCE, merge XY, reducing neighbor cols, peer_stride is 1
    // If !X_REDUCE, merge ZY, reducing neighbor rows, peer_stride is
    // block_dim.x If !Y_REDUCE, if block_dim.y == 1, merge XZ, peer_stride
    // is 1. otherwise, needs carefully calculate offset to the reduction peer:
    // (1) redu_offset = reduction_tid + tree_fold_factor
    // (2) idz = redu_offset / block_dim.x
    // (3) idx = redu_offset % block_dim.x
    // (4) smem_offset = idx + threadIdx.y * block_dim.x + idz * block_dim.x *
    // block_dim.y
    if (!Y_REDUCE) {
      peer_stride = 1;
    } else {
      peer_stride = !Z_REDUCE ? 1 : block_dim.x;
    }
  }
  // Initialize shared memory
  if (read_pred) {
    shared_mem[smem_offset] = inp_val;
  } else {
    shared_mem[smem_offset] = init_val;
  }
  block_sync::sync<Aligned>(block_dim);
  // Reduce down to nearest power of 2 for the tree reduction:
  // np2 is the largest power of 2 <= reduction_size; the first step folds the
  // tail [np2, reduction_size) onto [0, np2).
  int np2 = 1 << (31 - __clz(reduction_size));
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    int peer_offset = smem_offset + np2 * peer_stride;
    if constexpr (xz_reduce) {
      // Merged XZ reduction with a non-trivial Y dimension: the peer's linear
      // smem offset must be recomputed from its (idx, idz) coordinates.
      if (block_dim.y > 1) {
        int redu_offset = reduction_tid + np2;
        int idz = redu_offset / block_dim.x;
        int idx = redu_offset % block_dim.x;
        peer_offset =
            idx + threadIdx.y * block_dim.x + idz * block_dim.x * block_dim.y;
      }
    }
    reduction_op(shared_mem[smem_offset], shared_mem[peer_offset]);
  }
  block_sync::sync<Aligned>(block_dim);
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      int peer_offset = smem_offset + factor * peer_stride;
      if constexpr (xz_reduce) {
        if (block_dim.y > 1) {
          int redu_offset = reduction_tid + factor;
          int idz = redu_offset / block_dim.x;
          int idx = redu_offset % block_dim.x;
          peer_offset =
              idx + threadIdx.y * block_dim.x + idz * block_dim.x * block_dim.y;
        }
      }
      reduction_op(shared_mem[smem_offset], shared_mem[peer_offset]);
    }
    block_sync::sync<Aligned>(block_dim);
  }
  // Peeled final step (factor == 1): combine out with the thread's own smem
  // slot and, when more than one value was reduced, its immediate peer.
  if (should_write && write_pred) {
    T result = out;
    reduction_op(result, shared_mem[smem_offset]);
    if (reduction_size > 1) {
      reduction_op(result, shared_mem[smem_offset + peer_stride]);
    }
    out = result;
  }
  block_sync::sync<Aligned>(block_dim);
}
// Use the same pred for both reads and writes
// Convenience overload of blockReduce that uses one predicate for both the
// read of inp_val and the write of out.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    bool Aligned,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void blockReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    T* shared_mem,
    bool read_write_pred,
    T init_val,
    // block_dim is blockDim (wrapped as DefaultBlockDim) when the kernel has
    // no warp specialization; otherwise it is the dimension of the compute
    // warps.
    BlockDimT block_dim) {
  // Forward to the general form, duplicating the shared predicate.
  blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, Aligned, T, Func, BlockDimT>(
      out,
      inp_val,
      reduction_op,
      shared_mem,
      /*read_pred=*/read_write_pred,
      /*write_pred=*/read_write_pred,
      init_val,
      block_dim);
}
// Each thread in the iteration dimension processes N elements
// Typical usage is in outer reduction where the iteration dimension
// is parallelized by vectorized loads, bdimx. The reduction dimension
// is parallelized by bdimy. This function works as follows:
// (1) Each thread vectorized loads N elements from input register array to
// smem. (2) do N * bdimx parallel reductions in smem.
// Grouped TIDy reduction: every thread holds N values (one register array)
// and N independent reductions are performed across blockDim.y in parallel.
//
// The per-thread array is staged to shared memory in a
// [N/elements_per_load, TIDy, TIDx, elements_per_load] layout so that each
// vectorized load/store is at most 16 bytes and warps access contiguous
// rows. The results are accumulated into out[] via reduction_op by threads
// with threadIdx.y == 0 whose write_pred is true. shared_mem must hold
// block_dim.x * block_dim.y * N elements of T. All threads of the block
// must call this function (it contains block_sync::sync calls).
//
// NOTE(review): read_pred and init_val are accepted but never referenced in
// this body; inp_val is always loaded. Presumably callers only emit this
// when the read predicate is known true — verify against the codegen that
// produces calls to this function.
template <
    bool Aligned,
    int N, // Number of elements per input array
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void blockIterGroupedYdimReduce(
    T out[N],
    const T inp_val[N],
    Func reduction_op,
    T* shared_mem,
    bool read_pred,
    bool write_pred,
    T init_val,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // N should be a valid vectorization factor
  static_assert(
      N == 2 || N == 4 || N == 8 || N == 16,
      "N should be a valid vectorization factor, one of (2, 4, 8, 16)!");
  bool should_write = threadIdx.y == 0;
  unsigned int reduction_size = block_dim.y;
  unsigned int reduction_tid = threadIdx.y;
  // In shared memory, each row has 128 bytes, if sizeof(T) * N = 32 bytes, each
  // row has 128 / 32 = 4 threads. Each transaction can only load data from one
  // row, with a max of 16 bytes per thread. So the total bytes per transaction
  // is 4 x 16 = 64 bytes which is only half of the maximum 128 bytes per
  // transaction. we should change the layout from [TIDy, TIDx, N] to [N/4,
  // TIDy, TIDx, 4]
  constexpr unsigned int array_bytes = sizeof(T) * N;
  // Number of <=16-byte chunks each thread's array is split into (at least 1).
  constexpr unsigned int total_loads =
      array_bytes / 16 > 1 ? array_bytes / 16 : 1;
  // Elements per vectorized load: min(N, 16 / sizeof(T)).
  constexpr unsigned int elements_per_load =
      16 / sizeof(T) > N ? N : 16 / sizeof(T);
  constexpr unsigned int align_size = array_bytes > 16 ? 16 : array_bytes;
  // assume TIDy is the reduction dimension, TIDx is the iteration dimension
  // TIDz is not used
  unsigned int peer_stride = elements_per_load * block_dim.x;
  unsigned int smem_offset_inter =
      block_dim.x * block_dim.y * elements_per_load;
  unsigned int smem_offset_intra =
      (threadIdx.y * block_dim.x + threadIdx.x) * elements_per_load;
// load to [total_loads] sections of shared memory
#pragma unroll
  for (unsigned int i = 0; i < total_loads; ++i) {
    loadGeneric<T, elements_per_load>(
        shared_mem + smem_offset_inter * i + smem_offset_intra,
        const_cast<T*>(inp_val) + i * elements_per_load);
  }
  block_sync::sync<Aligned>(block_dim);
  // Reduce down to nearest power of 2 for the tree reduction:
  // Perform parallel reduction for each element in the array
  int np2 = 1 << (31 - __clz(reduction_size));
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    // vectorized load from smem to regs
    __align__(align_size) T self[N];
    __align__(align_size) T peer[N];
#pragma unroll
    for (unsigned int i = 0; i < total_loads; ++i) {
      int self_offset = smem_offset_inter * i + smem_offset_intra;
      int peer_offset = self_offset + np2 * peer_stride;
      loadGeneric<T, elements_per_load>(
          self + i * elements_per_load, shared_mem + self_offset);
      loadGeneric<T, elements_per_load>(
          peer + i * elements_per_load, shared_mem + peer_offset);
    }
// reduction
#pragma unroll
    for (int i = 0; i < N; ++i) {
      reduction_op(self[i], peer[i]);
    }
// write self back to smem
#pragma unroll
    for (unsigned int i = 0; i < total_loads; ++i) {
      int self_offset = smem_offset_inter * i + smem_offset_intra;
      loadGeneric<T, elements_per_load>(
          shared_mem + self_offset, self + i * elements_per_load);
    }
  }
  block_sync::sync<Aligned>(block_dim);
  // Tree reduction
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      // vectorized load from smem to regs
      __align__(align_size) T self[N];
      __align__(align_size) T peer[N];
#pragma unroll
      for (unsigned int i = 0; i < total_loads; ++i) {
        int self_offset = smem_offset_inter * i + smem_offset_intra;
        int peer_offset = self_offset + factor * peer_stride;
        loadGeneric<T, elements_per_load>(
            self + i * elements_per_load, shared_mem + self_offset);
        loadGeneric<T, elements_per_load>(
            peer + i * elements_per_load, shared_mem + peer_offset);
      }
// reduction
#pragma unroll
      for (int i = 0; i < N; ++i) {
        reduction_op(self[i], peer[i]);
      }
// write self back to smem
#pragma unroll
      for (unsigned int i = 0; i < total_loads; ++i) {
        int self_offset = smem_offset_inter * i + smem_offset_intra;
        loadGeneric<T, elements_per_load>(
            shared_mem + self_offset, self + i * elements_per_load);
      }
    }
    block_sync::sync<Aligned>(block_dim);
  }
  // last reduction (peeled factor == 1 step), done only by writer threads
  if (should_write && write_pred) {
    // init result
    __align__(align_size) T result[N];
#pragma unroll
    for (int i = 0; i < N; ++i) {
      result[i] = out[i];
    }
    // copy first element to result
    __align__(align_size) T self[N];
#pragma unroll
    for (unsigned int i = 0; i < total_loads; ++i) {
      int self_offset = smem_offset_inter * i + smem_offset_intra;
      loadGeneric<T, elements_per_load>(
          self + i * elements_per_load, shared_mem + self_offset);
    }
#pragma unroll
    for (int i = 0; i < N; ++i) {
      reduction_op(result[i], self[i]);
    }
    // reduction of the 2nd last element
    if (reduction_size > 1) {
      __align__(align_size) T peer[N];
#pragma unroll
      for (unsigned int i = 0; i < total_loads; ++i) {
        int peer_offset =
            smem_offset_inter * i + smem_offset_intra + peer_stride;
        loadGeneric<T, elements_per_load>(
            peer + i * elements_per_load, shared_mem + peer_offset);
      }
#pragma unroll
      for (int i = 0; i < N; ++i) {
        reduction_op(result[i], peer[i]);
      }
    }
#pragma unroll
    for (int i = 0; i < N; ++i) {
      out[i] = result[i];
    }
  }
  block_sync::sync<Aligned>(block_dim);
}
// Use the same pred for both reads and writes
// Convenience overload of blockIterGroupedYdimReduce that uses one predicate
// for both the read and write side.
template <
    bool Aligned,
    int N, // Number of elements per input array
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void blockIterGroupedYdimReduce(
    T out[N],
    const T inp_val[N],
    Func reduction_op,
    T* shared_mem,
    bool read_write_pred,
    T init_val,
    // block_dim is blockDim (wrapped as DefaultBlockDim) when the kernel has
    // no warp specialization; otherwise it is the dimension of the compute
    // warps.
    BlockDimT block_dim) {
  // Forward to the general form, duplicating the shared predicate.
  blockIterGroupedYdimReduce<Aligned, N, T, Func, BlockDimT>(
      out,
      inp_val,
      reduction_op,
      shared_mem,
      /*read_pred=*/read_write_pred,
      /*write_pred=*/read_write_pred,
      init_val,
      block_dim);
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reductions are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
// OUT(thread_idx, block_idx_out) =
// Reduction of IN(thread_idx, block_idx) for
// all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.
namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool Aligned,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void gridReduceLastBlock(
    T& out,
    const volatile T* in,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_reduction_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val,
    // block_dim is blockDim (wrapped as DefaultBlockDim) when the kernel has
    // no warp specialization; otherwise it is the dimension of the compute
    // warps.
    BlockDimT block_dim) {
  // "in" holds one entry per (input block, participating thread). The
  // "participating" thread dimensions (X/Y/Z_THREAD true) each own one
  // output; the remaining dimensions act as workers that stride over the
  // grid-reduce inputs in parallel.
  // Position of this thread within its reduction block.
  const auto rblock_offset =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, block_dim);
  // Worker id and worker count among the non-participating dimensions.
  const auto worker_id =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, block_dim);
  const auto num_workers =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
  // Serially fold a strided slice of the input buffer into a per-thread
  // partial value.
  T partial = init_val;
  for (nvfuser_index_t reduction_i = worker_id;
       reduction_i < grid_reduction_segment_size;
       reduction_i += num_workers) {
    reduction_op(
        partial,
        in[reduction_i * block_reduction_segment_size + rblock_offset]);
  }
  // Combine the worker partials within the block. All threads must reach
  // this call (it synchronizes internally).
  T block_result = init_val;
  blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
      block_result, partial, reduction_op, shared_buf, true, init_val,
      block_dim);
  // Only the thread at offset 0 of every non-participating dimension holds
  // the final value for its reduction-block slot.
  const bool holds_result = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (holds_result && write_pred) {
    reduction_op(out, block_result);
  }
}
// Reduces per-thread values across threads and thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK/THREAD: When true, reduces across thread blocks along the X/Y/Z
// dimensions
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
// the result of the grid reduction will be broadcasted and used across the
// grid. These requires cross grid communication and the grid synchronizations
// here to actually synchronize across the entire grid. When false the grid is
// not synchronized, the last block just waits for everyone else to finish and
// the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalar reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effectively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD also works similarly as X/Y/Z_BLOCK and defines a
// group of threads that are reduced together.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
// entrance_ind and n_entrances are allowed when PERSISTENT_REDUCTION = false.
// If a grid reduction call is only called once per thread, entrance_ind == 0
// and n_entrances == 1. However, grid reduction can be called in a loop in a
// thread, in that case entrance_ind is the count of times the function has been
// called, and n_entrances is the total number of times it will be called.
// Cross-block (grid) reduction. See the long comment above for the meaning
// of X/Y/Z_BLOCK, X/Y/Z_THREAD and PERSISTENT_REDUCTION.
//
// Phases:
//   1. Optional intra-block reduction (when any of X/Y/Z_THREAD is true).
//   2. One thread per reduction block writes its partial to work_buf.
//   3. grid_sync across the reduction segment.
//   4. The last block of the segment finishes via gridReduceLastBlock.
//   5. For persistent reductions, a second grid_sync so work_buf can be
//      safely reused by the next iteration.
// All threads of all participating blocks must call this function.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool Aligned,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    int64_t* sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val,
    const nvfuser_index_t entrance_ind,
    const nvfuser_index_t n_entrances,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  T block_reduction_val = init_val;
  // Do block reduction when required
  if (X_THREAD || Y_THREAD || Z_THREAD) {
    blockReduce<X_THREAD, Y_THREAD, Z_THREAD, Aligned>(
        block_reduction_val,
        inp_val,
        reduction_op,
        shared_buf,
        read_pred,
        true,
        init_val,
        block_dim);
  } else if (read_pred) {
    block_reduction_val = inp_val;
  }
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
  // Number of reductions in the grid
  const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION
      ? 1
      : index_utils::maskedSize<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(gridDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf += (entrance_ind * grid_segment_size + idx_in_grid_segment) *
      grid_reduction_segment_size * block_reduction_segment_size;
  // One thread per reduction block publishes the block partial.
  if ((!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
            threadIdx, block_dim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    work_buf[work_buf_offset] = block_reduction_val;
  }
  if (PERSISTENT_REDUCTION) {
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  } else {
    // Use a different sync flag for each call
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[entrance_ind * grid_segment_size + idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  }
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val,
        block_dim);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  }
}
// This is just a wrapper of the above grid reduction routine to
// measure the elapsed cycles. The measurement must be done just by
// one thread, and in this case it should be done by one of the
// threads in the last thread block.
#ifdef NVFUSER_PROFILE_KERNEL
// Profiling wrapper around gridReduce: measures the elapsed cycles of the
// wrapped call. Timing is done by a single thread — thread (0,0,0) of the
// last block of the grid — which accumulates into cycles/count.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool Aligned,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void gridReduce(
    T& out,
    const T& inp_val,
    Func reduction_op,
    volatile T* work_buf,
    int64_t* sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val,
    const nvfuser_index_t entrance_ind,
    const nvfuser_index_t n_entrances,
    // block_dim is blockDim (wrapped as DefaultBlockDim) when the kernel has
    // no warp specialization; otherwise it is the dimension of the compute
    // warps.
    BlockDimT block_dim,
    int64_t& cycles,
    int64_t& count) {
  // Evaluate the timing-thread predicate once; it is pure in blockIdx /
  // threadIdx so hoisting is safe.
  const bool is_timer_thread =
      index_utils::maskedIsLast<true, true, true>(blockIdx, gridDim) &&
      index_utils::maskedIsZero<true, true, true>(threadIdx);
  int64_t start_counter = 0;
  if (is_timer_thread) {
    start_counter = readCycleCounter();
  }
  gridReduce<
      X_BLOCK,
      Y_BLOCK,
      Z_BLOCK,
      X_THREAD,
      Y_THREAD,
      Z_THREAD,
      PERSISTENT_REDUCTION,
      Aligned,
      T,
      Func>(
      out,
      inp_val,
      reduction_op,
      work_buf,
      sync_flags,
      shared_buf,
      read_pred,
      write_pred,
      init_val,
      entrance_ind,
      n_entrances,
      block_dim);
  if (is_timer_thread) {
    cycles += readCycleCounter() - start_counter;
    ++count;
  }
}
#endif // NVFUSER_PROFILE_KERNEL
// Partial-reduction step shared by the fused grid reductions: reduce within
// the block (when any thread dimension participates) and publish the block's
// partial result to the global work buffer. All threads of the block must
// call this function (blockReduce synchronizes internally).
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool Aligned,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void gridReduce2PartialReduction(
    const T& inp_val,
    T init_val,
    Func reduction_op,
    // block_dim is blockDim (wrapped as DefaultBlockDim) when the kernel has
    // no warp specialization; otherwise it is the dimension of the compute
    // warps.
    BlockDimT block_dim,
    volatile T* work_buf,
    T* shared_buf,
    bool read_pred,
    nvfuser_index_t grid_reduction_segment_size,
    nvfuser_index_t idx_in_grid_segment,
    nvfuser_index_t block_reduction_segment_size) {
  // Per-thread partial after the (optional) intra-block reduction.
  T partial = init_val;
  if (X_THREAD || Y_THREAD || Z_THREAD) {
    blockReduce<X_THREAD, Y_THREAD, Z_THREAD, Aligned>(
        partial,
        inp_val,
        reduction_op,
        shared_buf,
        read_pred,
        true,
        init_val,
        block_dim);
  } else if (read_pred) {
    partial = inp_val;
  }
  // One thread per reduction block writes the partial to global memory.
  const bool is_writer = (!X_THREAD || threadIdx.x == 0) &&
      (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);
  if (is_writer) {
    const auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    const auto thread_offset =
        index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
            threadIdx, block_dim);
    work_buf[block_offset * block_reduction_segment_size + thread_offset] =
        partial;
  }
}
// 2-way horizontally fused grid reduction
// Performs two independent grid reductions (T1/Func1 and T2/Func2) while
// paying for only one round of grid synchronization. Both reductions share
// sync_flags and shared_buf (shared_buf is reused sequentially for each
// type, which is safe because gridReduceLastBlock synchronizes internally).
// All threads of all participating blocks must call this function.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool Aligned,
    typename T1,
    typename Func1,
    typename T2,
    typename Func2,
    typename BlockDimT>
__device__ void gridReduceGroup(
    T1& out1,
    const T1& inp_val1,
    T1 init_val1,
    Func1 reduction_op1,
    volatile T1* work_buf1,
    T2& out2,
    const T2& inp_val2,
    T2 init_val2,
    Func2 reduction_op2,
    volatile T2* work_buf2,
    int64_t* sync_flags,
    void* shared_buf,
    bool read_pred,
    bool write_pred,
    const nvfuser_index_t entrance_ind,
    const nvfuser_index_t n_entrances,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
  // Number of reductions in the grid
  const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION
      ? 1
      : index_utils::maskedSize<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(gridDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf1 += (entrance_ind * grid_segment_size + idx_in_grid_segment) *
      grid_reduction_segment_size * block_reduction_segment_size;
  work_buf2 += (entrance_ind * grid_segment_size + idx_in_grid_segment) *
      grid_reduction_segment_size * block_reduction_segment_size;
  // Block-reduce and publish the partials for both fused reductions before
  // the single grid sync.
  gridReduce2PartialReduction<
      X_BLOCK,
      Y_BLOCK,
      Z_BLOCK,
      X_THREAD,
      Y_THREAD,
      Z_THREAD,
      Aligned>(
      inp_val1,
      init_val1,
      reduction_op1,
      block_dim,
      work_buf1,
      (T1*)shared_buf,
      read_pred,
      grid_reduction_segment_size,
      idx_in_grid_segment,
      block_reduction_segment_size);
  gridReduce2PartialReduction<
      X_BLOCK,
      Y_BLOCK,
      Z_BLOCK,
      X_THREAD,
      Y_THREAD,
      Z_THREAD,
      Aligned>(
      inp_val2,
      init_val2,
      reduction_op2,
      block_dim,
      work_buf2,
      (T2*)shared_buf,
      read_pred,
      grid_reduction_segment_size,
      idx_in_grid_segment,
      block_reduction_segment_size);
  if (PERSISTENT_REDUCTION) {
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  } else {
    // Use a different sync flag for each call
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[entrance_ind * grid_segment_size + idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  }
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // Cleanup with block reduction
    gridReduceLastBlock<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
        out1,
        work_buf1,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op1,
        (T1*)shared_buf,
        write_pred,
        init_val1,
        block_dim);
    gridReduceLastBlock<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
        out2,
        work_buf2,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        reduction_op2,
        (T2*)shared_buf,
        write_pred,
        init_val2,
        block_dim);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  }
}
#ifdef NVFUSER_PROFILE_KERNEL
// Profiling wrapper around gridReduceGroup: measures the elapsed cycles of
// the wrapped call. Timing is done by a single thread — thread (0,0,0) of
// the last block of the grid — which accumulates into cycles/count.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool Aligned,
    typename T1,
    typename Func1,
    typename T2,
    typename Func2,
    typename BlockDimT>
__device__ void gridReduceGroup(
    T1& out1,
    const T1& inp_val1,
    T1 init_val1,
    Func1 reduction_op1,
    volatile T1* work_buf1,
    T2& out2,
    const T2& inp_val2,
    T2 init_val2,
    Func2 reduction_op2,
    volatile T2* work_buf2,
    int64_t* sync_flags,
    void* shared_buf,
    bool read_pred,
    bool write_pred,
    const nvfuser_index_t entrance_ind,
    const nvfuser_index_t n_entrances,
    // block_dim is blockDim (wrapped as DefaultBlockDim) when the kernel has
    // no warp specialization; otherwise it is the dimension of the compute
    // warps.
    BlockDimT block_dim,
    int64_t& cycles,
    int64_t& count) {
  // Evaluate the timing-thread predicate once; it is pure in blockIdx /
  // threadIdx so hoisting is safe.
  const bool is_timer_thread =
      index_utils::maskedIsLast<true, true, true>(blockIdx, gridDim) &&
      index_utils::maskedIsZero<true, true, true>(threadIdx);
  int64_t start_counter = 0;
  if (is_timer_thread) {
    start_counter = readCycleCounter();
  }
  gridReduceGroup<
      X_BLOCK,
      Y_BLOCK,
      Z_BLOCK,
      X_THREAD,
      Y_THREAD,
      Z_THREAD,
      PERSISTENT_REDUCTION,
      Aligned,
      T1,
      Func1,
      T2,
      Func2>(
      out1,
      inp_val1,
      init_val1,
      reduction_op1,
      work_buf1,
      out2,
      inp_val2,
      init_val2,
      reduction_op2,
      work_buf2,
      sync_flags,
      shared_buf,
      read_pred,
      write_pred,
      entrance_ind,
      n_entrances,
      block_dim);
  if (is_timer_thread) {
    cycles += readCycleCounter() - start_counter;
    ++count;
  }
}
#endif // NVFUSER_PROFILE_KERNEL
// Performs one step of a serial grid reduction: combines the incoming
// element "in" with the partial result held in "work". For a serial grid
// reduction, "work" resides in global memory while "in" and "out" live in
// registers.
//
// When write_pred is false this is a no-op. When read_pred is false, "init"
// is substituted for "in".
//
// Unless first_step is true, the previous partial is loaded from "work" and
// combined via reduction_op. The combined value is written back to "work"
// except on the final step (last_step == true).
template <int64_t vec_size, typename T, typename Func>
__device__ void serialReductionStep(
    T* out,
    T* in,
    T init,
    volatile T* work,
    Func reduction_op,
    bool first_step,
    bool last_step,
    bool read_pred,
    bool write_pred) {
  if (!write_pred) {
    return;
  }
  // Seed "out" either from the input or with the init value.
  if (read_pred) {
    loadGeneric<T, vec_size>(out, in);
  } else {
#pragma unroll
    for (int elem = 0; elem < vec_size; ++elem) {
      out[elem] = init;
    }
  }
  // Fold in the running partial kept in global memory, except on the very
  // first step where nothing has been written yet.
  if (!first_step) {
    T prev[vec_size];
    loadGlobalToLocal<T, vec_size, true, CacheOp::Global>(prev, work);
#pragma unroll
    for (int elem = 0; elem < vec_size; ++elem) {
      reduction_op(out[elem], prev[elem]);
    }
  }
  // Publish the updated partial unless this was the final step.
  if (!last_step) {
    loadLocalToGlobal<T, vec_size, true>(work, out);
  }
}
// Number of memory transactions needed to cover [vec_size] elements of type
// T per thread. A single thread may access at most 16 bytes per transaction
// (the maximum vectorization width), so wider per-thread data is split into
// multiple 16-byte transactions.
template <typename T, int vec_size>
constexpr __device__ int getTransactions() {
  constexpr int total_bytes = vec_size * sizeof(T);
  if (total_bytes <= 16) {
    return 1;
  }
  return total_bytes / 16;
}
// Number of elements of type T handled by one (at most 16-byte) transaction:
// the full vec_size when it fits in 16 bytes, otherwise however many
// elements fill exactly 16 bytes.
template <typename T, int vec_size>
constexpr __device__ int getElementsPerTransaction() {
  constexpr int total_bytes = vec_size * sizeof(T);
  if (total_bytes <= 16) {
    return vec_size;
  }
  return 16 / sizeof(T);
}
// Total number of elements in one section: a [col_len] x [row_len] grid of
// thread slots, each holding [elements_per_thread] elements.
__inline__ __device__ nvfuser_index_t getElementsPerSection(
    nvfuser_index_t row_len,
    nvfuser_index_t col_len,
    nvfuser_index_t elements_per_thread) {
  const nvfuser_index_t slots_per_section = row_len * col_len;
  return slots_per_section * elements_per_thread;
}
// Linear element offset of slot (row_id, col_id) within one section, where
// each slot holds [elements_per_thread] elements.
__inline__ __device__ nvfuser_index_t getOffsetWithinSection(
    nvfuser_index_t row_len,
    nvfuser_index_t row_id,
    nvfuser_index_t col_id,
    nvfuser_index_t elements_per_thread) {
  const nvfuser_index_t slot_idx = row_id * row_len + col_id;
  return slot_idx * elements_per_thread;
}
// Vectorized (grouped) last-block cleanup used by iterGroupedGridReduce:
// loads the per-block partials from gmem, reduces them down to one value
// per participating thread, and folds the result into "out".
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool Aligned,
    int vec_size,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void iterGroupedGridReduceLastBlock(
    T* out,
    const volatile T* in,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t
        block_segment_size, // Number of reductions across the block
    Func reduction_op,
    T* shared_buf,
    bool write_pred,
    T init_val,
    const nvfuser_index_t grid_segment_size,
    const nvfuser_index_t idx_in_grid_segment,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.

  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, block_dim);
  // index into iteration dim.
  // Its calculation is same to that in [iterGroupedGridReduce]. Because when
  // [iterGroupedGridReduceLastBlock] is called from [iterGroupedGridReduce],
  // X_THREAD, Y_THREAD, Z_THREAD are flipped.
  const auto thread_offset =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, block_dim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
  constexpr unsigned int max_align_bytes = 16;
  constexpr unsigned int vec_bytes = sizeof(T) * vec_size;
  constexpr unsigned int align_bytes =
      vec_bytes > max_align_bytes ? max_align_bytes : vec_bytes;
  // Ensure alignment for vectorized load/store to smem in grouped block
  // reduction
  __align__(align_bytes) T inp[vec_size];
#pragma unroll
  for (int i = 0; i < vec_size; i++) {
    inp[i] = init_val;
  }
  // Max vectorized load/store size is 16 bytes, if each thread has more than
  // 16 bytes, split into multiple sections to ensure each thread occupies only
  // 16 bytes at most. For example, if each thread has 8 fp32 which occupies 32
  // bytes, split into 2 sections, in each section each thread holds 4 fp32 or
  // 16 bytes. Thread-0 processes elements [0,7], the first 4 elements [0,3] are
  // stored in the first section and the last 4 elements [4,7] are stored in the
  // 2nd section. The data layout in gmem is:
  //         |-----------section 1-----------|-----------section 2-----------|
  // TIDx:   |000|001|002|003|004|005|006|007|000|001|002|003|004|005|006|007|
  // GMEM:   |000|016|032|048|064|080|096|112|128|144|160|176|192|208|224|240|
  // Element:|000|008|016|024|032|040|048|056|004|012|020|028|036|044|052|060|
  // This layout ensures coalesced access to gmem and each transaction loads 128
  // bytes.
  constexpr auto n_transactions = getTransactions<T, vec_size>();
  constexpr auto n_elements_per_transaction =
      getElementsPerTransaction<T, vec_size>();
  const auto elements_per_section = getElementsPerSection(
      block_segment_size * grid_segment_size, // row len
      grid_reduction_segment_size, // col len
      n_elements_per_transaction);
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto offset_in_section = getOffsetWithinSection(
        block_segment_size * grid_segment_size, // row len
        reduction_i, // row id
        block_segment_size * idx_in_grid_segment + thread_offset, // col id
        n_elements_per_transaction);
#pragma unroll
    for (auto i = 0; i < n_transactions; i++) {
      auto i_offset = i * n_elements_per_transaction;
      T in_reg[n_elements_per_transaction];
      loadGlobalToLocal<T, n_elements_per_transaction, true, CacheOp::Global>(
          &in_reg[0],
          const_cast<T*>(in + elements_per_section * i + offset_in_section));
#pragma unroll
      for (auto j = 0; j < n_elements_per_transaction; j++) {
        reduction_op(inp[i_offset + j], in_reg[j]);
      }
    }
  }
  // Block reduce the per thread values into per "participating" thread values.
  // inp_tmp stores output results, not being vectorized loaded to smem, no need
  // to enforce alignment.
  T inp_tmp[vec_size];
#pragma unroll
  for (int i = 0; i < vec_size; i++) {
    inp_tmp[i] = init_val;
  }
  blockIterGroupedYdimReduce<Aligned, vec_size>(
      inp_tmp, inp, reduction_op, shared_buf, true, init_val, block_dim);
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
#pragma unroll
    for (int i = 0; i < vec_size; i++) {
      reduction_op(out[i], inp_tmp[i]);
    }
  }
}
// Main algorithm is same to gridReduce: start with block reduce then write
// results to gmem, the last block load from gmem and finalize with a block
// reduction. Main differences:
// (1) each thread in the iter dim does [vec_size] reductions instead of 1.
// (2) using [blockIterGroupedYdimReduce] instead of [blockReduce].
// (3) ensures vectorized load/store to gmem.
// Specifically, the new para [vec_size] is the vectorization factor in the
// iteration dimension. It is used in outer reduction to reduce calling this
// grid reduction from [vec_size] times to only 1 time. Its value is limited
// to 1, 2, 4, 8, 16 based on the hardware support and input data type.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool Aligned,
    int vec_size,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void iterGroupedGridReduce(
    T* out,
    const T* inp_val,
    Func reduction_op,
    volatile T* work_buf,
    int64_t* sync_flags,
    T* shared_buf,
    bool read_pred,
    bool write_pred,
    T init_val,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // inp or block reduction results
  T block_reduction_val[vec_size];
  // Do block reduction when required
  if (X_THREAD || Y_THREAD || Z_THREAD) {
#pragma unroll
    for (int i = 0; i < vec_size; i++) {
      block_reduction_val[i] = init_val;
    }
    blockIterGroupedYdimReduce<Aligned, vec_size>(
        block_reduction_val,
        inp_val,
        reduction_op,
        shared_buf,
        read_pred,
        true,
        init_val,
        block_dim);
  } else if (read_pred) {
#pragma unroll
    for (int i = 0; i < vec_size; i++) {
      block_reduction_val[i] = inp_val[i];
    }
  }
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of reductions in each block
  const auto block_segment_size =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
  // Number of reductions in the grid
  const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION
      ? 1
      : index_utils::maskedSize<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(gridDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  if ((!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
            threadIdx, block_dim);
    // Max vectorized load/store size is 16 bytes, if each thread has more than
    // 16 bytes, split into multiple sections to ensure each thread occupies
    // only 16 bytes at most. For example, if each thread has 8 fp32 which
    // occupies 32 bytes, split into 2 sections, in each section each thread
    // holds 4 fp32 or 16 bytes. Thread-0 processes elements [0,7], the first 4
    // elements [0,3] are stored in the first section and the last 4 elements
    // [4,7] are stored in the 2nd section. The data layout in gmem is:
    //         |-----------section 1-----------|-----------section 2-----------|
    // TIDx:   |000|001|002|003|004|005|006|007|000|001|002|003|004|005|006|007|
    // GMEM:   |000|016|032|048|064|080|096|112|128|144|160|176|192|208|224|240|
    // Element:|000|008|016|024|032|040|048|056|004|012|020|028|036|044|052|060|
    // This layout ensures coalesced access to gmem and each transaction loads
    // 128 bytes.
    constexpr auto n_transactions = getTransactions<T, vec_size>();
    constexpr auto n_elements_per_transaction =
        getElementsPerTransaction<T, vec_size>();
    // get elements per section, used to offset between different sections
    // number of elements in each thread: [n_elements_per_transaction]
    // number of threads in each row: [block_segment_size] * [grid_segment_size]
    // number of rows in each section: [grid_reduction_segment_size]
    auto elements_per_section = getElementsPerSection(
        block_segment_size * grid_segment_size, // row len
        grid_reduction_segment_size, // col len
        n_elements_per_transaction);
    // index to the right position in [work_buf] to store block reduction
    // results. Consider a typical outer reduction case where iteration dim is
    // TIDx and BIDx and reduction dim is TIDy and BIDy. block_offset = BIDy
    // block_segment_size = blockDim.x
    // grid_segment_size = gridDim.x
    // idx_in_grid_segment = BIDx
    // thread_offset = TIDx
    auto offset_in_section = getOffsetWithinSection(
        block_segment_size * grid_segment_size, // row len
        block_offset, // row id
        block_segment_size * idx_in_grid_segment + thread_offset, // col id
        n_elements_per_transaction);
#pragma unroll
    for (int i = 0; i < n_transactions; i++) {
      loadLocalToGlobal<T, n_elements_per_transaction, true>(
          &work_buf[elements_per_section * i + offset_in_section],
          &block_reduction_val[i * n_elements_per_transaction]);
    }
  }
  // Unlike the multi-entrance grid reductions above, this reduction is
  // issued only once per kernel, so the persistent and non-persistent paths
  // both sync on the same per-segment flag (no entrance index is needed).
  // The two paths previously had byte-identical bodies; they are collapsed
  // into a single call here.
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
      sync_flags[idx_in_grid_segment],
      grid_reduction_segment_size,
      block_dim);
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // Cleanup with block reduction
    iterGroupedGridReduceLastBlock<
        !X_THREAD,
        !Y_THREAD,
        !Z_THREAD,
        Aligned,
        vec_size>(
        out,
        (T*)work_buf,
        grid_reduction_segment_size,
        block_segment_size,
        reduction_op,
        shared_buf,
        write_pred,
        init_val,
        grid_segment_size,
        idx_in_grid_segment,
        block_dim);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  }
}
} // namespace reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace grid_broadcast {
// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
//   dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
//   dimensions
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool Aligned,
    typename T,
    typename BlockDimT>
__device__ void broadcast(
    T& out,
    const T& inp_val,
    volatile T* work_buf,
    Tensor<int64_t, 1> sync_flags,
    bool read_write_pred,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // Number of blocks participating in each broadcast segment.
  const auto grid_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Which broadcast segment this block belongs to.
  const auto grid_segment_idx =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Threads not participating in a broadcast dimension each get their own
  // work-buffer entry, so this is the number of entries per segment.
  const auto entries_per_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
  // This thread's entry within its segment of the work buffer.
  const auto entry_in_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, block_dim);
  // Both the producing write and the consuming read target this slot.
  const auto work_buf_idx =
      grid_segment_idx * entries_per_segment + entry_in_segment;
  // The source of the broadcast: the last block along every broadcasted
  // grid dimension and thread 0 along every broadcasted block dimension.
  const bool is_source = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
      (!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
      (!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
      (!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
      (!Z_THREAD || threadIdx.z == 0);
  if (is_source && read_write_pred) {
    work_buf[work_buf_idx] = inp_val;
    // Make the store visible to all blocks before the sync releases readers.
    __threadfence();
  }
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true, Aligned>(
      sync_flags[grid_segment_idx], grid_segment_size, block_dim);
  if (read_write_pred) {
    out = work_buf[work_buf_idx];
  }
  // Everyone must have read from the buffer before the kernel may continue
  // and potentially overwrite it.
  grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true, Aligned>(
      sync_flags[grid_segment_idx], grid_segment_size, block_dim);
}
} // namespace grid_broadcast
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// Aligned: Called from aligned threads if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
//
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool Aligned,
    typename T,
    typename BlockDimT>
__device__ void blockBroadcast(
    T& out,
    const T& inp_val,
    T* shared_mem,
    bool read_write_pred,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // Slot in shared memory shared by all threads of one broadcast group.
  const auto smem_idx =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, block_dim);
  // Thread 0 along every broadcasted dimension is the source.
  const bool is_source = (!X_THREAD || threadIdx.x == 0) &&
      (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);
  if (is_source && read_write_pred) {
    shared_mem[smem_idx] = inp_val;
  }
  // Barrier between the source's write and everyone's read.
  block_sync::sync<Aligned>(block_dim);
  if (read_write_pred) {
    out = shared_mem[smem_idx];
  }
  // Barrier before the shared buffer may be reused by later code.
  block_sync::sync<Aligned>(block_dim);
}
} // namespace broadcast
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Running state of a Welford computation, grouped as one value.
// NOTE(review): based on how welfordCombine uses the corresponding fields,
// "var" appears to hold the un-normalized M2 accumulator (sum of squared
// deviations), not the final variance — confirm against callers.
template <typename DataType>
struct WelfordTriplet {
  DataType avg; // running mean
  DataType var; // running M2-style accumulator (see note above)
  nvfuser_index_t N; // number of elements accumulated so far
};
// Unpacks a WelfordTriplet into three separately-addressed destinations.
template <typename DataType>
__inline__ __device__ void copyTriplet(
    DataType* dst_avg,
    DataType* dst_var,
    nvfuser_index_t* dst_N,
    const WelfordTriplet<DataType>& src) {
  dst_avg[0] = src.avg;
  dst_var[0] = src.var;
  dst_N[0] = src.N;
}
// Packs three separately-addressed sources into a WelfordTriplet.
template <typename DataType>
__inline__ __device__ void copyTriplet(
    WelfordTriplet<DataType>& dst,
    const DataType* src_avg,
    const DataType* src_var,
    const nvfuser_index_t* src_N) {
  dst.avg = src_avg[0];
  dst.var = src_var[0];
  dst.N = src_N[0];
}
// Copies one WelfordTriplet into another.
template <typename DataType>
__inline__ __device__ void copyTriplet(
    WelfordTriplet<DataType>& dst,
    const WelfordTriplet<DataType>& src) {
  // Memberwise copy via the implicitly-defined copy assignment; equivalent
  // to assigning avg, var, and N individually.
  dst = src;
}
// -----------------------------------------------------------------------------------------------
// Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Welford update step: folds the partial result (b_avg, b_M2, b_N) into the
// running triplet (a_avg, a_M2, a_N). Works both for scanning in a single
// value and for merging two partial Welford results.
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
    T& a_avg,
    T& a_M2,
    TN& a_N,
    const T b_avg,
    const T b_M2,
    TN b_N) {
  // Merging an empty partial is a no-op (and would divide by a_N + 0 == a_N,
  // possibly 0, below).
  if (b_N == 0) {
    return;
  }
  TN merged_N = a_N + b_N;
  T b_frac = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(merged_N));
  T mean_delta = b_avg - a_avg;
  a_avg += mean_delta * b_frac;
  a_M2 +=
      b_M2 + mean_delta * mean_delta * ((T)(nvfuser_index_t)(a_N)) * b_frac;
  a_N = merged_N;
}
// Predicated vectorized Welford update. b_N_div_ab_N and ab_N are
// precomputed by the caller.
template <typename T, bool OutputGmem>
__inline__ __device__ void welfordVectorized(
    T& a_avg,
    T& a_M2,
    nvfuser_index_t& a_N,
    const T b_avg,
    const T b_N_div_ab_N,
    const nvfuser_index_t ab_N,
    const bool pred) {
  // Prefer predicated arithmetic over branching, but stores to gmem with a
  // false predicate can be illegal, so bail out in that case.
  if (OutputGmem && !pred) {
    return;
  }
  // When predicated off, substituting a_avg for b_avg makes both deltas
  // zero, so the avg/M2 updates below become no-ops.
  const T effective_b_avg = pred ? b_avg : a_avg;
  const T pre_delta = effective_b_avg - a_avg;
  a_avg += pre_delta * b_N_div_ab_N;
  const T post_delta = effective_b_avg - a_avg;
  a_M2 += pre_delta * post_delta;
  a_N = ab_N;
}
// Non-predicated version of the vectorized Welford update above.
template <typename T>
__inline__ __device__ void welfordVectorized(
    T& a_avg,
    T& a_M2,
    nvfuser_index_t& a_N,
    const T b_avg,
    const T b_N_div_ab_N,
    const nvfuser_index_t ab_N) {
  const T pre_delta = b_avg - a_avg;
  a_avg += pre_delta * b_N_div_ab_N;
  const T post_delta = b_avg - a_avg;
  a_M2 += pre_delta * post_delta;
  a_N = ab_N;
}
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
//
// Block-wide Welford reduction over the dimensions selected by
// X/Y/Z_REDUCE, implemented as a shared-memory tree reduction. Each thread
// contributes (in_avg, in_M2, in_N); the combined triplet is folded into
// (out_avg, out_M2, out_N) on threads whose reduced-dimension indices are
// all zero (when write_pred also holds). shared_mem_* must each have room
// for one entry per thread of the block.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    bool Aligned,
    typename T,
    typename TN,
    typename BlockDimT>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_pred,
    bool write_pred,
    T init_val,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // If this thread will output a final result
  bool should_write =
      index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(threadIdx);
  // Size of the reduction segments
  unsigned int reduction_size =
      index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
  // Index into the reduction segment
  unsigned int reduction_tid =
      index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
          threadIdx, block_dim);
  // Index of the reduction segment
  unsigned int reduction_idx =
      index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
          threadIdx, block_dim);
  // Offset into smem for the current thread
  unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
  // Stage this thread's input into shared memory; predicated-off threads
  // contribute the neutral triplet (init_val, init_val, 0).
  if (read_pred) {
    shared_mem_avg[smem_offset] = in_avg;
    shared_mem_M2[smem_offset] = in_M2;
    shared_mem_N[smem_offset] = in_N;
  } else {
    shared_mem_avg[smem_offset] = init_val;
    shared_mem_M2[smem_offset] = init_val;
    shared_mem_N[smem_offset] = 0;
  }
  block_sync::sync<Aligned>(block_dim);
  // Reduce down to nearest power of 2:
  int np2 = 1 << (31 - __clz(reduction_size));
  // Fold the "overhang" beyond np2 into the first np2 entries so the tree
  // reduction below can halve cleanly.
  if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
    welfordCombine(
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset],
        shared_mem_avg[smem_offset + np2],
        shared_mem_M2[smem_offset + np2],
        shared_mem_N[smem_offset + np2]);
  }
  block_sync::sync<Aligned>(block_dim);
  // loop peel the final iteration to save one syncthread for the end
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (reduction_tid < factor) {
      welfordCombine(
          shared_mem_avg[smem_offset],
          shared_mem_M2[smem_offset],
          shared_mem_N[smem_offset],
          shared_mem_avg[smem_offset + factor],
          shared_mem_M2[smem_offset + factor],
          shared_mem_N[smem_offset + factor]);
    }
    block_sync::sync<Aligned>(block_dim);
  }
  // The peeled last step: combine the (at most two) remaining entries
  // directly into the output, accumulating onto the caller's current
  // out_avg/out_M2/out_N rather than overwriting them.
  if (should_write && write_pred) {
    T res_avg = out_avg;
    T res_M2 = out_M2;
    TN res_N = out_N;
    welfordCombine(
        res_avg,
        res_M2,
        res_N,
        shared_mem_avg[smem_offset],
        shared_mem_M2[smem_offset],
        shared_mem_N[smem_offset]);
    if (reduction_size > 1) {
      welfordCombine(
          res_avg,
          res_M2,
          res_N,
          shared_mem_avg[smem_offset + 1],
          shared_mem_M2[smem_offset + 1],
          shared_mem_N[smem_offset + 1]);
    }
    out_avg = res_avg;
    out_M2 = res_M2;
    out_N = res_N;
  }
  // Final barrier so the shared buffers may be reused by later code.
  block_sync::sync<Aligned>(block_dim);
}
// Convenience overload of blockWelford that applies one predicate to both
// the read and the write side.
template <
    bool X_REDUCE,
    bool Y_REDUCE,
    bool Z_REDUCE,
    bool Aligned,
    typename T,
    typename TN,
    typename BlockDimT>
__inline__ __device__ void blockWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& in_avg,
    const T& in_M2,
    const TN& in_N,
    T* shared_mem_avg,
    T* shared_mem_M2,
    TN* shared_mem_N,
    bool read_write_pred,
    T init_val,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // Forward to the two-predicate overload, passing read_write_pred for both
  // read_pred and write_pred. T/TN/BlockDimT are deduced from the arguments.
  blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, Aligned>(
      out_avg,
      out_M2,
      out_N,
      in_avg,
      in_M2,
      in_N,
      shared_mem_avg,
      shared_mem_M2,
      shared_mem_N,
      read_write_pred,
      read_write_pred,
      init_val,
      block_dim);
}
// -----------------------------------------------------------------------------------------------
// Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {
// Last-block cleanup for gridWelford: the final block serially accumulates
// the per-block partial triplets from the work buffers, block-reduces them,
// and folds the combined triplet into (out_avg, out_M2, out_N) on the
// designated writer threads.
template <
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool Aligned,
    typename T,
    typename TN,
    typename BlockDimT>
__device__ void gridWelfordLastBlock(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const volatile T* in_avg,
    const volatile T* in_M2,
    const volatile TN* in_N,
    const nvfuser_index_t
        grid_reduction_segment_size, // Number of reductions across
                                     // grid reduce dimensions
    const nvfuser_index_t block_reduction_segment_size,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool write_pred,
    T init_val) {
  // We have to do num_reductions across reduction_size. The reductions are
  // contiguous, but offset by reduction_size. There is an entry in "in" for
  // every block, and every thread marked as true. Threads in dimensions marked
  // as false can be used to parallelize the reduction.
  // Find the reduction id of the participating threads
  const auto block_reduction_segment_idx =
      index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
          threadIdx, block_dim);
  // Find an id associated within a reduction segment for all
  // "non-participating" threads, which will parallelize the reductions for the
  // "participating" threads
  const auto id_in_block_segment =
      index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
          threadIdx, block_dim);
  // Stride by the "non-participating" threads
  const auto input_stride_for_thread_in_segment =
      index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
  // Local running triplet, seeded with the neutral element.
  T inp_avg = init_val;
  T inp_M2 = init_val;
  TN inp_N = 0;
  // Block stride across the reduction until we only have one value per thread
  for (nvfuser_index_t reduction_i = id_in_block_segment;
       reduction_i < grid_reduction_segment_size;
       reduction_i += input_stride_for_thread_in_segment) {
    auto work_buf_offset = reduction_i * block_reduction_segment_size +
        block_reduction_segment_idx;
    welfordCombine(
        inp_avg,
        inp_M2,
        inp_N,
        in_avg[work_buf_offset],
        in_M2[work_buf_offset],
        in_N[work_buf_offset]);
  }
  // Block reduce the per thread values into per "participating" thread values
  T inp_avg_tmp = init_val;
  T inp_M2_tmp = init_val;
  TN inp_N_tmp = 0;
  blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
      inp_avg_tmp,
      inp_M2_tmp,
      inp_N_tmp,
      inp_avg,
      inp_M2,
      inp_N,
      shared_buf_avg,
      shared_buf_M2,
      shared_buf_N,
      true,
      init_val,
      block_dim);
  // Only thread 0 of each non-participating dimension publishes the result.
  const bool should_write = (X_THREAD || threadIdx.x == 0) &&
      (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
  if (should_write && write_pred) {
    welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
  }
}
// Grid welford combine. See GridReduction for more information
//
// Every block writes its partial Welford triplet to the work buffers, all
// blocks of the segment synchronize via sync_flags, and the last block to
// arrive performs the final combine through gridWelfordLastBlock.
template <
    bool X_BLOCK,
    bool Y_BLOCK,
    bool Z_BLOCK,
    bool X_THREAD,
    bool Y_THREAD,
    bool Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool Aligned,
    typename T,
    typename TN,
    typename BlockDimT>
__device__ void gridWelford(
    T& out_avg,
    T& out_M2,
    TN& out_N,
    const T& inp_avg,
    const T& inp_M2,
    const TN& inp_N,
    volatile T* work_buf_avg,
    volatile T* work_buf_M2,
    volatile TN* work_buf_N,
    Tensor<int64_t, 1> sync_flags,
    T* shared_buf_avg,
    T* shared_buf_M2,
    TN* shared_buf_N,
    bool read_pred,
    bool write_pred,
    T init_val,
    const nvfuser_index_t entrance_ind,
    const nvfuser_index_t n_entrances,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the dimension of the compute warps.
    BlockDimT block_dim) {
  // entrance index only matters for non-persistent re-entrant grid reductions.
  const nvfuser_index_t entrance_ind_ = PERSISTENT_REDUCTION ? 0 : entrance_ind;
  const nvfuser_index_t n_entrances_ = PERSISTENT_REDUCTION ? 1 : n_entrances;
  // Number of values to reduce in the reduction segment
  const auto grid_reduction_segment_size =
      index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
  // Index of the reduction we're performing out of the
  // grid_reduction_segment_size
  const auto idx_in_grid_segment =
      index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
          blockIdx, gridDim);
  // Number of threads we can use in final reduction, Seems to assume all
  // threads in the block participate
  const auto block_reduction_segment_size =
      index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(block_dim);
  // Number of reductions in the grid
  const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION
      ? 1
      : index_utils::maskedSize<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(gridDim);
  // advance to the offset for this segment
  // index of reduction * size of the reduction * size of threads
  work_buf_avg += (entrance_ind_ * grid_segment_size + idx_in_grid_segment) *
      grid_reduction_segment_size * block_reduction_segment_size;
  work_buf_M2 += (entrance_ind_ * grid_segment_size + idx_in_grid_segment) *
      grid_reduction_segment_size * block_reduction_segment_size;
  work_buf_N += (entrance_ind_ * grid_segment_size + idx_in_grid_segment) *
      grid_reduction_segment_size * block_reduction_segment_size;
  // One designated thread per reduced block dimension writes this block's
  // partial triplet to the work buffers.
  if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
      (Z_THREAD || threadIdx.z == 0)) {
    auto block_offset =
        index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
    auto thread_offset =
        index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
            threadIdx, block_dim);
    auto work_buf_offset =
        block_offset * block_reduction_segment_size + thread_offset;
    if (read_pred) {
      work_buf_avg[work_buf_offset] = inp_avg;
      work_buf_M2[work_buf_offset] = inp_M2;
      work_buf_N[work_buf_offset] = inp_N;
    } else {
      // Predicated-off blocks contribute the neutral triplet.
      work_buf_avg[work_buf_offset] = init_val;
      work_buf_M2[work_buf_offset] = init_val;
      work_buf_N[work_buf_offset] = 0;
    }
  }
  if (PERSISTENT_REDUCTION) {
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  } else {
    // Use a different sync flag for each call
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[entrance_ind_ * grid_segment_size + idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  }
  bool last_block =
      index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
  if (last_block) {
    // final reduction
    gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD, Aligned>(
        out_avg,
        out_M2,
        out_N,
        work_buf_avg,
        work_buf_M2,
        work_buf_N,
        grid_reduction_segment_size,
        block_reduction_segment_size,
        block_dim,
        shared_buf_avg,
        shared_buf_M2,
        shared_buf_N,
        write_pred,
        init_val);
  }
  if (PERSISTENT_REDUCTION) {
    // Make sure we're done with global memory before we allow the kernel to
    // continue
    grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
        sync_flags[idx_in_grid_segment],
        grid_reduction_segment_size,
        block_dim);
  }
}
} // namespace welford
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace warp {
// Butterfly exchange within a warp: each lane swaps `var` with the lane whose
// id differs by `laneMask`, using a full-warp participation mask.
template <typename T>
__device__ __forceinline__ T shfl_xor(T var, int laneMask, int width = 32) {
  constexpr unsigned kFullWarpMask = 0xffffffff;
  return __shfl_xor_sync(kFullWarpMask, var, laneMask, width);
}
// std::complex overload of shfl_xor: the real and imaginary components are
// exchanged independently and reassembled into a complex value.
template <typename T>
__device__ __forceinline__ std::complex<T> shfl_xor(
    std::complex<T> var,
    int laneMask,
    int width = 32) {
  constexpr unsigned kFullWarpMask = 0xffffffff;
  const T shuffled_real =
      __shfl_xor_sync(kFullWarpMask, var.real(), laneMask, width);
  const T shuffled_imag =
      __shfl_xor_sync(kFullWarpMask, var.imag(), laneMask, width);
  return std::complex<T>(shuffled_real, shuffled_imag);
}
//! Block reduction along TIDx built on warp shuffles.
//!
//! Each warp first reduces its 32 lanes with butterfly shuffles. Unless
//! SINGLE_WARP, per-warp partial results are then staged in shared memory
//! (one slot per warp, grouped by the (TIDy, TIDz) reduction segment) and
//! reduced by warp 0; each warp-head lane finally accumulates its segment's
//! result into `out` via reduction_op (out is accumulated, not overwritten).
//!
//! Preconditions visible in the code below:
//! - block_dim.x is assumed padded to a multiple of 32;
//! - num_of_warps (= block_dim.x / 32) must be <= 32 (runtime assert);
//! - shared_mem must hold at least block_dim.y * block_dim.z * num_of_warps
//!   elements of T.
template <
    bool SINGLE_WARP,
    bool Aligned,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void warpReduceTIDX(
    T& out,
    const T& inp_val,
    Func reduction_op,
    T* shared_mem,
    bool read_write_pred,
    T init_val,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the the dimension of the compute warps.
    BlockDimT block_dim) {
  constexpr int WARP_SIZE = 32;
  // Assume input padded to multiples of a warp
  T reduce_val = init_val;
  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }
  // Reduce within each warp
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(reduce_val, shfl_xor(reduce_val, i, WARP_SIZE));
  }
  // Reduce across warp if needed
  // Load value to shared mem
  if (!SINGLE_WARP) {
    unsigned int warp_idx = threadIdx.x / WARP_SIZE;
    unsigned int lane_idx = threadIdx.x % WARP_SIZE;
    // One reduction segment per (TIDy, TIDz) pair; each segment owns
    // num_of_warps consecutive smem slots starting at smem_offset.
    unsigned int reduce_group_id = threadIdx.z * block_dim.y + threadIdx.y;
    bool is_warp_head = lane_idx == 0;
    unsigned int reduction_size = block_dim.x;
    unsigned int num_of_warps = reduction_size / WARP_SIZE;
    unsigned int smem_offset = reduce_group_id * num_of_warps;
    // Sync before writing so a previous use of shared_mem cannot race with
    // the stores below.
    block_sync::sync<Aligned>(block_dim);
    if (is_warp_head) {
      shared_mem[smem_offset + warp_idx] = reduce_val;
    }
    block_sync::sync<Aligned>(block_dim);
    if (warp_idx == 0) {
      // This assumes num_of_warps will be < 32, meaning < 1024 threads.
      // Should be true for long enough.
      assert(num_of_warps <= 32);
      reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
                                           : init_val;
      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(reduce_val, shfl_xor(reduce_val, i, 32));
      }
    }
    if (is_warp_head) {
      reduction_op(out, reduce_val);
    }
    // needs sync, otherwise other warps may access shared memory before this
    // reduction is done.
    block_sync::sync<Aligned>(block_dim);
  } else {
    reduction_op(out, reduce_val);
  }
}
//! Block reduction over the fused TIDx*TIDy dimension built on warp shuffles.
//!
//! Compile-time variant of warpReduceTIDX: BDIMX/BDIMY are template
//! parameters, so the number of warps is a constant and the cross-warp branch
//! is resolved at compile time. Each warp reduces its lanes with butterfly
//! shuffles; if more than one warp participates, per-warp partials are staged
//! in shared_mem (num_of_warps elements of T) and reduced by warp 0, whose
//! lane 0 accumulates into `out` (out is accumulated, not overwritten).
template <
    int BDIMX,
    int BDIMY,
    bool Aligned,
    typename T,
    typename Func,
    typename BlockDimT>
__device__ void warpReduceTIDXY(
    T& out,
    const T& inp_val,
    Func reduction_op,
    T* shared_mem,
    bool read_write_pred,
    T init_val,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the the dimension of the compute warps.
    BlockDimT block_dim) {
  constexpr int WARP_SIZE = 32;
  constexpr int num_of_warps = BDIMX * BDIMY / WARP_SIZE;
  // The cross-warp step below folds the per-warp partials within a single
  // warp (the `lane_idx < num_of_warps` gather), so at most WARP_SIZE warps
  // (1024 threads) are supported. warpReduceTIDX checks this with a runtime
  // assert; here it can be enforced at compile time.
  static_assert(
      num_of_warps <= WARP_SIZE,
      "warpReduceTIDXY supports at most 32 warps (1024 threads)");
  // Assume input padded to multiples of a warp
  T reduce_val = init_val;
  // Do warp reduction
  if (read_write_pred) {
    reduce_val = inp_val;
  }
  // Reduce within each warp
  for (int i = 16; i >= 1; i /= 2) {
    reduction_op(reduce_val, shfl_xor(reduce_val, i, WARP_SIZE));
  }
  // Reduce across warp if needed
  // Load value to shared mem
  if (num_of_warps > 1) {
    unsigned int idx = threadIdx.x + threadIdx.y * BDIMX;
    unsigned int warp_idx = idx / WARP_SIZE;
    unsigned int lane_idx = idx % WARP_SIZE;
    // Sync before writing so a previous use of shared_mem cannot race with
    // the stores below.
    block_sync::sync<Aligned>(block_dim);
    if (lane_idx == 0) {
      shared_mem[warp_idx] = reduce_val;
    }
    block_sync::sync<Aligned>(block_dim);
    if (warp_idx == 0) {
      reduce_val = lane_idx < num_of_warps ? shared_mem[lane_idx] : init_val;
      // Reduce within warp 0
      for (int i = 16; i >= 1; i /= 2) {
        reduction_op(reduce_val, shfl_xor(reduce_val, i, 32));
      }
    }
    if (lane_idx == 0) {
      reduction_op(out, reduce_val);
    }
    // needs sync, otherwise other warps may access shared memory before this
    // reduction is done.
    block_sync::sync<Aligned>(block_dim);
  } else {
    reduction_op(out, reduce_val);
  }
}
} // namespace warp
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Utility for converting generic pointer to SMEM pointer in PTX.
// We should review vectorized load/stores with shared memory.
// SMEM memory movement PTX is only Global -> SMEM, SMEM -> Local, Local ->
// SMEM, and this is needed for these PTX instructions to provide the SMEM
// pointer.
// Converts a generic pointer to a 32-bit shared-memory address via
// cvta.to.shared (generic -> shared state space) followed by a 64->32-bit
// narrowing. The caller must pass a pointer that actually refers to shared
// memory; the result feeds PTX instructions that take SMEM operands.
__device__ inline unsigned toSmem(const void* raw_ptr) {
  unsigned smem_ptr_uint;
  asm("{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
      : "=r"(smem_ptr_uint)
      : "l"(raw_ptr));
  return smem_ptr_uint;
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
namespace Turing {
// LdMatrix has .x1, .x2 and .x4 options, currently we actively use .x2 and
// .x4. In .x2 option. the the address register of upper half warp (lane 16-31)
// are un-used but on Turing [sm75,sm80) architecture these un-used addresses
// need to be valid, in the sense that:
// 1. The data it points to has to be within allocated shared mem buffer.
// 2. The address needs to be aligned to 16 byte.
// See also:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
// This function addresses 2. above by masking out the sub-16B component
// of the address in upper warp and 1. is guaranteed by ldmatrix swizzle
// util.
// This will **not** affect any functionality. This is just modification
// of unused pointers to satisfy the alignment requirement on Turing
// hardware.
// The alignment requirement is lifted on sm80+,
// so this function is a no-op on Ampere or above.
template <unsigned num_valid_addresses>
__device__ inline unsigned adjustPartialLdMatrixAddrInTuring(
    unsigned addr_in_byte) {
  // Lanes whose address operand is actually consumed by ldmatrix keep their
  // address; the unused upper lanes are given address 0, which trivially
  // satisfies Turing's 16-byte alignment requirement (see header comment).
  const unsigned lane_in_warp = threadIdx.x % 32;
  return lane_in_warp < num_valid_addresses ? addr_in_byte : 0u;
}
} // namespace Turing
#endif // Arch 75
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
namespace Hopper {
// Description: Elect a leader thread from a set of threads in a warp
//
// The common pattern is to select any thread from the first warp without
// creating a serialized, peeling loop.
//
// Code example: threadIdx.x / 32 == 0 && ptx::elect_sync(~0)
//
// Compile Explorer Reference: https://ce.nvidia.com/z/d9x4q8
//
// Document Reference:
// https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-elect-sync
// Executes elect.sync over `membermask` and returns true exactly on the one
// elected lane (the predicate output is converted to 0/1 via selp). All lanes
// named in membermask must reach this call together.
__device__ inline bool electSync(const uint32_t& membermask) {
  uint32_t is_elected;
  asm volatile(
      "{\n\t .reg .pred P_OUT; \n\t"
      "elect.sync _|P_OUT, %1;\n\t"
      "selp.b32 %0, 1, 0, P_OUT; \n"
      "}"
      : "=r"(is_elected)
      : "r"(membermask)
      :);
  return static_cast<bool>(is_elected);
}
// References:
//
// TMA:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// https://github.com/NVIDIA/cutlass/blob/main/include/cute/arch/copy_sm90_tma.hpp
//
// Tensor map:
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
// UBLK:
// https://github.com/NVIDIA/cutlass/blob/main/include/cute/arch/copy_sm90_tma.hpp#L1400
// UBLK Load:
// Source descriptor for a bulk global->shared (cluster) copy:
// raw global address, transfer size in bytes, and the SMEM address of the
// mbarrier that tracks completion.
struct CpAsyncBulkG2SIndex {
  const void* raw_gmem_addr;
  uint32_t bytes;
  uint32_t mbarrier;
};
// Issues cp.async.bulk global -> shared::cluster; completion (transferred
// byte count) is signaled on the mbarrier in `src`.
__device__ inline void cpAsyncBulkG2S(
    const CpAsyncBulkG2SIndex& src,
    uint32_t smem_addr) {
  asm volatile(
      "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
      :
      : "r"(smem_addr),
        "l"(src.raw_gmem_addr),
        "r"(src.bytes),
        "r"(src.mbarrier)
      : "memory");
}
// UBLK Store:
// Destination descriptor for a bulk shared->global copy: raw global address
// and transfer size in bytes (completion tracked via bulk_group, no mbarrier).
struct CpAsyncBulkS2GIndex {
  const void* raw_gmem_addr;
  uint32_t bytes;
};
// Issues cp.async.bulk shared::cta -> global, tracked by the current
// bulk_group (pair with cp.async.bulk.commit_group / wait_group elsewhere).
__device__ inline void cpAsyncBulkS2G(
    const CpAsyncBulkS2GIndex& dst,
    uint32_t smem_addr) {
  asm volatile("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;\n"
               :
               : "l"(dst.raw_gmem_addr), "r"(smem_addr), "r"(dst.bytes)
               : "memory");
}
// TMA Loads:
// Source descriptor for a dim-dimensional TMA tile load: the tensor map,
// per-dimension tile coordinates, and the SMEM address of the completion
// mbarrier.
template <int dim>
struct CpAsyncBulkTensorTileG2SIndex {
  const TensorMap* descriptor;
  Array<int32_t, dim> crds;
  uint32_t mbarrier;
};
// 1D TMA tile load (global -> shared::cluster); completion is signaled on the
// mbarrier in `src`.
__device__ inline void cpAsyncBulkTensorTileG2S(
    const CpAsyncBulkTensorTileG2SIndex<1>& src,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes"
      " [%0], [%1, {%3}], [%2];"
      :
      : "r"(smem_addr), "l"(gmem_int_desc), "r"(src.mbarrier), "r"(src.crds[0])
      : "memory");
}
// 1D TMA tile load with cluster multicast: the tile is broadcast to the CTAs
// selected by cta_mask; completion is signaled on the mbarrier in `src`.
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
    const CpAsyncBulkTensorTileG2SIndex<1>& src,
    uint32_t smem_addr,
    uint16_t cta_mask) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
      " [%0], [%1, {%3}], [%2], %4;"
      :
      : "r"(smem_addr),
        "l"(gmem_int_desc),
        "r"(src.mbarrier),
        "r"(src.crds[0]),
        "h"(cta_mask)
      : "memory");
}
// 2D TMA tile load (global -> shared::cluster); completion is signaled on the
// mbarrier in `src`.
__device__ inline void cpAsyncBulkTensorTileG2S(
    const CpAsyncBulkTensorTileG2SIndex<2>& src,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes"
      " [%0], [%1, {%3, %4}], [%2];"
      :
      : "r"(smem_addr),
        "l"(gmem_int_desc),
        "r"(src.mbarrier),
        "r"(src.crds[0]),
        "r"(src.crds[1])
      : "memory");
}
// 2D TMA tile load with cluster multicast: the tile is broadcast to the CTAs
// selected by cta_mask; completion is signaled on the mbarrier in `src`.
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
    const CpAsyncBulkTensorTileG2SIndex<2>& src,
    uint32_t smem_addr,
    uint16_t cta_mask) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
      " [%0], [%1, {%3, %4}], [%2], %5;"
      :
      : "r"(smem_addr),
        "l"(gmem_int_desc),
        "r"(src.mbarrier),
        "r"(src.crds[0]),
        "r"(src.crds[1]),
        "h"(cta_mask)
      : "memory");
}
// 3D TMA tile load (global -> shared::cluster); completion is signaled on the
// mbarrier in `src`.
__device__ inline void cpAsyncBulkTensorTileG2S(
    const CpAsyncBulkTensorTileG2SIndex<3>& src,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes"
      " [%0], [%1, {%3, %4, %5}], [%2];"
      :
      : "r"(smem_addr),
        "l"(gmem_int_desc),
        "r"(src.mbarrier),
        "r"(src.crds[0]),
        "r"(src.crds[1]),
        "r"(src.crds[2])
      : "memory");
}
// 3D TMA tile load with cluster multicast: the tile is broadcast to the CTAs
// selected by cta_mask; completion is signaled on the mbarrier in `src`.
// NOTE(review): fixed the PTX qualifier ".multicast_cluster" to the valid
// ".multicast::cluster" spelling used by the 1D/2D overloads; the underscore
// form is not a legal PTX qualifier and would be rejected by ptxas.
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
    const CpAsyncBulkTensorTileG2SIndex<3>& src,
    uint32_t smem_addr,
    uint16_t cta_mask) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
      " [%0], [%1, {%3, %4, %5}], [%2], %6;"
      :
      : "r"(smem_addr),
        "l"(gmem_int_desc),
        "r"(src.mbarrier),
        "r"(src.crds[0]),
        "r"(src.crds[1]),
        "r"(src.crds[2]),
        "h"(cta_mask)
      : "memory");
}
// 4D TMA tile load (global -> shared::cluster); completion is signaled on the
// mbarrier in `src`.
__device__ inline void cpAsyncBulkTensorTileG2S(
    const CpAsyncBulkTensorTileG2SIndex<4>& src,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes"
      " [%0], [%1, {%3, %4, %5, %6}], [%2];"
      :
      : "r"(smem_addr),
        "l"(gmem_int_desc),
        "r"(src.mbarrier),
        "r"(src.crds[0]),
        "r"(src.crds[1]),
        "r"(src.crds[2]),
        "r"(src.crds[3])
      : "memory");
}
// 4D TMA tile load with cluster multicast: the tile is broadcast to the CTAs
// selected by cta_mask; completion is signaled on the mbarrier in `src`.
// NOTE(review): fixed the PTX qualifier ".multicast_cluster" to the valid
// ".multicast::cluster" spelling used by the 1D/2D overloads; the underscore
// form is not a legal PTX qualifier and would be rejected by ptxas.
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
    const CpAsyncBulkTensorTileG2SIndex<4>& src,
    uint32_t smem_addr,
    uint16_t cta_mask) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
      " [%0], [%1, {%3, %4, %5, %6}], [%2], %7;"
      :
      : "r"(smem_addr),
        "l"(gmem_int_desc),
        "r"(src.mbarrier),
        "r"(src.crds[0]),
        "r"(src.crds[1]),
        "r"(src.crds[2]),
        "r"(src.crds[3]),
        "h"(cta_mask)
      : "memory");
}
// 5D TMA tile load (global -> shared::cluster); completion is signaled on the
// mbarrier in `src`.
__device__ inline void cpAsyncBulkTensorTileG2S(
    const CpAsyncBulkTensorTileG2SIndex<5>& src,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes"
      " [%0], [%1, {%3, %4, %5, %6, %7}], [%2];"
      :
      : "r"(smem_addr),
        "l"(gmem_int_desc),
        "r"(src.mbarrier),
        "r"(src.crds[0]),
        "r"(src.crds[1]),
        "r"(src.crds[2]),
        "r"(src.crds[3]),
        "r"(src.crds[4])
      : "memory");
}
// 5D TMA tile load with cluster multicast: the tile is broadcast to the CTAs
// selected by cta_mask; completion is signaled on the mbarrier in `src`.
// NOTE(review): fixed the PTX qualifier ".multicast_cluster" to the valid
// ".multicast::cluster" spelling used by the 1D/2D overloads; the underscore
// form is not a legal PTX qualifier and would be rejected by ptxas.
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
    const CpAsyncBulkTensorTileG2SIndex<5>& src,
    uint32_t smem_addr,
    uint16_t cta_mask) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
      " [%0], [%1, {%3, %4, %5, %6, %7}], [%2], %8;"
      :
      : "r"(smem_addr),
        "l"(gmem_int_desc),
        "r"(src.mbarrier),
        "r"(src.crds[0]),
        "r"(src.crds[1]),
        "r"(src.crds[2]),
        "r"(src.crds[3]),
        "r"(src.crds[4]),
        "h"(cta_mask)
      : "memory");
}
// TMA Stores:
// Destination descriptor for a dim-dimensional TMA tile store: the tensor map
// and per-dimension tile coordinates (completion tracked via bulk_group).
template <int dim>
struct CpAsyncBulkTensorTileS2GIndex {
  const TensorMap* descriptor;
  Array<int32_t, dim> crds;
};
// 1D TMA tile store (shared::cta -> global), tracked by the current
// bulk_group.
__device__ inline void cpAsyncBulkTensorTileS2G(
    const CpAsyncBulkTensorTileS2GIndex<1>& dest,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [%0, {%2}], [%1];"
      :
      : "l"(gmem_int_desc), "r"(smem_addr), "r"(dest.crds[0])
      : "memory");
}
// 2D TMA tile store (shared::cta -> global), tracked by the current
// bulk_group.
__device__ inline void cpAsyncBulkTensorTileS2G(
    const CpAsyncBulkTensorTileS2GIndex<2>& dest,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%0, {%2, %3}], [%1];"
      :
      : "l"(gmem_int_desc), "r"(smem_addr), "r"(dest.crds[0]), "r"(dest.crds[1])
      : "memory");
}
// 3D TMA tile store (shared::cta -> global), tracked by the current
// bulk_group.
__device__ inline void cpAsyncBulkTensorTileS2G(
    const CpAsyncBulkTensorTileS2GIndex<3>& dest,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [%0, {%2, %3, %4}], [%1];"
      :
      : "l"(gmem_int_desc),
        "r"(smem_addr),
        "r"(dest.crds[0]),
        "r"(dest.crds[1]),
        "r"(dest.crds[2])
      : "memory");
}
// 4D TMA tile store (shared::cta -> global), tracked by the current
// bulk_group.
__device__ inline void cpAsyncBulkTensorTileS2G(
    const CpAsyncBulkTensorTileS2GIndex<4>& dest,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5}], [%1];"
      :
      : "l"(gmem_int_desc),
        "r"(smem_addr),
        "r"(dest.crds[0]),
        "r"(dest.crds[1]),
        "r"(dest.crds[2]),
        "r"(dest.crds[3])
      : "memory");
}
// 5D TMA tile store (shared::cta -> global), tracked by the current
// bulk_group.
__device__ inline void cpAsyncBulkTensorTileS2G(
    const CpAsyncBulkTensorTileS2GIndex<5>& dest,
    uint32_t smem_addr) {
  uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
  asm volatile(
      "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5, %6}], [%1];"
      :
      : "l"(gmem_int_desc),
        "r"(smem_addr),
        "r"(dest.crds[0]),
        "r"(dest.crds[1]),
        "r"(dest.crds[2]),
        "r"(dest.crds[3]),
        "r"(dest.crds[4])
      : "memory");
}
} // namespace Hopper
#endif // Arch 90
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
// Tuple of Welford avg, var and N parameters.
//
// Template parameters:
// - NumVals: Number of values in each of the avg/var/N tuples
// - DataTypeT: Type of avg and var
// - IndexTypeT: Type of N
// - MakeTuple: Template template parameter to define Tuple types
//   (e.g., MakeLocalTuple)
template <
    int NumVals,
    typename DataTypeT,
    typename IndexTypeT,
    template <int, typename> typename MakeTuple>
struct WelfordTripletTuple {
  static constexpr int num_vals = NumVals;
  using DataType = DataTypeT;
  using IndexType = IndexTypeT;
  using DataTuple = typename MakeTuple<NumVals, DataType>::type;
  using IndexTuple = typename MakeTuple<NumVals, IndexType>::type;
  // Running mean values
  DataTuple avg;
  // Running M2 / variance accumulators
  DataTuple var;
  // Element counts
  IndexTuple N;
  WelfordTripletTuple(
      const DataTuple& avg,
      const DataTuple& var,
      const IndexTuple& N)
      : avg(avg), var(var), N(N) {}
};
// WelfordTripletTuple aliases specialized by storage policy; the alias prefix
// matches the MakeTuple policy backing the avg/var/N tuples.
template <int NumVals, typename DataType, typename IndexType>
using LocalWelfordTripletTuple =
    WelfordTripletTuple<NumVals, DataType, IndexType, MakeLocalTuple>;
template <int NumVals, typename DataType, typename IndexType>
using RefWelfordTripletTuple =
    WelfordTripletTuple<NumVals, DataType, IndexType, MakeRefTuple>;
template <int NumVals, typename DataType, typename IndexType>
using ConstRefWelfordTripletTuple =
    WelfordTripletTuple<NumVals, DataType, IndexType, MakeConstRefTuple>;
template <int NumVals, typename DataTypeT, typename IndexTypeT>
using VolatilePtrWelfordTripletTuple =
    WelfordTripletTuple<NumVals, DataTypeT, IndexTypeT, MakeVolatilePtrTuple>;
// Advance pointer offsets of WelfordTripleTuple. Only valid when the
// values are pointer values (e.g., VolatilePtrWelfordTripletTuple); all
// three member tuples are advanced by the same offset.
template <typename WelfordTripletTupleType>
__inline__ __device__ static void operator+=(
    WelfordTripletTupleType& triplet,
    nvfuser_index_t offset) {
  triplet.avg += offset;
  triplet.var += offset;
  triplet.N += offset;
}
// Copy each of the triplet tuples (avg, var, N) from src at src_offset to
// dst at dst_offset.
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyWelfordTripletTuple(
    DstType& dst,
    nvfuser_index_t dst_offset,
    const SrcType& src,
    nvfuser_index_t src_offset = 0) {
  copyTuple(dst.avg, dst_offset, src.avg, src_offset);
  copyTuple(dst.var, dst_offset, src.var, src_offset);
  copyTuple(dst.N, dst_offset, src.N, src_offset);
}
// Convenience overload: copy each of the triplet tuples into dst at offset 0.
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyWelfordTripletTuple(
    DstType& dst,
    const SrcType& src,
    nvfuser_index_t src_offset = 0) {
  copyWelfordTripletTuple(dst, 0, src, src_offset);
}
// Predicated copy of each of the triplet tuples: each element is copied only
// where the corresponding predicate holds (per copyTupleIf semantics).
template <typename DstType, typename SrcType, typename PredType>
__inline__ __device__ static void copyWelfordTripletTupleIf(
    DstType& dst,
    const SrcType& src,
    const PredType& pred) {
  copyTupleIf(dst.avg, src.avg, pred);
  copyTupleIf(dst.var, src.var, pred);
  copyTupleIf(dst.N, src.N, pred);
}
} // namespace fused_reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
namespace impl {
//! Suppose f_i be the i-th function of the binary function
//! parameters. Call the function as: f_i(x, y).
//! Selection is done at compile time by peeling one function off the
//! parameter pack per recursion step.
template <int i, typename DataType, typename Func, typename... Funcs>
struct FuncSelector {
  static __device__ void call(
      DataType& x,
      const DataType y,
      Func f,
      Funcs... funcs) {
    // Here, i is guaranteed to be larger than 0 as there's a
    // specialization for i == 0 below. Recursively call FuncSelector
    // by dropping f and decrementing i.
    FuncSelector<i - 1, DataType, Funcs...>::call(x, y, funcs...);
  }
};
//! Specialization of FuncSelector when i == 0, so f_i is f. The remaining
//! pack (funcs) is intentionally unused.
template <typename DataType, typename Func, typename... Funcs>
struct FuncSelector<0, DataType, Func, Funcs...> {
  static __device__ void call(
      DataType& x,
      const DataType y,
      Func f,
      Funcs... funcs) {
    f(x, y);
  }
};
//! Call each of the first i+1 functions with the first i+1 values of
//! tuples. Here, i is guaranteed to be larger than -1 as there's a
//! specialization for i == -1.
template <int i, typename TupleType0, typename TupleType1, typename... Funcs>
struct FuncForEach {
  static __device__ void call(
      TupleType0& val0,
      nvfuser_index_t offset0,
      const TupleType1& val1,
      nvfuser_index_t offset1,
      Funcs... funcs) {
    // The i-th value types of the two tuples must agree for the i-th
    // function application to be well-formed.
    static_assert(
        IsSameType<
            typename TupleType0::template ValType<i>,
            typename TupleType1::template ValType<i>>::value,
        "Invalid tuple types");
    // Process the first i functions first.
    FuncForEach<i - 1, TupleType0, TupleType1, Funcs...>::call(
        val0, offset0, val1, offset1, funcs...);
    // Call the i+1-th function
    FuncSelector<i, typename TupleType0::template ValType<i>, Funcs...>::call(
        val0.val<i>(offset0), val1.val<i>(offset1), funcs...);
  }
};
//! Specialization of FuncForEach when i == -1, which means no
//! function to call. Just for stopping the recursive pattern here.
template <typename TupleType0, typename TupleType1, typename... Funcs>
struct FuncForEach<-1, TupleType0, TupleType1, Funcs...> {
  static __device__ void call(
      TupleType0& val0,
      nvfuser_index_t offset0,
      const TupleType1& val1,
      nvfuser_index_t offset1,
      Funcs... funcs) {}
};
//! Reduce one value of a tuple using one of the reduction ops. The
//! value at val_idx is reduced by the function at func_idx, i.e.,
//! f_{func_idx}(val0.val<val_idx>(offset0), val1.val<val_idx>(offset1)).
template <
    int func_idx,
    int val_idx,
    typename TupleType0,
    typename TupleType1,
    typename... Funcs>
__inline__ __device__ static void reduceVal(
    TupleType0& val0,
    nvfuser_index_t offset0,
    const TupleType1& val1,
    nvfuser_index_t offset1,
    Funcs... reduction_ops) {
  static_assert(
      IsSameType<
          typename TupleType0::template ValType<val_idx>,
          typename TupleType1::template ValType<val_idx>>::value,
      "Invalid tuple types");
  FuncSelector<
      func_idx,
      typename TupleType0::template ValType<val_idx>,
      Funcs...>::
      call(
          val0.val<val_idx>(offset0),
          val1.val<val_idx>(offset1),
          reduction_ops...);
}
//! Accumulate each value of a given pair of tuples using its corresponding
//! function. Suppose f_i be the i-th reduction function. Call f_i as:
//! f_i(val0.val<i>(offset0), val1.val<i>(offset1)).
template <typename TupleType0, typename TupleType1, typename... Funcs>
__inline__ __device__ static void reduceEach(
    TupleType0& val0,
    nvfuser_index_t offset0,
    const TupleType1& val1,
    nvfuser_index_t offset1,
    Funcs... reduction_ops) {
  constexpr int num_funcs = sizeof...(reduction_ops);
  FuncForEach<num_funcs - 1, TupleType0, TupleType1, Funcs...>::call(
      val0, offset0, val1, offset1, reduction_ops...);
}
// Primary template; only the 1/2/3-value specializations below are defined.
template <typename TupleType0, typename TupleType1, typename Func, int num_vals>
struct TupleReduce {};
// Single-value tuples: reduction_op takes (dst, src).
template <typename TupleType0, typename TupleType1, typename Func>
struct TupleReduce<TupleType0, TupleType1, Func, 1> {
  __inline__ __device__ static void reduce(
      TupleType0& val0,
      nvfuser_index_t offset0,
      const TupleType1& val1,
      nvfuser_index_t offset1,
      Func reduction_op) {
    static_assert(
        IsSameType<
            typename TupleType0::ValTypes,
            typename TupleType1::ValTypes>::value,
        "Invalid value types");
    reduction_op(val0.val<0>(offset0), val1.val<0>(offset1));
  }
};
// Two-value tuples: reduction_op takes all of val0's values followed by all
// of val1's values (dst0, dst1, src0, src1).
template <typename TupleType0, typename TupleType1, typename Func>
struct TupleReduce<TupleType0, TupleType1, Func, 2> {
  __inline__ __device__ static void reduce(
      TupleType0& val0,
      nvfuser_index_t offset0,
      const TupleType1& val1,
      nvfuser_index_t offset1,
      Func reduction_op) {
    static_assert(
        IsSameType<
            typename TupleType0::ValTypes,
            typename TupleType1::ValTypes>::value,
        "Invalid value types");
    reduction_op(
        val0.val<0>(offset0),
        val0.val<1>(offset0),
        val1.val<0>(offset1),
        val1.val<1>(offset1));
  }
};
// Three-value tuples: reduction_op takes all of val0's values followed by
// all of val1's values (dst0, dst1, dst2, src0, src1, src2).
template <typename TupleType0, typename TupleType1, typename Func>
struct TupleReduce<TupleType0, TupleType1, Func, 3> {
  __inline__ __device__ static void reduce(
      TupleType0& val0,
      nvfuser_index_t offset0,
      const TupleType1& val1,
      nvfuser_index_t offset1,
      Func reduction_op) {
    static_assert(
        IsSameType<
            typename TupleType0::ValTypes,
            typename TupleType1::ValTypes>::value,
        "Invalid value types");
    reduction_op(
        val0.val<0>(offset0),
        val0.val<1>(offset0),
        val0.val<2>(offset0),
        val1.val<0>(offset1),
        val1.val<1>(offset1),
        val1.val<2>(offset1));
  }
};
//! Reduce all values of a tuple together. The reduction function must
//! have the same number of inputs as the number of values of each tuple.
//! Dispatches to the TupleReduce specialization for the tuple arity.
template <typename TupleType0, typename TupleType1, typename Func>
__inline__ __device__ void reduceTuple(
    TupleType0& val0,
    nvfuser_index_t offset0,
    const TupleType1& val1,
    nvfuser_index_t offset1,
    Func reduction_op) {
  static_assert(
      TupleType0::num_vals == TupleType1::num_vals, "Invalid number of values");
  TupleReduce<TupleType0, TupleType1, Func, TupleType0::num_vals>::reduce(
      val0, offset0, val1, offset1, reduction_op);
}
// Reduces all of the first (idx+1) values by a thread block.
//
// Recursively reduces tuple values 0..idx-1 first, then reduces value idx
// in shared memory using a power-of-two tree. When BROADCAST, the block
// result for value idx is written back to the segment's first smem slot and
// read by all threads; otherwise only the thread with has_block_result gets
// the final value. shared_mem must provide at least
// (reduction_idx range) * num_threads_per_reduction elements of the largest
// value type being reduced.
template <
    int idx,
    bool BROADCAST,
    bool FORWARD_PROTECT_SMEM,
    bool Aligned,
    typename LocalTupleT,
    typename BlockDimT,
    typename... Funcs>
struct BlockReduceEach {
  __inline__ __device__ static void reduce(
      LocalTupleT& block_result,
      const LocalTupleT& partial_result,
      void* shared_mem,
      bool has_block_result,
      int tid_in_reduction,
      int num_threads_per_reduction,
      int num_elements_per_reduction,
      int reduction_idx,
      // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
      // there is no warp specialization in the kernel. If there is warp
      // specialization, block_dim is the the dimension of the compute warps.
      BlockDimT block_dim,
      Funcs... funcs) {
    // Finish the reduction of each tuple value with a smaller offset
    // (FORWARD_PROTECT_SMEM forced true so the recursive step's trailing
    // sync protects this step's smem reuse).
    BlockReduceEach<
        idx - 1,
        BROADCAST,
        true,
        Aligned,
        LocalTupleT,
        BlockDimT,
        Funcs...>::
        reduce(
            block_result,
            partial_result,
            shared_mem,
            has_block_result,
            tid_in_reduction,
            num_threads_per_reduction,
            num_elements_per_reduction,
            reduction_idx,
            block_dim,
            funcs...);
    if (num_elements_per_reduction == 1) {
      if (has_block_result) {
        block_result.val<idx>(0) = partial_result.val<idx>(0);
      }
      return;
    }
    using DataType = typename LocalTupleT::template ValType<idx>;
    PtrTuple<DataType> shared_buf(static_cast<DataType*>(shared_mem));
    LocalTuple<DataType> block_result_i(partial_result.val<idx>(0));
    const auto smem_offset =
        reduction_idx * num_threads_per_reduction + tid_in_reduction;
    // np2: largest power of two <= num_elements_per_reduction.
    const int np2 = 1 << (31 - __clz(num_elements_per_reduction));
    // Threads values are initialized, so all can participate here
    if (tid_in_reduction >= np2) {
      copyTuple(shared_buf, smem_offset, block_result_i);
    }
    block_sync::sync<Aligned>(block_dim);
    if (tid_in_reduction < np2 &&
        tid_in_reduction + np2 < num_elements_per_reduction) {
      impl::reduceVal<idx, 0>(
          block_result_i, 0, shared_buf, smem_offset + np2, funcs...);
    }
    if (tid_in_reduction < np2) {
      copyTuple(shared_buf, smem_offset, block_result_i);
    }
    // Always sync when communicating across smem
    block_sync::sync<Aligned>(block_dim);
    // Reduce down to 2 values, last thread will do the final reduction and
    // can save a syncthreads this way
    for (int factor = np2 / 2; factor > 1; factor >>= 1) {
      if (tid_in_reduction < factor) {
        impl::reduceVal<idx, 0>(
            shared_buf,
            smem_offset,
            shared_buf,
            smem_offset + factor,
            funcs...);
      }
      block_sync::sync<Aligned>(block_dim);
    }
    copyTuple(block_result_i, shared_buf, smem_offset);
    // Do the last reduction
    if (has_block_result) {
      impl::reduceVal<idx, 0>(
          block_result_i, 0, shared_buf, smem_offset + 1, funcs...);
    }
    if (BROADCAST) {
      if (has_block_result) {
        // Put result back in shared memory, put in the first entry of the
        // reduction segment's buffer
        copyTuple(
            shared_buf,
            reduction_idx * num_threads_per_reduction,
            block_result_i);
      }
      // Sync threads to make sure result is in smem
      block_sync::sync<Aligned>(block_dim);
      copyTuple(
          block_result_i,
          shared_buf,
          reduction_idx * num_threads_per_reduction);
    }
    block_result.val<idx>(0) = block_result_i.val<0>(0);
    if (FORWARD_PROTECT_SMEM) {
      block_sync::sync<Aligned>(block_dim);
    }
  }
};
// Specialization for idx == -1, i.e., no value to reduce. Terminates the
// recursion of BlockReduceEach; intentionally a no-op.
template <
    bool BROADCAST,
    bool FORWARD_PROTECT_SMEM,
    bool Aligned,
    typename LocalTupleT,
    typename BlockDimT,
    typename... Funcs>
struct BlockReduceEach<
    -1,
    BROADCAST,
    FORWARD_PROTECT_SMEM,
    Aligned,
    LocalTupleT,
    BlockDimT,
    Funcs...> {
  __inline__ __device__ static void reduce(
      LocalTupleT& block_result,
      const LocalTupleT& partial_result,
      void* shared_mem,
      bool has_block_result,
      int tid_in_reduction,
      int num_threads_per_reduction,
      int num_elements_per_reduction,
      int reduction_idx,
      // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
      // there is no warp specialization in the kernel. If there is warp
      // specialization, block_dim is the the dimension of the compute warps.
      BlockDimT block_dim,
      Funcs... funcs) {}
};
//! Reduce each value of a tuple by a thread block.
//!
//! Entry point that kicks off the BlockReduceEach recursion starting from
//! the last tuple value. The final result is broadcast when BROADCAST is
//! true.
//!
//! \param block_result result of the block reduction
//! \param partial_result Per-thread input tuple
//! \param shared_mem scratch buffer shared by the block
//! \param has_block_result whether this thread holds the final result
//! \param tid_in_reduction this thread's index within its reduction segment
//! \param num_threads_per_reduction threads per reduction segment
//! \param num_elements_per_reduction elements reduced per segment
//! \param reduction_idx which reduction segment this thread belongs to
//! \param reduction_ops one reduction functor per tuple value
template <
    bool BROADCAST,
    bool FORWARD_PROTECT_SMEM,
    bool Aligned,
    typename LocalTupleT,
    typename BlockDimT,
    typename... Funcs>
__inline__ __device__ void blockReduceEach(
    LocalTupleT& block_result,
    const LocalTupleT& partial_result,
    void* shared_mem,
    bool has_block_result,
    int tid_in_reduction,
    int num_threads_per_reduction,
    int num_elements_per_reduction,
    int reduction_idx,
    // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
    // there is no warp specialization in the kernel. If there is warp
    // specialization, block_dim is the the dimension of the compute warps.
    BlockDimT block_dim,
    Funcs... reduction_ops) {
  BlockReduceEach<
      LocalTupleT::num_vals - 1,
      BROADCAST,
      FORWARD_PROTECT_SMEM,
      Aligned,
      LocalTupleT,
      BlockDimT,
      Funcs...>::
      reduce(
          block_result,
          partial_result,
          shared_mem,
          has_block_result,
          tid_in_reduction,
          num_threads_per_reduction,
          num_elements_per_reduction,
          reduction_idx,
          block_dim,
          reduction_ops...);
}
} // namespace impl
// We have 6 dimensions, 3 in the grid, 3 in the block
// They can be 1 of 3 states,
// Reduction Domain - TEMPLATE STATE 0
// - Participating in the reduction, has values coming in, one value coming
// out across the dimension
// Iteration Domain - TEMPLATE STATE 1
// - Not participating in the reduction, has values across the dimension after
// the reduction
// Collapsed Domain - TEMPLATE STATE 2
// - Previously reduced, doesn't need to be reduced on that dimension, doesn't
// have values across that dimension
// True when the dimension state is TEMPLATE STATE 0 (reduction domain).
constexpr __device__ bool isReduce(int STATE) {
  constexpr int kReduceState = 0;
  return STATE == kReduceState;
}
// True when the dimension state is TEMPLATE STATE 1 (iteration domain).
constexpr __device__ bool isIter(int STATE) {
  constexpr int kIterState = 1;
  return STATE == kIterState;
}
// True when the dimension state is TEMPLATE STATE 2 (collapsed/predicated
// domain).
constexpr __device__ bool isPred(int STATE) {
  constexpr int kPredState = 2;
  return STATE == kPredState;
}
// True when the dimension state is 3 (inactive).
constexpr __device__ bool inactive(int STATE) {
  constexpr int kInactiveState = 3;
  return STATE == kInactiveState;
}
// True for any state that is neither inactive (3) nor iteration (1).
constexpr __device__ bool activeNotIter(int STATE) {
  return !(STATE == 3 || STATE == 1);
}
// True for reduction (0) or iteration (1) states.
constexpr __device__ bool isReduceOrIter(int STATE) {
  return STATE == 0 || STATE == 1;
}
// When generating an index into the reduction, we have to stride by iteration
// domains and reduction domains. Collapsed domains we can ignore, but we need
// to make sure they never read or write (need to be predicated to correct
// participation).
// All inclusive reduction with option to re-broadcast. This reduction class
// does not use predication of parallelization in the read or write predicates.
// Instead there are 3 states each dimension of parallelization can have,
// described above. Predication, indexing, and reduction will be done based on
// this information.
// Parallel reduction helper for generated kernels. The six int template
// parameters encode the state (see the state table above) of each of the
// three grid and three block dimensions. PERSISTENT_REDUCTION keeps all
// blocks resident and ping-pongs between two halves of the global work
// buffer so a single grid sync suffices per iteration; BROADCAST
// additionally delivers the final result to every participating thread
// (and requires PERSISTENT_REDUCTION).
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
class ParallelReduce {
static_assert(
!BROADCAST || PERSISTENT_REDUCTION,
"Broadcast requires persistent reduction");
// True when any thread (block-local) dimension is a reduction dimension
static constexpr bool BLOCK_REDUCE =
isReduce(X_THREAD) || isReduce(Y_THREAD) || isReduce(Z_THREAD);
// True when any block (grid) dimension is a reduction dimension
static constexpr bool GRID_REDUCE =
isReduce(X_BLOCK) || isReduce(Y_BLOCK) || isReduce(Z_BLOCK);
// ping-pong between global buffers to avoid a second sync
bool flip = false;
public:
__device__ ParallelReduce() {}
// Reduces the whole tuple with a single reduction_op. reduceGroup does
// not support Welford-style reductions that reduce all values of a
// tuple together (its ops act on each value independently), so this is
// the only entry point for Welford for now.
template <bool Aligned, typename Func, typename BlockDimT, typename... Types>
__device__ __inline__ void reduce(
RefTuple<Types...> out,
const ConstRefTuple<Types...>& inp,
VolatilePtrTuple<Types...> global_work_buffer,
int64_t* global_sync_buffer, // Allocated as product of all
// non-participating Grid dimension
PtrTuple<Types...> shared_buf,
bool read_pred, // Prevent reading from out of bounds memory
bool write_pred, // Prevent from writing out of bounds
const LocalTuple<Types...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
Func reduction_op);
//! Profiled version; also accumulates elapsed cycles and a call count.
template <bool Aligned, typename Func, typename BlockDimT, typename... Types>
__device__ __inline__ void reduce(
RefTuple<Types...> out,
const ConstRefTuple<Types...>& inp,
VolatilePtrTuple<Types...> global_work_buffer,
int64_t* global_sync_buffer, // Allocated as product of all
// non-participating Grid dimension
PtrTuple<Types...> shared_buf,
bool read_pred, // Prevent reading from out of bounds memory
bool write_pred, // Prevent from writing out of bounds
const LocalTuple<Types...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
Func reduction_op,
int64_t& cycles,
int64_t& count);
//! Each value of a tuple is independently reduced by the
//! corresponding reduction op. Thus, Welford-like reductions are
//! not supported by this interface.
//!
//! Note that out is purely used as the output parameter, and its
//! initial value is not used but just overwritten. Since grid
//! reductions do not allow serial reduction IterDomains, there is
//! no need to accumulate into the out parameter.
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ void reduceGroup(
RefTuple<DataTypes...> out,
const ConstRefTuple<DataTypes...>& inp,
VolatilePtrTuple<DataTypes...> global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
int64_t* global_sync_buffer,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
const LocalTuple<BoolTypes...>& write_preds,
Funcs... funcs);
//! Profiled version; also accumulates elapsed cycles and a call count.
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ void reduceGroup(
RefTuple<DataTypes...> out,
const ConstRefTuple<DataTypes...>& inp,
VolatilePtrTuple<DataTypes...> global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
int64_t* global_sync_buffer,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
const LocalTuple<BoolTypes...>& write_preds,
int64_t& cycles,
int64_t& count,
Funcs... funcs);
// User-visible entry point of grouped grid welford +
// broadcast. Mostly the same as reduceGroup, and it would be
// possible to combine this to reduceGroup, but it might make the
// templated data structures even more complicated and difficult to
// understand. For now, keep it as a separate function.
//
// Unlike reduceGroup, though, the data types of welford ops must be
// the same. For example, reduceGroup can be used to reduce half and
// float values by passing a tuple of, e.g., LocalTuple<half,
// float>, but that's not supported here for implementation
// simplicity. In practice, it should be really uncommon to group
// welford ops with different data types, so this restriction
// shouldn't be an issue.
template <
bool Aligned,
int NumArgs,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ void welfordGroup(
typename MakeRefTuple<NumArgs, DataType>::type out_avg,
typename MakeRefTuple<NumArgs, DataType>::type out_var,
typename MakeRefTuple<NumArgs, IndexType>::type out_N,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_avg,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_var,
const typename MakeConstRefTuple<NumArgs, IndexType>::type& inp_N,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_avg,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_var,
const typename MakeLocalTuple<NumArgs, IndexType>::type& init_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_avg,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_var,
typename MakeVolatilePtrTuple<NumArgs, IndexType>::type
global_work_buffer_N,
int64_t* global_sync_buffer,
PtrTuple<DataType, DataType, IndexType> shared_buf,
const typename MakeLocalTuple<NumArgs, bool>::type& read_preds,
const typename MakeLocalTuple<NumArgs, bool>::type& write_preds);
//! Profiled version; also accumulates elapsed cycles and a call count.
template <
bool Aligned,
int NumArgs,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ void welfordGroup(
typename MakeRefTuple<NumArgs, DataType>::type out_avg,
typename MakeRefTuple<NumArgs, DataType>::type out_var,
typename MakeRefTuple<NumArgs, IndexType>::type out_N,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_avg,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_var,
const typename MakeConstRefTuple<NumArgs, IndexType>::type& inp_N,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_avg,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_var,
const typename MakeLocalTuple<NumArgs, IndexType>::type& init_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_avg,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_var,
typename MakeVolatilePtrTuple<NumArgs, IndexType>::type
global_work_buffer_N,
int64_t* global_sync_buffer,
PtrTuple<DataType, DataType, IndexType> shared_buf,
const typename MakeLocalTuple<NumArgs, bool>::type& read_preds,
const typename MakeLocalTuple<NumArgs, bool>::type& write_preds,
int64_t& cycles,
int64_t& count);
// This is highly specific to the outer-reduction pattern. All the
// assumptions should be asserted with static_assert at the beginning of
// the function.
template <
bool Aligned,
int NumVals,
typename DataType,
int BDIMX,
int BDIMY,
typename BlockDimT>
__device__ __inline__ void welfordGroupOuter(
DataType out_avg[NumVals],
DataType out_var[NumVals],
nvfuser_index_t out_N[NumVals],
const DataType in_avg[NumVals],
const DataType in_var[NumVals],
nvfuser_index_t in_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
DataType* global_buf_avg,
DataType* global_buf_var,
nvfuser_index_t* global_buf_N,
DataType* shared_buf,
int64_t* global_sync_buffer);
// Profiled version; also accumulates elapsed cycles and a call count.
template <
bool Aligned,
int NumVals,
typename DataType,
int BDIMX,
int BDIMY,
typename BlockDimT>
__device__ __inline__ void welfordGroupOuter(
DataType out_avg[NumVals],
DataType out_var[NumVals],
nvfuser_index_t out_N[NumVals],
const DataType in_avg[NumVals],
const DataType in_var[NumVals],
nvfuser_index_t in_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
DataType* global_buf_avg,
DataType* global_buf_var,
nvfuser_index_t* global_buf_N,
DataType* shared_buf,
int64_t* global_sync_buffer,
int64_t& cycles,
int64_t& count);
private:
//! True for the block that should own the final cleanup: the last block
//! along every reduce/iter dimension and the zero block along every
//! other (non-participating) dimension.
__device__ static bool isLastBlockInGrid() {
return index_utils::maskedIsLast<
isReduceOrIter(X_BLOCK),
isReduceOrIter(Y_BLOCK),
isReduceOrIter(Z_BLOCK)>(blockIdx, gridDim) &&
index_utils::maskedIsZero<
!isReduceOrIter(X_BLOCK),
!isReduceOrIter(Y_BLOCK),
!isReduceOrIter(Z_BLOCK)>(blockIdx);
}
//! Initial per-CTA reduction of each value of a tuple. Each value
//! is reduced individually, so the shared memory buffer just needs
//! to be large enough for each value. NOTE that the smem buffer is
//! not forward protected.
template <
bool BLOCK_BROADCAST,
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ static LocalTuple<DataTypes...> reduceGroupBlock(
const ConstRefTuple<DataTypes...>& inp,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
bool block_reduce_participate,
Funcs... funcs);
//! Final reduction of partial results. Done by all blocks
//! redundantly when BROADCAST is true, or just one block otherwise.
//! The smem buffer is assumed synchronized when it is passed in,
//! but it isn't synchronized when returning from this function.
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ static void reduceGroupLastBlock(
RefTuple<DataTypes...>& out,
const VolatilePtrTuple<DataTypes...>& global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
void* shared_mem,
nvfuser_index_t block_red_idx_offset,
nvfuser_index_t num_thread_iters,
nvfuser_index_t num_block_iters,
nvfuser_index_t thread_red_idx_offset,
nvfuser_index_t grid_red_size,
const LocalTuple<BoolTypes...>& write_preds,
bool block_reduce_participate,
bool grid_reduce_participate,
Funcs... reduction_ops);
//! Welford version of reduceGroupBlock
template <
bool BLOCK_BROADCAST,
bool Aligned,
int NumVals,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ static void welfordGroupBlock(
LocalWelfordTripletTuple<NumVals, DataType, IndexType>& block_result,
const ConstRefWelfordTripletTuple<NumVals, DataType, IndexType>& inp,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
PtrTuple<DataType, DataType, IndexType> shared_buf,
const typename MakeLocalTuple<NumVals, bool>::type& read_preds,
bool block_reduce_participate);
//! Welford version of reduceGroupLastBlock
template <
bool Aligned,
int NumVals,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ static void welfordGroupLastBlock(
RefWelfordTripletTuple<NumVals, DataType, IndexType>& out,
const VolatilePtrWelfordTripletTuple<NumVals, DataType, IndexType>&
global_work_buffer,
const LocalWelfordTripletTuple<NumVals, DataType, IndexType>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
PtrTuple<DataType, DataType, IndexType> shared_buf,
nvfuser_index_t block_red_idx_offset,
nvfuser_index_t num_thread_iters,
nvfuser_index_t num_block_iters,
nvfuser_index_t thread_red_idx_offset,
nvfuser_index_t grid_red_size,
const typename MakeLocalTuple<NumVals, bool>::type& write_preds,
bool block_reduce_participate,
bool grid_reduce_participate);
// End Parallel reduce class
};
// Reduces a tuple of values across the participating block and grid
// dimensions with a single reduction_op. First performs a tree reduction
// in shared memory within the block (if any thread dimension is a
// reduction dimension), then a grid reduction through the global work
// buffer (if any block dimension is a reduction dimension), optionally
// broadcasting the result back to all participating threads.
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <bool Aligned, typename Func, typename BlockDimT, typename... Types>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
reduce(
RefTuple<Types...> out,
const ConstRefTuple<Types...>& inp,
VolatilePtrTuple<Types...> global_work_buffer,
int64_t* global_sync_buffer, // Allocated as product of all
// non-participating Grid dimension
PtrTuple<Types...> shared_buf,
bool read_pred, // Prevent reading from out of bounds memory
bool write_pred, // Prevent from writing out of bounds
const LocalTuple<Types...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
Func reduction_op) {
// If no reduction needed, just return input
if (!BLOCK_REDUCE && !GRID_REDUCE) {
if (read_pred && write_pred) {
out = inp;
}
return;
}
// Don't read/write in temporary buffers if in a predicated dimension
bool block_reduce_participate = index_utils::
maskedIsZero<isPred(X_THREAD), isPred(Y_THREAD), isPred(Z_THREAD)>(
threadIdx);
// Initialize block result
LocalTuple<Types...> block_result = init_val;
// Grab input data if participating in the reduction, set to block_result in
// the case there is no block reduction
if (block_reduce_participate && read_pred) {
block_result = inp;
}
// Only threads with id == 0 in the dimensions being reduced will
// have a valid result
bool has_block_result = index_utils::
maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
threadIdx);
if (BLOCK_REDUCE) {
// -- START BLOCK REDUCTION -- //
// Size of the block reduction segment, can be an int since it's limited
// to number of threads
int block_reduction_size = index_utils::
maskedSize<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
block_dim);
// Index in the reduction segment, can be an int since it's limited to
// number of threads
int tid_in_block_reduction = index_utils::maskedOffset<
isReduce(X_THREAD),
isReduce(Y_THREAD),
isReduce(Z_THREAD)>(threadIdx, block_dim);
// ID of the block reduction this thread is participating in
//
// If any of the parallel dimensions are predicated out, that means
// they've already been reduced, so we only care about the first thread in
// that dimension. Therefore don't expand the reduction_idx by that
// dimension
int block_reduction_idx = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// Shared memory buffer is 2D
// [iter dimension, reduction dimension]
// Offset into smem for the current thread
int block_reduce_smem_offset =
block_reduction_idx * block_reduction_size + tid_in_block_reduction;
// Initialize shared memory
if (block_reduce_participate) {
copyTuple(shared_buf, block_reduce_smem_offset, block_result);
}
// Sync to make sure smem is completely initialized
block_sync::sync<Aligned>(block_dim);
// Round reduction size down to nearest power of 2
int np2 = 1 << (31 - __clz(block_reduction_size));
// Perform an initial reduction leaving np2 elements
if (block_reduce_participate && tid_in_block_reduction < np2 &&
tid_in_block_reduction + np2 < block_reduction_size) {
impl::reduceTuple(
shared_buf,
block_reduce_smem_offset,
shared_buf,
block_reduce_smem_offset + np2,
reduction_op);
}
// Always need to sync while operating on shared memory
block_sync::sync<Aligned>(block_dim);
// Reduce down until 2 values, leaving 2 values allows us to manually
// perform the last reduction and avoid a syncthreads
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (tid_in_block_reduction < factor && block_reduce_participate) {
impl::reduceTuple(
shared_buf,
block_reduce_smem_offset,
shared_buf,
block_reduce_smem_offset + factor,
reduction_op);
}
block_sync::sync<Aligned>(block_dim);
}
// Accumulate that last valid result
if (has_block_result) {
copyTuple(block_result, shared_buf, block_reduce_smem_offset);
if (block_reduction_size > 1) {
impl::reduceTuple(
block_result,
0,
shared_buf,
block_reduce_smem_offset + 1,
reduction_op);
}
}
// ===== BLOCK REDUCTION CLEANUP =======
if (!GRID_REDUCE) {
// If no grid reduction, we don't have to continue. Either broadcast
// back across the block or return the correct reduction
if (has_block_result && write_pred) {
// Accumulate the existing value of out into the block result before
// writing back, so serial (multi-pass) reductions into out work
impl::reduceTuple(block_result, 0, out, 0, reduction_op);
out = block_result;
}
if (BROADCAST) {
// No grid reduce, but need to broadcast, perform block broadcast
if (has_block_result && write_pred) {
// Put result back in shared memory, put in the first entry of the
// reduction segment's buffer
copyTuple(
shared_buf,
block_reduction_idx * block_reduction_size,
block_result);
}
// Sync threads to make sure result is in smem
block_sync::sync<Aligned>(block_dim);
// If the thread is participating, and is not attempting to write out
// of bounds, return the broadcasted value.
if (block_reduce_participate && write_pred) {
copyTuple(
out, shared_buf, block_reduction_idx * block_reduction_size);
}
}
// Forward protect shared memory, don't want threads to continue to
// another reduction/broadcast and pollute shared memory before the
// reduction is completely finished.
//
// This could be avoided in some cases if we added thread syncs from
// block reductions in the syncthread insertion pass.
block_sync::sync<Aligned>(block_dim);
return;
}
}
// -- START GRID REDUCTION -- //
// Grid reductions are more challenging for two reasons, (1) the reduction
// itself is 3D instead of 2D because we now have an iter domain space in
// the grid dimension. (2) a tree reduction isn't performed, instead all
// blocks will populate GMEM and one block will finish the grid reduction.
// What is the grid reduction size, block reduction already performed so
// that doesn't have to be taken into consideration
const auto grid_red_size = index_utils::
maskedSize<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
gridDim);
// Which ID in the reduction is this block. Threads can participate in
// multiple grid reductions, but the block will have the same relative index
// in those reductions
const auto idx_in_grid_red = index_utils::
maskedOffset<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
// For persistent reductions, alternate between the two halves of the
// doubled global buffer each call so the grid sync of one iteration
// protects the other half (see the comment near the end of this function)
if (PERSISTENT_REDUCTION && flip) {
auto global_buffer_size =
index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
gridDim) *
index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim) *
grid_red_size;
global_work_buffer += global_buffer_size;
}
flip = !flip;
// How many grid reductions have to be performed, in the grid dimension
const auto num_block_iters = index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(gridDim);
// Which grid reduction does this block participate in, in the grid
// dimension
const auto block_red_idx_offset = index_utils::
maskedOffset<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
blockIdx, gridDim);
// How many grid reductions have to be performed, in the block dimension
const auto num_thread_iters = index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim);
// Which grid reduction does this thread participate in, in the block
// dimension
const auto thread_red_idx_offset = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// 3D buffer of reductions:
// [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
// Offset into the work buffer
const auto work_buf_offset =
(idx_in_grid_red * num_block_iters + block_red_idx_offset) *
num_thread_iters +
thread_red_idx_offset;
// Don't read/write in temporary buffers if in a predicated dimension
bool grid_reduce_participate = index_utils::
maskedIsZero<isPred(X_BLOCK), isPred(Y_BLOCK), isPred(Z_BLOCK)>(blockIdx);
if (grid_reduce_participate && block_reduce_participate) {
if (has_block_result) {
copyTuple(global_work_buffer, work_buf_offset, block_result);
}
}
// -- GLOBAL BUFFER FILLED -- //
bool last_block = index_utils::
maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
if (grid_reduce_participate) {
// Don't need to sync up blocks that are not participating in this
// reduction
grid_sync::sync<
isReduce(X_BLOCK),
isReduce(Y_BLOCK),
isReduce(Z_BLOCK),
PERSISTENT_REDUCTION,
Aligned>(
global_sync_buffer[block_red_idx_offset],
grid_red_size,
last_block,
block_dim);
}
// -- START BLOCK CLEANUP -- //
// All blocks perform the last cleanup, so every block, and every thread
// will have the final result
// Initialize block result
LocalTuple<Types...> last_block_result(init_val);
if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) {
// Can use the last block to reduce all the values the blocks filled in.
// Can use any thread that has been predicated, or has been reduced to do
// this reduction, cannot use any block that's associated with an
// iteration domain
// Start with non-block reduction
// Index in the reduction segment
int tid_in_block_reduction_2 = index_utils::maskedOffset<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(threadIdx, block_dim);
int block_reduction_size_2 = index_utils::maskedSize<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(block_dim);
// 3D buffer of reductions:
// [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
// Change the offset, we want to keep the last two dimensions, but the
// first dimension is what we will reduce over
const auto work_buf_offset_2 =
block_red_idx_offset * num_thread_iters + thread_red_idx_offset;
for (auto reduction_i = tid_in_block_reduction_2;
reduction_i < grid_red_size;
reduction_i += block_reduction_size_2) {
impl::reduceTuple(
last_block_result,
0,
global_work_buffer,
work_buf_offset_2 +
reduction_i * num_block_iters *
num_thread_iters, // Iterating over the outer most
// dimension, so need to stride by the
// total number of grid reductions. Could
// come back and change it so this is the
// contiguous dimension
reduction_op);
}
// -- START LAST BLOCK - BLOCK REDUCTION -- //
// Reduced so we have one value per thread, we need to further reduce any
// dimension that is not an iter dimension
// Which block reduction this thread is participating in
int block_reduction_idx = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// Offset in smem for this thread's result
auto smem_offset =
block_reduction_idx * block_reduction_size_2 + tid_in_block_reduction_2;
// Similar as before, reduce down to nearest power of 2 so we can do a
// tree reduction
int np2 = 1 << (31 - __clz(min(block_reduction_size_2, grid_red_size)));
// Threads values are initialized, so all can participate here
if (tid_in_block_reduction_2 >= np2) {
copyTuple(shared_buf, smem_offset, last_block_result);
}
block_sync::sync<Aligned>(block_dim);
if (tid_in_block_reduction_2 < np2 &&
tid_in_block_reduction_2 + np2 <
min(block_reduction_size_2, grid_red_size)) {
impl::reduceTuple(
last_block_result, 0, shared_buf, smem_offset + np2, reduction_op);
}
if (tid_in_block_reduction_2 < np2) {
copyTuple(shared_buf, smem_offset, last_block_result);
}
// Always sync when communicating across smem
block_sync::sync<Aligned>(block_dim);
// Reduce down to 2 values, last thread will do the final reduction and
// can save a syncthreads this way
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (tid_in_block_reduction_2 < factor) {
impl::reduceTuple(
shared_buf,
smem_offset,
shared_buf,
smem_offset + factor,
reduction_op);
}
block_sync::sync<Aligned>(block_dim);
}
// If this thread in each block has the final result before broadcasting
// to all other threads in block
bool has_block_result_2 = index_utils::maskedIsZero<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(threadIdx);
// Do the last reduction; writes to out below are guarded by the write
// predicate
copyTuple(last_block_result, shared_buf, smem_offset);
if (has_block_result && grid_reduce_participate) {
// Accumulate the existing value of out into the final result
impl::reduceTuple(last_block_result, 0, out, 0, reduction_op);
if (min(block_reduction_size_2, grid_red_size) > 1) {
impl::reduceTuple(
last_block_result, 0, shared_buf, smem_offset + 1, reduction_op);
}
}
if (grid_reduce_participate && PERSISTENT_REDUCTION) {
// If persistent reduction, always broadcast reduced values
copyTuple(shared_buf, smem_offset, last_block_result);
block_sync::sync<Aligned>(block_dim);
if (write_pred && block_reduce_participate) {
copyTuple(
out, shared_buf, block_reduction_idx * block_reduction_size_2);
}
// For persistent kernels we double the global buffer allocation so we
// don't need to protect those buffers every iteration preventing the
// need of an additional grid_sync. Since we flip back and forth between
// sections of the buffer, the one grid sync protects the other part of
// the buffer.
} else {
if (grid_reduce_participate) {
if (last_block && has_block_result && block_reduce_participate &&
write_pred) {
copyTuple(
out, shared_buf, block_reduction_idx * block_reduction_size_2);
}
}
}
// Forward protect the smem used in this reduction
block_sync::sync<Aligned>(block_dim);
}
}
//! Profiled version
//! Profiled variant of reduce. Forwards to the non-profiled overload and,
//! on a single representative thread (thread zero of the last
//! participating block), accumulates the elapsed cycle count into
//! `cycles` and bumps `count` once per invocation.
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <bool Aligned, typename Func, typename BlockDimT, typename... Types>
__device__ __inline__ void ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    reduce(
        RefTuple<Types...> out,
        const ConstRefTuple<Types...>& inp,
        VolatilePtrTuple<Types...> global_work_buffer,
        int64_t* global_sync_buffer, // Allocated as product of all
                                     // non-participating Grid dimension
        PtrTuple<Types...> shared_buf,
        bool read_pred, // Prevent reading from out of bounds memory
        bool write_pred, // Prevent from writing out of bounds
        const LocalTuple<Types...>& init_val,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim)
        // if there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the dimension of the compute warps.
        BlockDimT block_dim,
        Func reduction_op,
        int64_t& cycles,
        int64_t& count) {
  // blockIdx and threadIdx are invariant for the duration of the kernel,
  // so the timing-thread predicate can be evaluated just once.
  const bool is_timing_thread = isLastBlockInGrid() &&
      index_utils::maskedIsZero<true, true, true>(threadIdx);
  int64_t start_counter = 0;
  if (is_timing_thread) {
    start_counter = readCycleCounter();
  }
  reduce<Aligned>(
      out,
      inp,
      global_work_buffer,
      global_sync_buffer,
      shared_buf,
      read_pred,
      write_pred,
      init_val,
      block_dim,
      reduction_op);
  if (is_timing_thread) {
    cycles += readCycleCounter() - start_counter;
    ++count;
  }
}
// Grouped reduction: each value of the tuple is reduced independently by
// its corresponding func. Performs the per-block reduction via
// reduceGroupBlock, writes per-block partials to the global work buffer,
// grid-syncs, then finishes in reduceGroupLastBlock (all blocks when
// BROADCAST, otherwise just the last one).
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
reduceGroup(
RefTuple<DataTypes...> out,
const ConstRefTuple<DataTypes...>& inp,
VolatilePtrTuple<DataTypes...> global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the dimension of the compute warps.
BlockDimT block_dim,
int64_t* global_sync_buffer,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
const LocalTuple<BoolTypes...>& write_preds,
Funcs... funcs) {
static_assert(
sizeof...(DataTypes) == sizeof...(Funcs),
"Mismatched number of Tuple values and functions");
static_assert(
sizeof...(DataTypes) == sizeof...(BoolTypes),
"Mismatched number of Tuple values and predicate values");
// If no reduction needed, just return input
if (!BLOCK_REDUCE && !GRID_REDUCE) {
copyTupleIf(out, inp, read_preds && write_preds);
return;
}
// Don't read/write in temporary buffers if in a predicated dimension
const bool block_reduce_participate = index_utils::
maskedIsZero<isPred(X_THREAD), isPred(Y_THREAD), isPred(Z_THREAD)>(
threadIdx);
// Only threads with id == 0 in the dimensions being reduced will
// have a valid result
const bool has_block_result = index_utils::
maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
threadIdx);
// Initial per-block reduction. Result is broadcast if specified
// and this call is block reduction only.
const auto block_result = reduceGroupBlock < !GRID_REDUCE && BROADCAST,
Aligned > (inp,
init_val,
block_dim,
shared_mem,
read_preds,
block_reduce_participate,
funcs...);
// If block reduction only, save to out and exit
if (!GRID_REDUCE) {
copyTupleIf(
out,
block_result,
write_preds &&
(block_reduce_participate && (BROADCAST || has_block_result)));
// Need a block sync here as reduceGroupBlock does not
// forward-protect the smem buffer. This block sync is not
// necessary when a grid reduction follows since a block sync is
// done just before the grid sync.
block_sync::sync<Aligned>(block_dim);
return;
}
// -- START GRID REDUCTION -- //
// Grid reductions are more challenging for two reasons, (1) the reduction
// itself is 3D instead of 2D because we now have an iter domain space in
// the grid dimension. (2) a tree reduction isn't performed, instead all
// blocks will populate GMEM and one block will finish the grid reduction.
// What is the grid reduction size, block reduction already performed so
// that doesn't have to be taken into consideration
const auto grid_red_size = index_utils::
maskedSize<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
gridDim);
// Which ID in the reduction is this block. Threads can participate in
// multiple grid reductions, but the block will have the same relative index
// in those reductions
const auto idx_in_grid_red = index_utils::
maskedOffset<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
// How many grid reductions have to be performed, in the grid dimension
const auto num_block_iters = index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(gridDim);
// Which grid reduction does this block participate in, in the grid
// dimension
const auto block_red_idx_offset = index_utils::
maskedOffset<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
blockIdx, gridDim);
// How many grid reductions have to be performed, in the block dimension
const auto num_thread_iters = index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim);
// Which grid reduction does this thread participate in, in the block
// dimension
const auto thread_red_idx_offset = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// 3D buffer of reductions:
// [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
// Offset into the work buffer
const auto work_buf_offset =
(idx_in_grid_red * num_block_iters + block_red_idx_offset) *
num_thread_iters +
thread_red_idx_offset;
// Don't read/write in temporary buffers if in a predicated dimension
bool grid_reduce_participate = index_utils::
maskedIsZero<isPred(X_BLOCK), isPred(Y_BLOCK), isPred(Z_BLOCK)>(blockIdx);
// For persistent reductions, alternate between the two halves of the
// doubled global buffer each call so one grid sync suffices
if (PERSISTENT_REDUCTION && flip) {
auto global_buffer_size =
index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
gridDim) *
index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim) *
grid_red_size;
global_work_buffer += global_buffer_size;
}
flip = !flip;
// Per-block partial reduction to global work buffer
if (grid_reduce_participate && block_reduce_participate && has_block_result) {
copyTuple(global_work_buffer, work_buf_offset, block_result);
}
// -- GLOBAL BUFFER FILLED -- //
bool last_block = index_utils::
maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
if (grid_reduce_participate) {
// Don't need to sync up blocks that are not participating in this
// reduction
grid_sync::sync<
isReduce(X_BLOCK),
isReduce(Y_BLOCK),
isReduce(Z_BLOCK),
PERSISTENT_REDUCTION,
Aligned>(
global_sync_buffer[block_red_idx_offset],
grid_red_size,
last_block,
block_dim);
}
// -- START BLOCK CLEANUP -- //
reduceGroupLastBlock<Aligned>(
out,
global_work_buffer,
init_val,
block_dim,
shared_mem,
block_red_idx_offset,
num_thread_iters,
num_block_iters,
thread_red_idx_offset,
grid_red_size,
write_preds,
block_reduce_participate,
grid_reduce_participate,
funcs...);
// Forward protect the smem buffer
block_sync::sync<Aligned>(block_dim);
}
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <
    bool Aligned,
    typename BlockDimT,
    typename... DataTypes,
    typename... Funcs,
    typename... BoolTypes>
__device__ __inline__ void ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    reduceGroup(
        RefTuple<DataTypes...> out,
        const ConstRefTuple<DataTypes...>& inp,
        VolatilePtrTuple<DataTypes...> global_work_buffer,
        const LocalTuple<DataTypes...>& init_val,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
        // there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the dimension of the compute warps.
        BlockDimT block_dim,
        int64_t* global_sync_buffer,
        void* shared_mem,
        const LocalTuple<BoolTypes...>& read_preds,
        const LocalTuple<BoolTypes...>& write_preds,
        int64_t& cycles,
        int64_t& count,
        Funcs... funcs) {
  // Timing overload of reduceGroup: wraps the non-timed version and
  // accumulates elapsed cycles and an invocation count. Only a single
  // representative thread — thread (0,0,0) of the last block in the
  // grid — reads the cycle counter and updates the accumulators.
  const bool is_timing_thread = isLastBlockInGrid() &&
      index_utils::maskedIsZero<true, true, true>(threadIdx);
  int64_t start_counter = 0;
  if (is_timing_thread) {
    start_counter = readCycleCounter();
  }
  // Delegate the actual reduction work to the non-timed overload.
  reduceGroup<Aligned>(
      out,
      inp,
      global_work_buffer,
      init_val,
      block_dim,
      global_sync_buffer,
      shared_mem,
      read_preds,
      write_preds,
      funcs...);
  if (is_timing_thread) {
    cycles += readCycleCounter() - start_counter;
    ++count;
  }
}
// Intra-block reduction step of reduceGroup.
//
// Reduces the per-thread inputs across the reduction thread dimensions
// and returns the per-block partial result as a local tuple. Only
// threads at offset zero of the reduction thread dimensions
// (has_block_result) are guaranteed to hold the complete result unless
// BLOCK_BROADCAST is true, in which case the result is broadcast within
// the block by blockReduceEach.
//
// NOTE: the shared memory buffer is NOT forward protected here (the
// `false` template argument to blockReduceEach below); the caller must
// issue a block sync before shared_mem can safely be reused.
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <
    bool BLOCK_BROADCAST,
    bool Aligned,
    typename BlockDimT,
    typename... DataTypes,
    typename... Funcs,
    typename... BoolTypes>
__device__ __inline__ LocalTuple<DataTypes...> ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    reduceGroupBlock(
        const ConstRefTuple<DataTypes...>& inp,
        const LocalTuple<DataTypes...>& init_val,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
        // there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the dimension of the compute warps.
        BlockDimT block_dim,
        void* shared_mem,
        const LocalTuple<BoolTypes...>& read_preds,
        bool block_reduce_participate,
        Funcs... funcs) {
  // True for threads at offset zero of all reduction thread dimensions;
  // those are the threads that own the final block result.
  const bool has_block_result = index_utils::
      maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
          threadIdx);
  // Initialize block result with the init value; only overwrite with the
  // input when this thread participates and its read predicate holds.
  LocalTuple<DataTypes...> block_result = init_val;
  copyTupleIf(block_result, inp, block_reduce_participate && read_preds);
  // Size of the block reduction segment, can be an int since it's limited
  // to number of threads
  const int block_reduction_size = index_utils::
      maskedSize<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
          block_dim);
  // Index in the reduction segment, can be an int since it's limited to
  // number of threads
  const int tid_in_block_reduction = index_utils::
      maskedOffset<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
          threadIdx, block_dim);
  // ID of the block reduction this thread is participating in
  //
  // If any of the parallel dimensions are predicated out, that means
  // they've already been reduced, so we only care about the first thread in
  // that dimension. Therefore don't expand the reduction_idx by that
  // dimension
  const int block_reduction_idx = index_utils::
      maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
          threadIdx, block_dim);
  // Do not protect the smem buffer as it's not always necessary.
  impl::blockReduceEach<
      BLOCK_BROADCAST,
      false,
      Aligned,
      LocalTuple<DataTypes...>,
      BlockDimT,
      Funcs...>(
      block_result,
      block_result,
      shared_mem,
      has_block_result,
      tid_in_block_reduction,
      block_reduction_size,
      block_reduction_size,
      block_reduction_idx,
      block_dim,
      funcs...);
  return block_result;
}
// Final cleanup step of a grid reduction in reduceGroup.
//
// For PERSISTENT_REDUCTION every participating block enters; otherwise
// only the last block to arrive at the grid sync does. The entering
// block reduces the per-block partial results stored in
// global_work_buffer and writes the final result to out for threads
// whose write predicates hold.
//
// NOTE: the smem buffer is not forward protected here (the `false`
// template argument to blockReduceEach below); the caller issues the
// protecting block sync.
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <
    bool Aligned,
    typename BlockDimT,
    typename... DataTypes,
    typename... Funcs,
    typename... BoolTypes>
__device__ __inline__ void ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    reduceGroupLastBlock(
        RefTuple<DataTypes...>& out,
        const VolatilePtrTuple<DataTypes...>& global_work_buffer,
        const LocalTuple<DataTypes...>& init_val,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
        // there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the dimension of the compute warps.
        BlockDimT block_dim,
        void* shared_mem,
        nvfuser_index_t block_red_idx_offset,
        nvfuser_index_t num_thread_iters,
        nvfuser_index_t num_block_iters,
        nvfuser_index_t thread_red_idx_offset,
        nvfuser_index_t grid_red_size,
        const LocalTuple<BoolTypes...>& write_preds,
        bool block_reduce_participate,
        bool grid_reduce_participate,
        Funcs... reduction_ops) {
  // Initialize block result
  LocalTuple<DataTypes...> last_block_result(init_val);
  const bool last_block = index_utils::
      maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
          blockIdx, gridDim);
  if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) {
    // Can use the last block to reduce all the values the blocks filled in.
    // Can use any thread that has been predicated, or has been reduced to do
    // this reduction, cannot use any block that's associated with an
    // iteration domain
    // Start with non-block reduction
    // Index in the reduction segment
    int tid_in_block_reduction = index_utils::maskedOffset<
        activeNotIter(X_THREAD),
        activeNotIter(Y_THREAD),
        activeNotIter(Z_THREAD)>(threadIdx, block_dim);
    int block_reduction_size = index_utils::maskedSize<
        activeNotIter(X_THREAD),
        activeNotIter(Y_THREAD),
        activeNotIter(Z_THREAD)>(block_dim);
    bool has_block_result = index_utils::maskedIsZero<
        activeNotIter(X_THREAD),
        activeNotIter(Y_THREAD),
        activeNotIter(Z_THREAD)>(threadIdx);
    // 3D buffer of reductions:
    //    [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
    // Change the offset, we want to keep the last two dimensions, but the
    // first dimension is what we will reduce over
    const auto work_buf_offset =
        block_red_idx_offset * num_thread_iters + thread_red_idx_offset;
    // Serially accumulate the grid partials assigned to this thread,
    // striding by the block reduction size so the work is spread across
    // the participating threads.
    for (auto reduction_i = tid_in_block_reduction; reduction_i < grid_red_size;
         reduction_i += block_reduction_size) {
      impl::reduceEach(
          last_block_result,
          0,
          global_work_buffer,
          work_buf_offset +
              reduction_i * num_block_iters *
                  num_thread_iters, // Iterating over the outer most
                                    // dimension, so need to stride by the
                                    // total number of grid reductions. Could
                                    // come back and change it so this is the
                                    // contiguous dimension
          reduction_ops...);
    }
    // Which block reduction this thread is participating in
    int block_reduction_idx = index_utils::
        maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
            threadIdx, block_dim);
    // Combine the serial partials across the block. Smem is not forward
    // protected here; the caller syncs afterwards.
    impl::blockReduceEach<
        BROADCAST,
        false,
        Aligned,
        LocalTuple<DataTypes...>,
        BlockDimT,
        Funcs...>(
        last_block_result,
        last_block_result,
        shared_mem,
        has_block_result,
        tid_in_block_reduction,
        block_reduction_size,
        min(grid_red_size, block_reduction_size),
        block_reduction_idx,
        block_dim,
        reduction_ops...);
    // Write out only where the write predicate holds and this thread
    // either owns the block result or BROADCAST made it valid everywhere.
    copyTupleIf(
        out,
        last_block_result,
        write_preds &&
            (block_reduce_participate && (BROADCAST || has_block_result)));
  }
}
} // namespace fused_reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
namespace impl {
//! Implementation helper for welfordEach. Recursively walks the tuple
//! value indices from ValIdx down to 0, combining the Welford triplet
//! (avg, var, N) of triplet1 into triplet0 at each index.
template <int ValIdx, typename Triplet0, typename Triplet1>
struct WelfordForEach {
  static __inline__ __device__ void call(
      Triplet0& triplet0,
      nvfuser_index_t offset0,
      const Triplet1& triplet1,
      nvfuser_index_t offset1) {
    using DataType = typename Triplet0::DataType;
    using IndexType = typename Triplet0::IndexType;
    // Both triplet tuples must agree in arity and element types.
    static_assert(
        Triplet0::num_vals == Triplet1::num_vals, "Invalid Triplet types");
    static_assert(
        IsSameType<DataType, typename Triplet1::DataType>::value,
        "Invalid Triplet types");
    static_assert(
        IsSameType<IndexType, typename Triplet1::IndexType>::value,
        "Invalid Triplet types");
    // Process the lower-indexed values first, then fold this index's
    // triplet from triplet1 into triplet0.
    WelfordForEach<ValIdx - 1, Triplet0, Triplet1>::call(
        triplet0, offset0, triplet1, offset1);
    welfordCombine<DataType, IndexType>(
        triplet0.avg.val<ValIdx>(offset0),
        triplet0.var.val<ValIdx>(offset0),
        triplet0.N.val<ValIdx>(offset0),
        triplet1.avg.val<ValIdx>(offset1),
        triplet1.var.val<ValIdx>(offset1),
        triplet1.N.val<ValIdx>(offset1));
  }
};
// Recursion base case: ValIdx == -1 means every value index has already
// been combined, so there is nothing left to do.
template <typename Triplet0, typename Triplet1>
struct WelfordForEach<-1, Triplet0, Triplet1> {
  static __inline__ __device__ void call(
      Triplet0&,
      nvfuser_index_t,
      const Triplet1&,
      nvfuser_index_t) {}
};
//! Apply welfordCombine to each of the triplet tuples' values. This is
//! the Welford counterpart of reduceEach.
template <typename Triplet0, typename Triplet1>
__inline__ __device__ static void welfordEach(
    Triplet0& triplet0,
    nvfuser_index_t offset0,
    const Triplet1& triplet1,
    nvfuser_index_t offset1) {
  // Start the per-value recursion at the highest value index.
  constexpr int kLastValIdx = Triplet0::num_vals - 1;
  WelfordForEach<kLastValIdx, Triplet0, Triplet1>::call(
      triplet0, offset0, triplet1, offset1);
}
// Welford version of BlockReduceEach. Recursive helper that performs a
// block-parallel Welford reduction for tuple value index `idx`, after
// recursing over the lower indices. The reduction uses shared memory
// (shared_buf) and a tree scheme seeded by `np2`, the largest power of
// two <= num_elements_per_reduction.
template <
    int idx,
    bool BROADCAST,
    bool FORWARD_PROTECT_SMEM,
    bool Aligned,
    int NumVals,
    typename DataType,
    typename IndexType,
    typename BlockDimT>
struct BlockWelfordEach {
  __inline__ __device__ static void reduce(
      LocalWelfordTripletTuple<NumVals, DataType, IndexType>& block_result,
      const LocalWelfordTripletTuple<NumVals, DataType, IndexType>&
          partial_result,
      PtrTuple<DataType, DataType, IndexType> shared_buf,
      bool has_block_result,
      int tid_in_reduction,
      int num_threads_per_reduction,
      int num_elements_per_reduction,
      int reduction_idx,
      // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
      // there is no warp specialization in the kernel. If there is warp
      // specialization, block_dim is the dimension of the compute warps.
      BlockDimT block_dim) {
    // Finish the reduction of each tuple value with a smaller offset.
    // The recursive call forward-protects smem (second template arg is
    // true) so this level can reuse the buffer safely.
    BlockWelfordEach<
        idx - 1,
        BROADCAST,
        true,
        Aligned,
        NumVals,
        DataType,
        IndexType,
        BlockDimT>::
        reduce(
            block_result,
            partial_result,
            shared_buf,
            has_block_result,
            tid_in_reduction,
            num_threads_per_reduction,
            num_elements_per_reduction,
            reduction_idx,
            block_dim);
    // Single-element reduction: the partial result is already final.
    if (num_elements_per_reduction == 1) {
      if (has_block_result) {
        copyWelfordTripletTuple(block_result, partial_result);
      }
      return;
    }
    // Local (avg, var, N) triplet for this value index.
    LocalTuple<DataType, DataType, IndexType> block_result_i(
        partial_result.avg.val<idx>(0),
        partial_result.var.val<idx>(0),
        partial_result.N.val<idx>(0));
    // This thread's slot in the smem buffer of its reduction segment.
    const auto smem_offset =
        reduction_idx * num_threads_per_reduction + tid_in_reduction;
    // Largest power of two <= num_elements_per_reduction.
    const int np2 = 1 << (31 - __clz(num_elements_per_reduction));
    // Threads values are initialized, so all can participate here
    if (tid_in_reduction >= np2) {
      copyTuple(shared_buf, smem_offset, block_result_i);
    }
    block_sync::sync<Aligned>(block_dim);
    // Fold the tail beyond np2 into the first np2 entries.
    if (tid_in_reduction < np2 &&
        tid_in_reduction + np2 < num_elements_per_reduction) {
      impl::reduceTuple(
          block_result_i,
          0,
          shared_buf,
          smem_offset + np2,
          welfordCombine<DataType, IndexType>);
    }
    if (tid_in_reduction < np2) {
      copyTuple(shared_buf, smem_offset, block_result_i);
    }
    // Always sync when communicating across smem
    block_sync::sync<Aligned>(block_dim);
    // Reduce down to 2 values, last thread will do the final reduction and
    // can save a syncthreads this way
    for (int factor = np2 / 2; factor > 1; factor >>= 1) {
      if (tid_in_reduction < factor) {
        impl::reduceTuple(
            shared_buf,
            smem_offset,
            shared_buf,
            smem_offset + factor,
            welfordCombine<DataType, IndexType>);
      }
      block_sync::sync<Aligned>(block_dim);
    }
    copyTuple(block_result_i, shared_buf, smem_offset);
    // Do the last reduction
    if (has_block_result) {
      impl::reduceTuple(
          block_result_i,
          0,
          shared_buf,
          smem_offset + 1,
          welfordCombine<DataType, IndexType>);
    }
    if (BROADCAST) {
      if (has_block_result) {
        // Put result back in shared memory, put in the first entry of the
        // reduction segment's buffer
        copyTuple(
            shared_buf,
            reduction_idx * num_threads_per_reduction,
            block_result_i);
      }
      // Sync threads to make sure result is in smem
      block_sync::sync<Aligned>(block_dim);
      // Every thread in the segment re-reads the broadcast result.
      copyTuple(
          block_result_i,
          shared_buf,
          reduction_idx * num_threads_per_reduction);
    }
    // Store this value index's final triplet back into the output tuple.
    block_result.avg.val<idx>(0) = block_result_i.val<0>(0);
    block_result.var.val<idx>(0) = block_result_i.val<1>(0);
    block_result.N.val<idx>(0) = block_result_i.val<2>(0);
    if (FORWARD_PROTECT_SMEM) {
      block_sync::sync<Aligned>(block_dim);
    }
  }
};
// Recursion base case for idx == -1: all tuple values have been
// handled, so this reduce is a no-op.
template <
    bool BROADCAST,
    bool FORWARD_PROTECT_SMEM,
    bool Aligned,
    int NumVals,
    typename DataType,
    typename IndexType,
    typename BlockDimT>
struct BlockWelfordEach<
    -1,
    BROADCAST,
    FORWARD_PROTECT_SMEM,
    Aligned,
    NumVals,
    DataType,
    IndexType,
    BlockDimT> {
  static __inline__ __device__ void reduce(
      LocalWelfordTripletTuple<NumVals, DataType, IndexType>&,
      const LocalWelfordTripletTuple<NumVals, DataType, IndexType>&,
      PtrTuple<DataType, DataType, IndexType>,
      bool,
      int,
      int,
      int,
      int,
      BlockDimT) {}
};
//! Welford version of blockReduceEach. Performs a block-parallel
//! Welford reduction of each Welford triplet in the tuple.
template <
    bool BROADCAST,
    bool FORWARD_PROTECT_SMEM,
    bool Aligned,
    int NumVals,
    typename DataType,
    typename IndexType,
    typename BlockDimT>
__inline__ __device__ void blockWelfordEach(
    LocalWelfordTripletTuple<NumVals, DataType, IndexType>& block_result,
    const LocalWelfordTripletTuple<NumVals, DataType, IndexType>&
        partial_result,
    PtrTuple<DataType, DataType, IndexType> shared_buf,
    bool has_block_result,
    int tid_in_reduction,
    int num_threads_per_reduction,
    int num_elements_per_reduction,
    int reduction_idx,
    // block_dim is blockDim (wrapped as DefaultBlockDim) unless the
    // kernel uses warp specialization, in which case it is the dimension
    // of the compute warps.
    BlockDimT block_dim) {
  // Kick off the per-value recursion at the highest value index; it
  // unwinds down to -1, reducing one tuple value per level.
  using Recursion = BlockWelfordEach<
      NumVals - 1,
      BROADCAST,
      FORWARD_PROTECT_SMEM,
      Aligned,
      NumVals,
      DataType,
      IndexType,
      BlockDimT>;
  Recursion::reduce(
      block_result,
      partial_result,
      shared_buf,
      has_block_result,
      tid_in_reduction,
      num_threads_per_reduction,
      num_elements_per_reduction,
      reduction_idx,
      block_dim);
}
} // namespace impl
// Grouped Welford reduction. Mirrors reduceGroup, but reduces Welford
// triplets (avg, var, N) rather than plain tuples: an optional
// intra-block step followed by an optional grid reduction through
// global_work_buffer_{avg,var,N} with a grid sync and last-block
// cleanup.
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <
    bool Aligned,
    int NumArgs,
    typename DataType,
    typename IndexType,
    typename BlockDimT>
__device__ __inline__ void ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    welfordGroup(
        typename MakeRefTuple<NumArgs, DataType>::type out_avg,
        typename MakeRefTuple<NumArgs, DataType>::type out_var,
        typename MakeRefTuple<NumArgs, IndexType>::type out_N,
        const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_avg,
        const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_var,
        const typename MakeConstRefTuple<NumArgs, IndexType>::type& inp_N,
        const typename MakeLocalTuple<NumArgs, DataType>::type& init_avg,
        const typename MakeLocalTuple<NumArgs, DataType>::type& init_var,
        const typename MakeLocalTuple<NumArgs, IndexType>::type& init_N,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
        // there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the dimension of the compute warps.
        BlockDimT block_dim,
        typename MakeVolatilePtrTuple<NumArgs, DataType>::type
            global_work_buffer_avg,
        typename MakeVolatilePtrTuple<NumArgs, DataType>::type
            global_work_buffer_var,
        typename MakeVolatilePtrTuple<NumArgs, IndexType>::type
            global_work_buffer_N,
        int64_t* global_sync_buffer,
        PtrTuple<DataType, DataType, IndexType> shared_buf,
        const typename MakeLocalTuple<NumArgs, bool>::type& read_preds,
        const typename MakeLocalTuple<NumArgs, bool>::type& write_preds) {
  // Bundle the avg/var/N tuples into triplet views for the helpers.
  const ConstRefWelfordTripletTuple<NumArgs, DataType, IndexType> inp(
      inp_avg, inp_var, inp_N);
  RefWelfordTripletTuple<NumArgs, DataType, IndexType> out(
      out_avg, out_var, out_N);
  // If no reduction needed, just return input
  if (!BLOCK_REDUCE && !GRID_REDUCE) {
    copyWelfordTripletTupleIf(out, inp, read_preds && write_preds);
    return;
  }
  // Don't read/write in temporary buffers if in a predicated dimension
  const bool block_reduce_participate = index_utils::
      maskedIsZero<isPred(X_THREAD), isPred(Y_THREAD), isPred(Z_THREAD)>(
          threadIdx);
  // Only threads that with id == 0 in the dimensions being reduced will
  // have a valid result
  const bool has_block_result = index_utils::
      maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
          threadIdx);
  LocalWelfordTripletTuple<NumArgs, DataType, IndexType> block_result(
      init_avg, init_var, init_N);
  // Initial per-block reduction. Result is broadcast if specified
  // and this call is block reduction only.
  welfordGroupBlock<
      !GRID_REDUCE && BROADCAST,
      Aligned,
      NumArgs,
      DataType,
      IndexType>(
      block_result,
      inp,
      block_dim,
      shared_buf,
      read_preds,
      block_reduce_participate);
  // If block reduction only, save to out and exit
  if (!GRID_REDUCE) {
    copyWelfordTripletTupleIf(
        out,
        block_result,
        write_preds &&
            (block_reduce_participate && (BROADCAST || has_block_result)));
    // Need a block sync here as reduceGroupBlock does not
    // forward-protect the smem buffer. This block sync is not
    // necessary when a grid reduction follows since a block sync is
    // done just before the grid sync.
    block_sync::sync<Aligned>(block_dim);
    return;
  }
  // -- START GRID REDUCTION -- //
  // Grid reductions are more challenging for two reasons, (1) the reduction
  // itself is 3D instead of 2D because we now have an iter domain space in
  // the grid dimension. (2) a tree reduction isn't performed, instead all
  // blocks will populate GMEM and one block will finish the grid reduction.
  // What is the grid reduction size, block reduction already performed so
  // that doesn't have to be taken into consideration
  const auto grid_red_size = index_utils::
      maskedSize<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
          gridDim);
  // Which ID in the reduction is this block. Threads can participate in
  // multiple grid reductions, but the block will have the same relative index
  // in those reductions
  const auto idx_in_grid_red = index_utils::
      maskedOffset<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
          blockIdx, gridDim);
  // How many grid reductions have to be performed, in the grid dimension
  const auto num_block_iters = index_utils::
      maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(gridDim);
  // Which grid reduction does this block participate in, in the grid
  // dimension
  const auto block_red_idx_offset = index_utils::
      maskedOffset<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
          blockIdx, gridDim);
  // How many grid reductions have to be performed, in the block dimension
  const auto num_thread_iters = index_utils::
      maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
          block_dim);
  // Which grid reduction does this thread participate in, in the block
  // dimension
  const auto thread_red_idx_offset = index_utils::
      maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
          threadIdx, block_dim);
  // 3D buffer of reductions:
  //    [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
  // Offset into the work buffer
  auto work_buf_offset =
      (idx_in_grid_red * num_block_iters + block_red_idx_offset) *
          num_thread_iters +
      thread_red_idx_offset;
  // Don't read/write in temporary buffers if in a predicated dimension
  bool grid_reduce_participate = index_utils::
      maskedIsZero<isPred(X_BLOCK), isPred(Y_BLOCK), isPred(Z_BLOCK)>(blockIdx);
  VolatilePtrWelfordTripletTuple<NumArgs, DataType, IndexType>
      global_work_buffer(
          global_work_buffer_avg, global_work_buffer_var, global_work_buffer_N);
  // For persistent reductions, alternate between the two halves of the
  // double-buffered work buffer on successive invocations.
  if (PERSISTENT_REDUCTION && flip) {
    auto global_buffer_size =
        index_utils::
            maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
                gridDim) *
        index_utils::
            maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
                block_dim) *
        grid_red_size;
    global_work_buffer += global_buffer_size;
  }
  flip = !flip;
  // Per-block partial reduction to global work buffer
  if (grid_reduce_participate && block_reduce_participate && has_block_result) {
    copyWelfordTripletTuple(global_work_buffer, work_buf_offset, block_result);
  }
  // -- GLOBAL BUFFER FILLED -- //
  bool last_block = index_utils::
      maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
          blockIdx, gridDim);
  if (grid_reduce_participate) {
    // Don't need to sync up blocks that are not participating in this
    // reduction
    grid_sync::sync<
        isReduce(X_BLOCK),
        isReduce(Y_BLOCK),
        isReduce(Z_BLOCK),
        PERSISTENT_REDUCTION,
        Aligned>(
        global_sync_buffer[block_red_idx_offset],
        grid_red_size,
        last_block,
        block_dim);
  }
  // -- START BLOCK CLEANUP -- //
  welfordGroupLastBlock<Aligned, NumArgs, DataType, IndexType>(
      out,
      global_work_buffer,
      LocalWelfordTripletTuple<NumArgs, DataType, IndexType>(
          init_avg, init_var, init_N),
      block_dim,
      shared_buf,
      block_red_idx_offset,
      num_thread_iters,
      num_block_iters,
      thread_red_idx_offset,
      grid_red_size,
      write_preds,
      block_reduce_participate,
      grid_reduce_participate);
  // Forward protect the smem buffer
  block_sync::sync<Aligned>(block_dim);
}
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <
    bool Aligned,
    int NumArgs,
    typename DataType,
    typename IndexType,
    typename BlockDimT>
__device__ __inline__ void ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    welfordGroup(
        typename MakeRefTuple<NumArgs, DataType>::type out_avg,
        typename MakeRefTuple<NumArgs, DataType>::type out_var,
        typename MakeRefTuple<NumArgs, IndexType>::type out_N,
        const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_avg,
        const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_var,
        const typename MakeConstRefTuple<NumArgs, IndexType>::type& inp_N,
        const typename MakeLocalTuple<NumArgs, DataType>::type& init_avg,
        const typename MakeLocalTuple<NumArgs, DataType>::type& init_var,
        const typename MakeLocalTuple<NumArgs, IndexType>::type& init_N,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
        // there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the dimension of the compute warps.
        BlockDimT block_dim,
        typename MakeVolatilePtrTuple<NumArgs, DataType>::type
            global_work_buffer_avg,
        typename MakeVolatilePtrTuple<NumArgs, DataType>::type
            global_work_buffer_var,
        typename MakeVolatilePtrTuple<NumArgs, IndexType>::type
            global_work_buffer_N,
        int64_t* global_sync_buffer,
        PtrTuple<DataType, DataType, IndexType> shared_buf,
        const typename MakeLocalTuple<NumArgs, bool>::type& read_preds,
        const typename MakeLocalTuple<NumArgs, bool>::type& write_preds,
        int64_t& cycles,
        int64_t& count) {
  // Timing overload of welfordGroup: wraps the non-timed version and
  // accumulates elapsed cycles and an invocation count. Only a single
  // representative thread — thread (0,0,0) of the last block in the
  // grid — reads the cycle counter and updates the accumulators.
  const bool is_timing_thread = isLastBlockInGrid() &&
      index_utils::maskedIsZero<true, true, true>(threadIdx);
  int64_t start_counter = 0;
  if (is_timing_thread) {
    start_counter = readCycleCounter();
  }
  // Delegate the actual reduction work to the non-timed overload.
  welfordGroup<Aligned, NumArgs, DataType, IndexType>(
      out_avg,
      out_var,
      out_N,
      inp_avg,
      inp_var,
      inp_N,
      init_avg,
      init_var,
      init_N,
      block_dim,
      global_work_buffer_avg,
      global_work_buffer_var,
      global_work_buffer_N,
      global_sync_buffer,
      shared_buf,
      read_preds,
      write_preds);
  if (is_timing_thread) {
    cycles += readCycleCounter() - start_counter;
    ++count;
  }
}
// Intra-block Welford reduction step of welfordGroup.
//
// Reduces the per-thread triplets across the reduction thread
// dimensions into block_result. Only threads with has_block_result hold
// the complete result unless BLOCK_BROADCAST is true, in which case
// blockWelfordEach broadcasts it within the block.
//
// NOTE: the smem buffer is NOT forward protected here (the `false`
// template argument to blockWelfordEach below); the caller must issue a
// block sync before shared_buf can safely be reused.
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <
    bool BLOCK_BROADCAST,
    bool Aligned,
    int NumVals,
    typename DataType,
    typename IndexType,
    typename BlockDimT>
__device__ __inline__ void ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    welfordGroupBlock(
        LocalWelfordTripletTuple<NumVals, DataType, IndexType>& block_result,
        const ConstRefWelfordTripletTuple<NumVals, DataType, IndexType>& inp,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
        // there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the dimension of the compute warps.
        BlockDimT block_dim,
        PtrTuple<DataType, DataType, IndexType> shared_buf,
        const typename MakeLocalTuple<NumVals, bool>::type& read_preds,
        bool block_reduce_participate) {
  // True for threads at offset zero of all reduction thread dimensions;
  // those are the threads that own the final block result.
  const bool has_block_result = index_utils::
      maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
          threadIdx);
  // Load the input only where this thread participates and its read
  // predicate holds; elsewhere block_result keeps its initial value.
  copyWelfordTripletTupleIf(
      block_result, inp, block_reduce_participate && read_preds);
  // Size of the block reduction segment, can be an int since it's limited
  // to number of threads
  const int block_reduction_size = index_utils::
      maskedSize<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
          block_dim);
  // Index in the reduction segment, can be an int since it's limited to
  // number of threads
  const int tid_in_block_reduction = index_utils::
      maskedOffset<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
          threadIdx, block_dim);
  // ID of the block reduction this thread is participating in
  //
  // If any of the parallel dimensions are predicated out, that means
  // they've already been reduced, so we only care about the first thread in
  // that dimension. Therefore don't expand the reduction_idx by that
  // dimension
  const int block_reduction_idx = index_utils::
      maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
          threadIdx, block_dim);
  // Do not protect the smem buffer as it's not always necessary.
  impl::blockWelfordEach<
      BLOCK_BROADCAST,
      false,
      Aligned,
      NumVals,
      DataType,
      IndexType,
      BlockDimT>(
      block_result,
      block_result,
      shared_buf,
      has_block_result,
      tid_in_block_reduction,
      block_reduction_size,
      block_reduction_size,
      block_reduction_idx,
      block_dim);
}
// Final cleanup step of a grid Welford reduction in welfordGroup.
//
// For PERSISTENT_REDUCTION every participating block enters; otherwise
// only the last block to arrive at the grid sync does. The entering
// block combines the per-block partial triplets stored in
// global_work_buffer and writes the final result to out for threads
// whose write predicates hold.
//
// NOTE: the smem buffer is not forward protected here (the `false`
// template argument to blockWelfordEach below); the caller issues the
// protecting block sync.
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <
    bool Aligned,
    int NumVals,
    typename DataType,
    typename IndexType,
    typename BlockDimT>
__device__ __inline__ void ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    welfordGroupLastBlock(
        RefWelfordTripletTuple<NumVals, DataType, IndexType>& out,
        const VolatilePtrWelfordTripletTuple<NumVals, DataType, IndexType>&
            global_work_buffer,
        const LocalWelfordTripletTuple<NumVals, DataType, IndexType>& init_val,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
        // there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the dimension of the compute warps.
        BlockDimT block_dim,
        PtrTuple<DataType, DataType, IndexType> shared_buf,
        nvfuser_index_t block_red_idx_offset,
        nvfuser_index_t num_thread_iters,
        nvfuser_index_t num_block_iters,
        nvfuser_index_t thread_red_idx_offset,
        nvfuser_index_t grid_red_size,
        const typename MakeLocalTuple<NumVals, bool>::type& write_preds,
        bool block_reduce_participate,
        bool grid_reduce_participate) {
  // Initialize block result
  auto last_block_result = init_val;
  const bool last_block = index_utils::
      maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
          blockIdx, gridDim);
  if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) {
    // Can use the last block to reduce all the values the blocks filled in.
    // Can use any thread that has been predicated, or has been reduced to do
    // this reduction, cannot use any block that's associated with an
    // iteration domain
    // Start with non-block reduction
    // Index in the reduction segment
    int tid_in_block_reduction = index_utils::maskedOffset<
        activeNotIter(X_THREAD),
        activeNotIter(Y_THREAD),
        activeNotIter(Z_THREAD)>(threadIdx, block_dim);
    int block_reduction_size = index_utils::maskedSize<
        activeNotIter(X_THREAD),
        activeNotIter(Y_THREAD),
        activeNotIter(Z_THREAD)>(block_dim);
    bool has_block_result = index_utils::maskedIsZero<
        activeNotIter(X_THREAD),
        activeNotIter(Y_THREAD),
        activeNotIter(Z_THREAD)>(threadIdx);
    // 3D buffer of reductions:
    //    [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
    // Change the offset, we want to keep the last two dimensions, but the
    // first dimension is what we will reduce over
    const auto work_buf_offset =
        block_red_idx_offset * num_thread_iters + thread_red_idx_offset;
    // Serially combine the grid partials assigned to this thread,
    // striding by the block reduction size to spread the work.
    for (auto reduction_i = tid_in_block_reduction; reduction_i < grid_red_size;
         reduction_i += block_reduction_size) {
      impl::welfordEach(
          last_block_result,
          0,
          global_work_buffer,
          work_buf_offset + reduction_i * num_block_iters * num_thread_iters);
    }
    // Which block reduction this thread is participating in
    int block_reduction_idx = index_utils::
        maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
            threadIdx, block_dim);
    // Combine the serial partials across the block. Smem is not forward
    // protected here; the caller syncs afterwards.
    impl::blockWelfordEach<
        BROADCAST,
        false,
        Aligned,
        NumVals,
        DataType,
        IndexType>(
        last_block_result,
        last_block_result,
        shared_buf,
        has_block_result,
        tid_in_block_reduction,
        block_reduction_size,
        min(grid_red_size, block_reduction_size),
        block_reduction_idx,
        block_dim);
    // Write out only where the write predicate holds and this thread
    // either owns the block result or BROADCAST made it valid everywhere.
    copyWelfordTripletTupleIf(
        out,
        last_block_result,
        write_preds &&
            (block_reduce_participate && (BROADCAST || has_block_result)));
  }
}
} // namespace fused_reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
namespace impl {
// Grouped block welford optimized for outer reductions with
// TIDx and TIDy mapped to non-reduction and reduction domains,
// respectively with unused TIDz.
//
// The main motivation of this optimized version is the same as the
// grouped grid reduction, i.e, by doing multiple reductions together,
// it is possible to reduce the number of synchronizations. However,
// unlike the grouped grid reduction, the cost of grouping can be
// prohibitively high, i.e., the size of the work buffer must be
// expanded by a factor of grouping. In the case of grid
// reductions, the buffer is on global memory, so the space requirement
// is not a concern, but that isn't the case with block reductions,
// since the buffer is on shared memory, which has a limited
// capacity.
//
// This implementation tries to benefit from aggregated block
// synchronizations while minimizing the cost of the expanded buffer
// size by first partially reducing the input within each warp. It
// would save the required buffer size by a factor of WARP_SIZE /
// blockDim.x as the reduction is done along threadIdx.y. So to be
// effective, blockDim.x needs to be smaller than WARP_SIZE, and in the
// case of grouped grid welford, it should be typically 8 or 16.
//
// The algorithm is an adaptation of scattered butterfly reduction,
// aka recursive halving, commonly used for implementing
// MPI_Reduce_scatter. For a visual illustration of the data
// organization, see, for example, page 22 of Solomonik,
// Design of Parallel and High-Performance Computing:
// Distributed-Memory Models and Algorithms, 2015
// (https://solomonik.cs.illinois.edu/talks/dphpc-dec-2015.pdf)
//
// Assumptions:
// - blockDim.x and blockDim.y are statically known values so that all
// loops can be completely unrolled
// - blockDim.x is smaller than WARP_SIZE
// - blockDim.x evenly divides WARP_SIZE
// - There are multiple warps per block
// - The gouping factor, NumVals, is at least as large as the warp
// dimY and is divisible by the warp dimY.
//
// This is meant to be used as part of the grouped grid welford
// reduction but should be usable as a standalone block welford routine as
// long as the above assumptions hold.
//
// Note: Having an output reference parameter resulted in using more
// registers than just returing the output. Results would vary
// depending on compiler versions, but it seems safer to return outputs
// as a new value.
template <
bool Aligned,
int NumVals,
typename DataType,
int BDIMX,
int BDIMY,
typename BlockDimT>
__inline__ __device__ WelfordTriplet<DataType> blockWelfordOuter(
DataType* inp_avg,
DataType* inp_var,
nvfuser_index_t inp_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
DataType* smem) {
constexpr int num_warps = BDIMX * BDIMY / 32;
static_assert(num_warps >= 1, "There must be at least a single warp");
static_assert(32 % BDIMX == 0, "blockDimx.x must be able to divide 32");
const int tid = threadIdx.x + threadIdx.y * BDIMX;
const int wid = tid / 32;
// Dimension of the Y axis within each warp
constexpr int wdimy = 32 / BDIMX;
static_assert(NumVals >= wdimy, "NumVals must be >= 32 / blockDim.x");
static_assert(
NumVals % wdimy == 0, "NumVals must be divisible by 32 / blockDim.x");
// There must be at least a single warp
// Y index within each warp
const int warp_tidy = threadIdx.y % wdimy;
// Thread index in each warp
const int lane_id = threadIdx.x + warp_tidy * BDIMX;
constexpr int smem_var_offset = num_warps * BDIMX * NumVals;
constexpr int smem_N_offset = num_warps * BDIMX * NumVals * 2;
// We define a chunk as a value in a group and a chunk size as the
// number of group values per thread. Initially, the chunk size is
// NumVals. After the initial warp reduction, the chunk size is
// reduced to NumVals/wdimy. For example, suppose NumVals=8,
// blockDim.x=8, blockDim.y=32, then wdimy=4, so after the initial
// warp reduction, the chunk size is 2. This is the number of
// elements each thread stores to shared memory.
int chunk_size = NumVals;
// Butterfly reduction, a.k.a. recursive halving as each iteration
// halves the number of values
#pragma unroll
for (int lane_mask = 16; lane_mask >= BDIMX; lane_mask /= 2) {
chunk_size /= 2;
const auto peer_N = __shfl_xor_sync(0xffffffff, inp_N, lane_mask);
const auto updated_N = inp_N + peer_N;
const DataType b_N_div_ab_N =
updated_N != 0 ? ((DataType)peer_N) / ((DataType)updated_N) : 0;
#pragma unroll
for (int index_in_chunk = 0; index_in_chunk < chunk_size;
++index_in_chunk) {
DataType pushed_avg = 0;
DataType pushed_var = 0;
DataType self_avg = 0;
DataType self_var = 0;
// Divergent branch. Not a big deal with independent scheduling?
if (lane_id & lane_mask) {
// Push first half
auto push_offset = index_in_chunk;
auto self_offset = index_in_chunk + chunk_size;
pushed_avg = inp_avg[push_offset];
pushed_var = inp_var[push_offset];
self_avg = inp_avg[self_offset];
self_var = inp_var[self_offset];
} else {
// Push second half
auto push_offset = index_in_chunk + chunk_size;
auto self_offset = index_in_chunk;
pushed_avg = inp_avg[push_offset];
pushed_var = inp_var[push_offset];
self_avg = inp_avg[self_offset];
self_var = inp_var[self_offset];
}
auto peer_avg = __shfl_xor_sync(0xffffffff, pushed_avg, lane_mask);
auto peer_var = __shfl_xor_sync(0xffffffff, pushed_var, lane_mask);
auto delta = peer_avg - self_avg;
self_avg += delta * b_N_div_ab_N;
self_var += peer_var + delta * delta * ((DataType)(inp_N)) * b_N_div_ab_N;
inp_avg[index_in_chunk] = self_avg;
inp_var[index_in_chunk] = self_var;
}
inp_N = updated_N;
}
// At this point, chunk_size is reduced to NumVals/wdimy as
// mentioned above. Each thread has warp-reduced chunk_size values
// in array inp. This chunk_size_post_reduction should be equal to
// chunk_size at this point.
constexpr int chunk_size_post_reduction = NumVals / wdimy;
// More specifically, the warp_tidy of each thread defines
// the chunk IDs held by the thread as follows:
//
// [warp_tidy * chunk_size_post_reduction, warp_tidy *
// chunk_size_post_reduction + chunk_size_post_reduction]
//
// Each thread uploads the chunk_size_post_reduction values one by
// one. Each chunk is spread by BDIMX * BDIMY values. The data
// layout of the shared memory is:
//
// [chunk_size, wid, warp_tidy, TIDx]
//
// The remaining reduction is done on the WID
// dimension. More specifically, we assign one warp per chunk (or
// a value of the group). The wdimy threads of the same threadId.x
// collectively reduce num_warps partial results, each of which is
// stored with stride 32. This means that there will be wdimy-way
// bank conflicts, so to avoid that, swizzling is also employed.
#pragma unroll
for (int i = 0; i < chunk_size; ++i) {
// Accumulating smem offset from the innermost dimension
int smem_offset = 0;
// TIDx
smem_offset += threadIdx.x;
// Warp_TIDy with swizzle
smem_offset += ((warp_tidy + wid) % wdimy) * BDIMX;
// WID
smem_offset += wid * 32;
// chunk_size
smem_offset += i * BDIMX * BDIMY;
smem[smem_offset] = inp_avg[i];
smem[smem_var_offset + smem_offset] = inp_var[i];
// Upload N only when threadIdx.x == 0 && chunk_index == 0
if (threadIdx.x == 0 && i == 0 && warp_tidy == 0) {
reinterpret_cast<nvfuser_index_t*>(smem + smem_N_offset)[wid] = inp_N;
}
}
block_sync::sync<Aligned>(block_dim);
// The next step is to let each thread of a warp independently
// accumulate the partial results on the shared memory
// reduction. A single warp is used to accumulate of the partial
// results for a single chunk, so warp wid takes care of the wid-th
// chunk.
//
// The starting offset of partial results of a chunk is:
//
// (wid % chunk_size_post_reduction) * BDIMX * BDIMY + (wid /
// chunk_size_post_reduction) * BDIMX
//
// Note that each thread had chunk_size_post_reduction contiguous
// chunks, so when uploaded to shmem, they are strided by
// BDIMX*BDIMY, hence (wid % chunk_size_post_reduction) * BDIMX *
// BDIMY.
// The vector width is likely at least 4, so at least 4 warps should
// be used, which is
// enough to occupy an SM. When NumVals=8, it might be more
// efficient to use just 4 warps with each warp taking care of two
// groups, but the difference would be pretty small.
// Also, the number of warps should be at least 8 and can be 16
// too. NumVals should be 8 at largest, so it's always num_warps >=
// NumVals.
DataType avg = 0;
DataType var = 0;
nvfuser_index_t N = 0;
static_assert(
num_warps >= NumVals,
"Number of warps must be at least as large as NumVals");
if (wid < NumVals) {
#pragma unroll
for (int i = warp_tidy; i < num_warps; i += wdimy) {
int offset = 0;
offset += threadIdx.x;
// Offset to the partial results of the i-th warp
offset += i * 32;
// Offset to the chunk for this warp. Swizzled to avoid bank
// conflicts.
offset += ((wid / chunk_size + i) % wdimy) * BDIMX;
offset += (wid % chunk_size) * BDIMX * BDIMY;
DataType avg_smem = smem[offset];
DataType var_smem = smem[smem_var_offset + offset];
nvfuser_index_t N_smem =
reinterpret_cast<nvfuser_index_t*>(&smem[smem_N_offset])[i];
welfordCombine(avg, var, N, avg_smem, var_smem, N_smem);
}
}
block_sync::sync<Aligned>(block_dim);
// Nothing to do for warps whose wid is larger than NunVals
if (wid >= NumVals) {
WelfordTriplet<DataType> out = {0, 0, 0};
return out;
}
// Standard binary-exchange reduction within wdimy intra-warp
// threads.
#pragma unroll
for (int lane_mask = 16; lane_mask >= BDIMX; lane_mask /= 2) {
auto avg_peer = __shfl_xor_sync(0xffffffff, avg, lane_mask);
auto var_peer = __shfl_xor_sync(0xffffffff, var, lane_mask);
auto N_peer = __shfl_xor_sync(0xffffffff, N, lane_mask);
welfordCombine(avg, var, N, avg_peer, var_peer, N_peer);
}
WelfordTriplet<DataType> out = {avg, var, N};
return out;
}
} // namespace impl
} // namespace fused_reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
namespace impl {
// Utility struct holding NumVals (avg, var) pairs for grouped
// Welford. The count is uniform across the group, so only a single N
// value is kept.
template <int NumVals, typename DataType>
struct WelfordTripletVector {
  Array<DataType, NumVals, NumVals> avg_;
  Array<DataType, NumVals, NumVals> var_;
  nvfuser_index_t N_;

  WelfordTripletVector() = default;

  // Construct from raw per-value avg/var arrays plus the shared count
  __device__ WelfordTripletVector(
      const DataType avg[NumVals],
      const DataType var[NumVals],
      const nvfuser_index_t N)
      : N_(N) {
#pragma unroll
    for (int i = 0; i < NumVals; ++i) {
      avg_.array[i] = avg[i];
      var_.array[i] = var[i];
    }
  }

  __device__ WelfordTripletVector& operator=(
      const WelfordTripletVector<NumVals, DataType>& other) {
    avg_ = other.avg_;
    var_ = other.var_;
    N_ = other.N_;
    return *this;
  }

  // Reset all partial results to the identity (zero) state
  __device__ void init() {
    N_ = 0;
    avg_.set((DataType)0);
    var_.set((DataType)0);
  }

  // Mutable / read-only accessors for the idx-th group value
  __device__ DataType& avg(int idx) {
    return avg_.array[idx];
  }
  __device__ DataType avg(int idx) const {
    return avg_.array[idx];
  }
  __device__ DataType& var(int idx) {
    return var_.array[idx];
  }
  __device__ DataType var(int idx) const {
    return var_.array[idx];
  }
  // Mutable / read-only accessors for the shared count
  __device__ nvfuser_index_t& N() {
    return N_;
  }
  __device__ nvfuser_index_t N() const {
    return N_;
  }
};
// Linear offset into the smem buffer used to broadcast final results
// within a thread block. Layout is [group, iter] with the iteration
// index innermost (stride 1) and each group strided by BDIMX.
template <int BDIMX>
__inline__ __device__ int getSmemGroupOffset(int iter_idx, int group_idx) {
  int offset = iter_idx;
  offset += group_idx * BDIMX;
  return offset;
}
// Upload one thread's final triplet into the smem buffer for
// intra-block broadcasting. Layout: [avg region | var region | N],
// each of the first two regions being BDIMX * NumVals elements.
template <int NumVals, typename DataType, int BDIMX, int BDIMY>
__inline__ __device__ void copyFromTripletToSmem(
    DataType* smem,
    int iter_idx,
    int group_idx,
    const WelfordTriplet<DataType>& local_triplet) {
  constexpr int region_stride = BDIMX * NumVals;
  const int offset = getSmemGroupOffset<BDIMX>(iter_idx, group_idx);
  smem[offset] = local_triplet.avg;
  smem[region_stride + offset] = local_triplet.var;
  // The count is uniform across the block, so a single representative
  // thread stores it once, just past the avg and var regions
  if (iter_idx == 0 && group_idx == 0) {
    reinterpret_cast<nvfuser_index_t*>(smem + region_stride * 2)[0] =
        local_triplet.N;
  }
}
// Fetch a broadcast triplet back from the smem buffer written by
// copyFromTripletToSmem (same [avg | var | N] layout).
template <int NumVals, typename DataType, int BDIMX, int BDIMY>
__inline__ __device__ void copyFromSmemToTriplet(
    WelfordTriplet<DataType>& local_triplet,
    const DataType* smem,
    int iter_idx,
    int group_idx) {
  constexpr int region_stride = BDIMX * NumVals;
  const int offset = getSmemGroupOffset<BDIMX>(iter_idx, group_idx);
  local_triplet.avg = smem[offset];
  local_triplet.var = smem[region_stride + offset];
  // Single shared count stored past the avg and var regions
  local_triplet.N =
      reinterpret_cast<const nvfuser_index_t*>(smem + region_stride * 2)[0];
}
// Per-thread accumulation of the per-block partial results in global
// memory. There are gridDim.y partial results, which are accumulated in
// parallel by threadIdx.y. This should be followed by a block reduction.
//
// flip selects which half of the double buffer to read (persistent
// reduction double buffering).
// Returns per-thread partially accumulated triplets; threads with
// threadIdx.y >= gridDim.y return zero-initialized results.
template <int NumVals, typename DataType, int BDIMX, int BDIMY>
__device__ __inline__ WelfordTripletVector<NumVals, DataType>
welfordGroupAccumulateGlobalBuffer(
    volatile DataType* global_buf_avg,
    volatile DataType* global_buf_var,
    volatile nvfuser_index_t* global_buf_N,
    bool flip) {
  const int grid_size = gridDim.x * gridDim.y;
  const int iter_idx = threadIdx.x;
  const int red_idx = threadIdx.y;
  const int num_threads_per_reduction = BDIMY;
  WelfordTripletVector<NumVals, DataType> results;
  results.init();
  // Reduction is done cooperatively with the thread blocks with the
  // same blockIdx.x. Thread blocks with the same blockIdx.x use a
  // global buffer of size blockDim.x * gridDim.y for each value in a
  // group.
  // Advance the global buffer pointers to the location of the values
  // to accumulate for the first group value (i.e., gi == 0 in the
  // below NumVals loop)
  global_buf_avg += iter_idx + blockIdx.x * BDIMX * gridDim.y;
  global_buf_var += iter_idx + blockIdx.x * BDIMX * gridDim.y;
  global_buf_N += iter_idx + blockIdx.x * BDIMX * gridDim.y;
  // Double buffering: second half of the buffer is used when flip is
  // set
  if (flip) {
    global_buf_avg += BDIMX * grid_size * NumVals;
    global_buf_var += BDIMX * grid_size * NumVals;
    global_buf_N += BDIMX * grid_size * NumVals;
  }
  // Since there's gridDim.y elements to reduce using blockDim.y
  // threads, loop over gridDim.y with stride blockDim.y. First, just
  // grab the values in the global memory.
  if (red_idx < gridDim.y) {
    int work_buf_offset = red_idx * BDIMX;
    // N is constant across NumVals
    results.N() = global_buf_N[work_buf_offset];
    // Just copy the first elements
#pragma unroll
    for (int gi = 0; gi < NumVals; ++gi) {
      results.avg(gi) = global_buf_avg[work_buf_offset];
      results.var(gi) = global_buf_var[work_buf_offset];
      // Each group value is strided by grid_size * BDIMX in the buffer
      work_buf_offset += grid_size * BDIMX;
    }
  }
  // Accumulate into results by looping over the remaining results in
  // the global buffer
  for (int ri = red_idx + num_threads_per_reduction; ri < gridDim.y;
       ri += num_threads_per_reduction) {
    int work_buf_offset = ri * BDIMX;
    // N is constant across NumVals
    const auto g_N = global_buf_N[work_buf_offset];
    nvfuser_index_t updated_N = results.N() + g_N;
    // Hoist the division by updated_N as it's invariant over the
    // NumVals loop
    DataType b_N_div_ab_N = updated_N != 0
        ? (((DataType)g_N) / ((DataType)updated_N))
        : (DataType)0;
    DataType a_N_b_N_div_ab_N = ((DataType)results.N()) * b_N_div_ab_N;
#pragma unroll
    for (int gi = 0; gi < NumVals; ++gi) {
      auto& a_avg = results.avg(gi);
      auto& a_var = results.var(gi);
      auto b_avg = global_buf_avg[work_buf_offset];
      auto b_var = global_buf_var[work_buf_offset];
      work_buf_offset += grid_size * BDIMX;
      // Welford merge: avg_ab = avg_a + delta * N_b / N_ab
      //                M2_ab = M2_a + M2_b + delta^2 * N_a * N_b / N_ab
      auto delta = b_avg - a_avg;
      a_avg += delta * b_N_div_ab_N;
      a_var += b_var + delta * delta * a_N_b_N_div_ab_N;
    }
    results.N() = updated_N;
  }
  return results;
}
} // namespace impl
// Grouped grid welford optimized for outer reductions: the X parallel
// types must map to iteration domains and the Y types to reduction
// domains, with Z unused (enforced by the static_assert below).
//
// out_avg/out_var/out_N: per-group outputs; every out_N entry receives
// the same final count.
// in_avg/in_var: per-group inputs; in_N: count shared across the group.
// global_buf_*: global-memory work buffers for inter-block
// accumulation (double-buffered when PERSISTENT_REDUCTION).
// shared_buf: shared-memory scratch used by the block reduction and
// the final broadcast.
// global_sync_buffer: per-blockIdx.x grid synchronization flags.
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <
    bool Aligned,
    int NumVals,
    typename DataType,
    int BDIMX,
    int BDIMY,
    typename BlockDimT>
__device__ __inline__ void ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    welfordGroupOuter(
        DataType out_avg[NumVals],
        DataType out_var[NumVals],
        nvfuser_index_t out_N[NumVals],
        const DataType in_avg[NumVals],
        const DataType in_var[NumVals],
        nvfuser_index_t in_N,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
        // there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the dimension of the compute warps.
        BlockDimT block_dim,
        DataType* global_buf_avg,
        DataType* global_buf_var,
        nvfuser_index_t* global_buf_N,
        DataType* shared_buf,
        int64_t* global_sync_buffer) {
  using namespace fused_reduction::impl;
  static_assert(
      isIter(X_BLOCK) && isReduce(Y_BLOCK) && inactive(Z_BLOCK) &&
          isIter(X_THREAD) && isReduce(Y_THREAD) && inactive(Z_THREAD),
      "Invalid parallelization for outer welford reduction");
  static_assert(
      BDIMY % NumVals == 0, "blockDim.y must be divisible by group count");
  static_assert(BDIMX <= 32, "blockDim.x must be up to 32.");
  static_assert(
      (BDIMX * BDIMY) % 32 == 0, "Number of threads must be a multiple of 32.");
  static_assert(32 % BDIMX == 0, "blockDim.x must be able to divide 32.");
  static_assert(
      NumVals >= (32 / BDIMX), "Group count must be >= 32 / blockDim.x");
  // Copy inputs into the outputs, which serve as the working buffers
  // of the block reduction below
#pragma unroll
  for (int i = 0; i < NumVals; ++i) {
    out_avg[i] = in_avg[i];
    out_var[i] = in_var[i];
  }
  auto per_block_result =
      impl::blockWelfordOuter<Aligned, NumVals, DataType, BDIMX, BDIMY>(
          out_avg, out_var, in_N, block_dim, shared_buf);
  // At this point, threads with tid_in_group == 0 have valid partial
  // results. Store them to global buffer.
  const int grid_size = gridDim.x * gridDim.y;
  const int iter_idx = threadIdx.x;
  // Stores the partial results into the global work buffer. Only
  // threads with warp_tidy == 0 and wid < NumVals hold valid partial
  // results.
  const int wid = (threadIdx.x + threadIdx.y * BDIMX) / 32;
  constexpr int wdimy = 32 / BDIMX;
  const int warp_tidy = threadIdx.y % wdimy;
  const bool has_valid_block_reduction_result = warp_tidy == 0 && wid < NumVals;
  // Each valid result is held by a warp
  const int valid_group_idx = wid;
  if (has_valid_block_reduction_result) {
    int work_buf_offset = iter_idx + blockIdx.y * BDIMX +
        blockIdx.x * BDIMX * gridDim.y + valid_group_idx * BDIMX * grid_size;
    // Double buffering for persistent reductions
    if (PERSISTENT_REDUCTION && flip) {
      auto global_buffer_size = BDIMX * grid_size * NumVals;
      work_buf_offset += global_buffer_size;
    }
    global_buf_avg[work_buf_offset] = per_block_result.avg;
    global_buf_var[work_buf_offset] = per_block_result.var;
    // the count values should be the same across the group, so just
    // store once
    if (valid_group_idx == 0) {
      global_buf_N[work_buf_offset] = per_block_result.N;
    }
  }
  flip = !flip;
  // -- GLOBAL BUFFER FILLED -- //
  bool last_block = index_utils::
      maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
          blockIdx, gridDim);
  grid_sync::sync<
      isReduce(X_BLOCK),
      isReduce(Y_BLOCK),
      isReduce(Z_BLOCK),
      PERSISTENT_REDUCTION,
      Aligned>(
      global_sync_buffer[blockIdx.x], gridDim.y, last_block, block_dim);
  // Accumulate the per-block partial results; read the half of the
  // double buffer written above (flip was toggled, hence !flip)
  auto partial_results =
      welfordGroupAccumulateGlobalBuffer<NumVals, DataType, BDIMX, BDIMY>(
          global_buf_avg, global_buf_var, global_buf_N, !flip);
  auto per_block_final_result =
      impl::blockWelfordOuter<Aligned, NumVals, DataType, BDIMX, BDIMY>(
          partial_results.avg_.array,
          partial_results.var_.array,
          partial_results.N_,
          block_dim,
          shared_buf);
  // At this point, each thread of the groups with tid_in_group=0
  // has the final reduction result. We need to upload them to
  // shmem for broadcasting.
  if (has_valid_block_reduction_result) {
    copyFromTripletToSmem<NumVals, DataType, BDIMX, BDIMY>(
        shared_buf, iter_idx, valid_group_idx, per_block_final_result);
  }
  __syncthreads();
  // Broadcast the final results to all threads through shared memory
#pragma unroll
  for (int i = 0; i < NumVals; ++i) {
    WelfordTriplet<DataType> final_result;
    copyFromSmemToTriplet<NumVals, DataType, BDIMX, BDIMY>(
        final_result, shared_buf, iter_idx, i);
    out_avg[i] = final_result.avg;
    out_var[i] = final_result.var;
    // The count is identical across the group; the last read wins
    in_N = final_result.N;
  }
#pragma unroll
  for (int i = 0; i < NumVals; ++i) {
    out_N[i] = in_N;
  }
  // Forward protect the smem buffer
  __syncthreads();
}
// Instrumented overload of welfordGroupOuter: forwards to the main
// implementation while accumulating an elapsed-cycle total and an
// invocation count, sampled by thread (0,0,0) of the last block in
// the grid.
template <
    int X_BLOCK,
    int Y_BLOCK,
    int Z_BLOCK,
    int X_THREAD,
    int Y_THREAD,
    int Z_THREAD,
    bool PERSISTENT_REDUCTION,
    bool BROADCAST>
template <
    bool Aligned,
    int NumVals,
    typename DataType,
    int BDIMX,
    int BDIMY,
    typename BlockDimT>
__device__ __inline__ void ParallelReduce<
    X_BLOCK,
    Y_BLOCK,
    Z_BLOCK,
    X_THREAD,
    Y_THREAD,
    Z_THREAD,
    PERSISTENT_REDUCTION,
    BROADCAST>::
    welfordGroupOuter(
        DataType out_avg[NumVals],
        DataType out_var[NumVals],
        nvfuser_index_t out_N[NumVals],
        const DataType in_avg[NumVals],
        const DataType in_var[NumVals],
        nvfuser_index_t in_N,
        // block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
        // there is no warp specialization in the kernel. If there is warp
        // specialization, block_dim is the the dimension of the compute warps.
        BlockDimT block_dim,
        DataType* global_buf_avg,
        DataType* global_buf_var,
        nvfuser_index_t* global_buf_N,
        DataType* shared_buf,
        int64_t* global_sync_buffer,
        int64_t& cycles,
        int64_t& count) {
  // Sample the cycle counter on entry, only on the measuring thread
  int64_t entry_cycles = 0;
  if (isLastBlockInGrid() &&
      index_utils::maskedIsZero<true, true, true>(threadIdx)) {
    entry_cycles = readCycleCounter();
  }
  // Delegate to the non-instrumented implementation
  welfordGroupOuter<Aligned, NumVals, DataType, BDIMX, BDIMY>(
      out_avg,
      out_var,
      out_N,
      in_avg,
      in_var,
      in_N,
      block_dim,
      global_buf_avg,
      global_buf_var,
      global_buf_N,
      shared_buf,
      global_sync_buffer);
  // Accumulate elapsed cycles and bump the call count
  if (isLastBlockInGrid() &&
      index_utils::maskedIsZero<true, true, true>(threadIdx)) {
    cycles += readCycleCounter() - entry_cycles;
    count = count + 1;
  }
}
} // namespace fused_reduction
// Codegen generated code
Test Diffs
Toggle All
1: GpuViewTest.FusionReshapePersistentShmoo
Kernel 32
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 18
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T12) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T13;
T13[0] = 0;
T13[0]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T13[0]
+ T14[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T7;
T7[0]
= T4[0]
- T4[0];
Array<float, 1, 1> T8;
T8[0]
= expf(T7[0]);
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T8[0]);
Array<float, 1, 1> T15;
T15[0]
= T8[0]
* T11[0];
T12[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T15[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T12) {
if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
= T1[(((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T13;
T13[0] = 0;
T13[0]
= T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T13[0]
+ T14[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T7;
T7[0]
= T4[0]
- T4[0];
Array<float, 1, 1> T8;
T8[0]
= expf(T7[0]);
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T8[0]);
Array<float, 1, 1> T15;
T15[0]
= T8[0]
* T11[0];
T12[((nvfuser_index_t)threadIdx.x)]
= T15[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,15 +1,15 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T12) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
- = T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T1[(((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T13;
T13[0] = 0;
T13[0]
- = T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T13[0]
+ T14[0];
Array<float, 1, 1> T3;
@@ -30,9 +30,9 @@
= reciprocal(T8[0]);
Array<float, 1, 1> T15;
T15[0]
= T8[0]
* T11[0];
- T12[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T12[((nvfuser_index_t)threadIdx.x)]
= T15[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0[40],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1[40],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<2>;
.reg .f32 %f<23>;
.reg .b32 %r<113>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_89a70c69_1911010nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
mov.u32 %r70, %ctaid.x;
shl.b32 %r71, %r70, 7;
mov.u32 %r72, %tid.x;
add.s32 %r9, %r71, %r72;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
mul.hi.s32 %r73, %r9, -2004318071;
add.s32 %r74, %r73, %r9;
shr.u32 %r75, %r74, 31;
shr.s32 %r76, %r74, 5;
add.s32 %r77, %r76, %r75;
mul.hi.s32 %r78, %r9, 1717986919;
shr.u32 %r79, %r78, 31;
shr.s32 %r80, %r78, 3;
add.s32 %r81, %r80, %r79;
mul.hi.s32 %r82, %r81, 1431655766;
shr.u32 %r83, %r82, 31;
add.s32 %r84, %r82, %r83;
mul.lo.s32 %r85, %r84, 3;
sub.s32 %r86, %r81, %r85;
mul.lo.s32 %r87, %r53, %r86;
mul.lo.s32 %r88, %r81, 20;
sub.s32 %r89, %r9, %r88;
mul.hi.s32 %r90, %r89, 1717986919;
shr.u32 %r91, %r90, 31;
shr.s32 %r92, %r90, 2;
add.s32 %r93, %r92, %r91;
shl.b32 %r94, %r93, 1;
mul.lo.s32 %r95, %r93, 10;
sub.s32 %r96, %r89, %r95;
mul.hi.s32 %r97, %r96, 1717986919;
shr.u32 %r98, %r97, 31;
shr.s32 %r99, %r97, 1;
add.s32 %r100, %r99, %r98;
mul.lo.s32 %r101, %r100, 5;
sub.s32 %r102, %r96, %r101;
add.s32 %r103, %r94, %r100;
mad.lo.s32 %r104, %r52, %r77, %r87;
mad.lo.s32 %r105, %r55, %r102, %r104;
mad.lo.s32 %r106, %r103, %r54, %r105;
mul.wide.s32 %rd7, %r106, 4;
add.s64 %rd8, %rd4, %rd7;
mul.lo.s32 %r107, %r45, %r86;
mad.lo.s32 %r108, %r44, %r77, %r107;
mad.lo.s32 %r109, %r102, %r47, %r108;
mad.lo.s32 %r110, %r103, %r46, %r109;
mul.wide.s32 %rd9, %r110, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
sub.f32 %f4, %f3, %f3;
mov.f32 %f5, 0f3F000000;
mov.f32 %f6, 0f3BBB989D;
fma.rn.f32 %f7, %f4, %f6, %f5;
cvt.sat.f32.f32 %f8, %f7;
mov.f32 %f9, 0f4B400001;
mov.f32 %f10, 0f437C0000;
fma.rm.f32 %f11, %f8, %f10, %f9;
add.f32 %f12, %f11, 0fCB40007F;
neg.f32 %f13, %f12;
mov.f32 %f14, 0f3FB8AA3B;
fma.rn.f32 %f15, %f4, %f14, %f13;
mov.f32 %f16, 0f32A57060;
fma.rn.f32 %f17, %f4, %f16, %f15;
mov.b32 %r111, %f11;
shl.b32 %r112, %r111, 23;
mov.b32 %f18, %r112;
ex2.approx.ftz.f32 %f19, %f17;
mul.f32 %f20, %f19, %f18;
rcp.rn.f32 %f21, %f20;
mul.f32 %f22, %f20, %f21;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd6, %rd11;
st.global.f32 [%rd12], %f22;
$L__BB0_2:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0[40],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1[40],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<2>;
.reg .f32 %f<23>;
.reg .b32 %r<110>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_37_cu_63f423c9_1601110nvfuser_37ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
mov.u32 %r9, %tid.x;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
mul.hi.s32 %r70, %r9, -2004318071;
add.s32 %r71, %r70, %r9;
shr.u32 %r72, %r71, 31;
shr.s32 %r73, %r71, 5;
add.s32 %r74, %r73, %r72;
mul.hi.s32 %r75, %r9, 1717986919;
shr.u32 %r76, %r75, 31;
shr.s32 %r77, %r75, 3;
add.s32 %r78, %r77, %r76;
mul.hi.s32 %r79, %r78, 1431655766;
shr.u32 %r80, %r79, 31;
add.s32 %r81, %r79, %r80;
mul.lo.s32 %r82, %r81, 3;
sub.s32 %r83, %r78, %r82;
mul.lo.s32 %r84, %r53, %r83;
mul.lo.s32 %r85, %r78, 20;
sub.s32 %r86, %r9, %r85;
mul.hi.s32 %r87, %r86, 1717986919;
shr.u32 %r88, %r87, 31;
shr.s32 %r89, %r87, 2;
add.s32 %r90, %r89, %r88;
shl.b32 %r91, %r90, 1;
mul.lo.s32 %r92, %r90, 10;
sub.s32 %r93, %r86, %r92;
mul.hi.s32 %r94, %r93, 1717986919;
shr.u32 %r95, %r94, 31;
shr.s32 %r96, %r94, 1;
add.s32 %r97, %r96, %r95;
mul.lo.s32 %r98, %r97, 5;
sub.s32 %r99, %r93, %r98;
add.s32 %r100, %r91, %r97;
mad.lo.s32 %r101, %r52, %r74, %r84;
mad.lo.s32 %r102, %r55, %r99, %r101;
mad.lo.s32 %r103, %r100, %r54, %r102;
mul.wide.s32 %rd7, %r103, 4;
add.s64 %rd8, %rd4, %rd7;
mul.lo.s32 %r104, %r45, %r83;
mad.lo.s32 %r105, %r44, %r74, %r104;
mad.lo.s32 %r106, %r47, %r99, %r105;
mad.lo.s32 %r107, %r100, %r46, %r106;
mul.wide.s32 %rd9, %r107, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
sub.f32 %f4, %f3, %f3;
mov.f32 %f5, 0f3F000000;
mov.f32 %f6, 0f3BBB989D;
fma.rn.f32 %f7, %f4, %f6, %f5;
cvt.sat.f32.f32 %f8, %f7;
mov.f32 %f9, 0f4B400001;
mov.f32 %f10, 0f437C0000;
fma.rm.f32 %f11, %f8, %f10, %f9;
add.f32 %f12, %f11, 0fCB40007F;
neg.f32 %f13, %f12;
mov.f32 %f14, 0f3FB8AA3B;
fma.rn.f32 %f15, %f4, %f14, %f13;
mov.f32 %f16, 0f32A57060;
fma.rn.f32 %f17, %f4, %f16, %f15;
mov.b32 %r108, %f11;
shl.b32 %r109, %r108, 23;
mov.b32 %f18, %r109;
ex2.approx.ftz.f32 %f19, %f17;
mul.f32 %f20, %f19, %f18;
rcp.rn.f32 %f21, %f20;
mul.f32 %f22, %f20, %f21;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd6, %rd11;
st.global.f32 [%rd12], %f22;
$L__BB0_2:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,72 +20,69 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<2>;
.reg .f32 %f<23>;
- .reg .b32 %r<113>;
+ .reg .b32 %r<110>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
- mov.u32 %r70, %ctaid.x;
- shl.b32 %r71, %r70, 7;
- mov.u32 %r72, %tid.x;
- add.s32 %r9, %r71, %r72;
+ mov.u32 %r9, %tid.x;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
- mul.hi.s32 %r73, %r9, -2004318071;
- add.s32 %r74, %r73, %r9;
- shr.u32 %r75, %r74, 31;
- shr.s32 %r76, %r74, 5;
- add.s32 %r77, %r76, %r75;
- mul.hi.s32 %r78, %r9, 1717986919;
- shr.u32 %r79, %r78, 31;
- shr.s32 %r80, %r78, 3;
- add.s32 %r81, %r80, %r79;
- mul.hi.s32 %r82, %r81, 1431655766;
- shr.u32 %r83, %r82, 31;
- add.s32 %r84, %r82, %r83;
- mul.lo.s32 %r85, %r84, 3;
- sub.s32 %r86, %r81, %r85;
- mul.lo.s32 %r87, %r53, %r86;
- mul.lo.s32 %r88, %r81, 20;
- sub.s32 %r89, %r9, %r88;
- mul.hi.s32 %r90, %r89, 1717986919;
- shr.u32 %r91, %r90, 31;
- shr.s32 %r92, %r90, 2;
- add.s32 %r93, %r92, %r91;
- shl.b32 %r94, %r93, 1;
- mul.lo.s32 %r95, %r93, 10;
- sub.s32 %r96, %r89, %r95;
- mul.hi.s32 %r97, %r96, 1717986919;
- shr.u32 %r98, %r97, 31;
- shr.s32 %r99, %r97, 1;
- add.s32 %r100, %r99, %r98;
- mul.lo.s32 %r101, %r100, 5;
- sub.s32 %r102, %r96, %r101;
- add.s32 %r103, %r94, %r100;
- mad.lo.s32 %r104, %r52, %r77, %r87;
- mad.lo.s32 %r105, %r55, %r102, %r104;
- mad.lo.s32 %r106, %r103, %r54, %r105;
- mul.wide.s32 %rd7, %r106, 4;
+ mul.hi.s32 %r70, %r9, -2004318071;
+ add.s32 %r71, %r70, %r9;
+ shr.u32 %r72, %r71, 31;
+ shr.s32 %r73, %r71, 5;
+ add.s32 %r74, %r73, %r72;
+ mul.hi.s32 %r75, %r9, 1717986919;
+ shr.u32 %r76, %r75, 31;
+ shr.s32 %r77, %r75, 3;
+ add.s32 %r78, %r77, %r76;
+ mul.hi.s32 %r79, %r78, 1431655766;
+ shr.u32 %r80, %r79, 31;
+ add.s32 %r81, %r79, %r80;
+ mul.lo.s32 %r82, %r81, 3;
+ sub.s32 %r83, %r78, %r82;
+ mul.lo.s32 %r84, %r53, %r83;
+ mul.lo.s32 %r85, %r78, 20;
+ sub.s32 %r86, %r9, %r85;
+ mul.hi.s32 %r87, %r86, 1717986919;
+ shr.u32 %r88, %r87, 31;
+ shr.s32 %r89, %r87, 2;
+ add.s32 %r90, %r89, %r88;
+ shl.b32 %r91, %r90, 1;
+ mul.lo.s32 %r92, %r90, 10;
+ sub.s32 %r93, %r86, %r92;
+ mul.hi.s32 %r94, %r93, 1717986919;
+ shr.u32 %r95, %r94, 31;
+ shr.s32 %r96, %r94, 1;
+ add.s32 %r97, %r96, %r95;
+ mul.lo.s32 %r98, %r97, 5;
+ sub.s32 %r99, %r93, %r98;
+ add.s32 %r100, %r91, %r97;
+ mad.lo.s32 %r101, %r52, %r74, %r84;
+ mad.lo.s32 %r102, %r55, %r99, %r101;
+ mad.lo.s32 %r103, %r100, %r54, %r102;
+ mul.wide.s32 %rd7, %r103, 4;
add.s64 %rd8, %rd4, %rd7;
- mul.lo.s32 %r107, %r45, %r86;
- mad.lo.s32 %r108, %r44, %r77, %r107;
- mad.lo.s32 %r109, %r102, %r47, %r108;
- mad.lo.s32 %r110, %r103, %r46, %r109;
- mul.wide.s32 %rd9, %r110, 4;
+ mul.lo.s32 %r104, %r45, %r83;
+ mad.lo.s32 %r105, %r44, %r74, %r104;
+ mad.lo.s32 %r106, %r47, %r99, %r105;
+ mad.lo.s32 %r107, %r100, %r46, %r106;
+ mul.wide.s32 %rd9, %r107, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
sub.f32 %f4, %f3, %f3;
@@ -100,13 +97,13 @@
neg.f32 %f13, %f12;
mov.f32 %f14, 0f3FB8AA3B;
fma.rn.f32 %f15, %f4, %f14, %f13;
mov.f32 %f16, 0f32A57060;
fma.rn.f32 %f17, %f4, %f16, %f15;
- mov.b32 %r111, %f11;
- shl.b32 %r112, %r111, 23;
- mov.b32 %f18, %r112;
+ mov.b32 %r108, %f11;
+ shl.b32 %r109, %r108, 23;
+ mov.b32 %f18, %r109;
ex2.approx.ftz.f32 %f19, %f17;
mul.f32 %f20, %f19, %f18;
rcp.rn.f32 %f21, %f20;
mul.f32 %f22, %f20, %f21;
mul.wide.s32 %rd11, %r9, 4;
Kernel 36
CUDA
PTX
53997da5d
Diff
03a1b695e
-8
+8 index type: int
registers: 14
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 5, 5> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 4, 4> T13;
T13.set(float(NEG_INFINITY));
if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T15[0] = fmax(
T15[0],
T13[i0]);
}
Array<float, 1, 1> T5;
T5[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T5[0], T15[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T8 = T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
Array<float, 1, 1> T7;
T7[0]
= T13[i1]
- T6[0];
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
if ((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
T16[0]
= T16[0]
+ T8[i2];
}
}
}
Array<float, 1, 1> T9;
T9[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T9[0], T16[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
if ((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 5, 5> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 4, 4> T13;
T13.set(float(NEG_INFINITY));
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T15[0] = fmax(
T15[0],
T13[i0]);
}
Array<float, 1, 1> T5;
T5[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T5[0], T15[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T8 = T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
Array<float, 1, 1> T7;
T7[0]
= T13[i1]
- T6[0];
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
T16[0]
= T16[0]
+ T8[i2];
}
}
}
Array<float, 1, 1> T9;
T9[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T9[0], T16[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,12 +1,12 @@
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 5, 5> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 4, 4> T13;
T13.set(float(NEG_INFINITY));
- if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
- loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
+ loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
@@ -30,21 +30,21 @@
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
- if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
- if ((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
T16[0]
= T16[0]
+ T8[i2];
}
}
@@ -55,27 +55,27 @@
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
- if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
- loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
+ loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
- if ((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
- loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
+ loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_191105arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_1911010nvfuser_41ENS_6TensorIfLi5ELi5EEES1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_1911010nvfuser_41ENS_6TensorIfLi5ELi5EEES1__param_0[48],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_1911010nvfuser_41ENS_6TensorIfLi5ELi5EEES1__param_1[48]
)
{
.reg .pred %p<47>;
.reg .f32 %f<125>;
.reg .b32 %r<105>;
.reg .b64 %rd<27>;
ld.param.u64 %rd7, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_1911010nvfuser_41ENS_6TensorIfLi5ELi5EEES1__param_1];
ld.param.u64 %rd6, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_1911010nvfuser_41ENS_6TensorIfLi5ELi5EEES1__param_0];
mov.u32 %r1, %tid.x;
setp.lt.s32 %p2, %r1, 1;
mov.f32 %f118, 0fFF800000;
mov.f32 %f119, 0fFF800000;
mov.f32 %f120, 0fFF800000;
mov.f32 %f121, 0fFF800000;
@%p2 bra $L__BB0_1;
bra.uni $L__BB0_3;
$L__BB0_1:
mov.u32 %r56, %tid.y;
mov.u32 %r57, %ctaid.x;
mov.u32 %r58, %ntid.y;
mad.lo.s32 %r2, %r58, %r57, %r56;
setp.gt.s32 %p3, %r2, 241;
@%p3 bra $L__BB0_3;
add.s32 %r63, %r2, %r1;
shl.b32 %r64, %r63, 2;
mul.wide.s32 %rd9, %r64, 4;
add.s64 %rd8, %rd6, %rd9;
// begin inline asm
ld.global.cs.v4.u32 {%r59,%r60,%r61,%r62}, [%rd8];
// end inline asm
mov.b32 %f118, %r59;
mov.b32 %f119, %r60;
mov.b32 %f120, %r61;
mov.b32 %f121, %r62;
$L__BB0_3:
setp.gt.f32 %p4, %f118, %f119;
setp.nan.f32 %p5, %f118, %f118;
or.pred %p6, %p5, %p4;
selp.f32 %f35, %f118, %f119, %p6;
setp.nan.f32 %p7, %f35, %f35;
setp.gt.f32 %p8, %f35, %f120;
or.pred %p9, %p7, %p8;
selp.f32 %f36, %f35, %f120, %p9;
setp.nan.f32 %p10, %f36, %f36;
setp.gt.f32 %p11, %f36, %f121;
or.pred %p12, %p10, %p11;
selp.f32 %f37, %f36, %f121, %p12;
mov.u32 %r65, %tid.z;
mov.u32 %r3, %ntid.y;
mov.u32 %r4, %tid.y;
mad.lo.s32 %r5, %r3, %r65, %r4;
mov.u32 %r6, %ntid.x;
mad.lo.s32 %r7, %r5, %r6, %r1;
mul.wide.u32 %rd10, %r7, 4;
mov.u64 %rd11, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_06ff237a_191105arrayE;
add.s64 %rd1, %rd11, %rd10;
st.shared.f32 [%rd1], %f37;
bar.sync 0;
clz.b32 %r66, %r6;
mov.u32 %r67, 31;
sub.s32 %r68, %r67, %r66;
mov.u32 %r69, 1;
shl.b32 %r8, %r69, %r68;
setp.lt.u32 %p13, %r1, %r8;
add.s32 %r70, %r8, %r1;
setp.lt.u32 %p14, %r70, %r6;
and.pred %p1, %p13, %p14;
add.s32 %r71, %r7, %r8;
mul.wide.s32 %rd12, %r71, 4;
add.s64 %rd2, %rd11, %rd12;
not.pred %p15, %p1;
@%p15 bra $L__BB0_5;
ld.shared.f32 %f38, [%rd2];
ld.shared.f32 %f39, [%rd1];
setp.nan.f32 %p16, %f39, %f39;
setp.gt.f32 %p17, %f39, %f38;
or.pred %p18, %p16, %p17;
selp.f32 %f40, %f39, %f38, %p18;
st.shared.f32 [%rd1], %f40;
$L__BB0_5:
bar.sync 0;
shr.u32 %r72, %r8, 31;
add.s32 %r73, %r8, %r72;
shr.s32 %r104, %r73, 1;
setp.lt.s32 %p19, %r8, 4;
@%p19 bra $L__BB0_10;
mov.u32 %r103, %r104;
$L__BB0_7:
setp.ge.u32 %p20, %r1, %r103;
@%p20 bra $L__BB0_9;
add.s32 %r74, %r103, %r7;
mul.wide.s32 %rd13, %r74, 4;
add.s64 %rd15, %rd11, %rd13;
ld.shared.f32 %f41, [%rd1];
setp.nan.f32 %p21, %f41, %f41;
ld.shared.f32 %f42, [%rd15];
setp.gt.f32 %p22, %f41, %f42;
or.pred %p23, %p21, %p22;
selp.f32 %f43, %f41, %f42, %p23;
st.shared.f32 [%rd1], %f43;
$L__BB0_9:
bar.sync 0;
shr.u32 %r11, %r103, 1;
setp.gt.u32 %p24, %r103, 3;
mov.u32 %r103, %r11;
@%p24 bra $L__BB0_7;
$L__BB0_10:
setp.ne.s32 %p25, %r1, 0;
add.s32 %r75, %r7, 1;
mul.wide.u32 %rd16, %r75, 4;
add.s64 %rd3, %rd11, %rd16;
mov.f32 %f122, 0fFF800000;
@%p25 bra $L__BB0_13;
ld.shared.f32 %f122, [%rd1];
setp.lt.u32 %p26, %r6, 2;
@%p26 bra $L__BB0_13;
ld.shared.f32 %f45, [%rd3];
setp.gt.f32 %p27, %f122, %f45;
setp.nan.f32 %p28, %f122, %f122;
or.pred %p29, %p28, %p27;
selp.f32 %f122, %f122, %f45, %p29;
$L__BB0_13:
bar.sync 0;
mul.wide.s32 %rd18, %r5, 4;
add.s64 %rd4, %rd11, %rd18;
setp.eq.s32 %p30, %r1, 0;
@%p30 bra $L__BB0_14;
bra.uni $L__BB0_15;
$L__BB0_14:
st.shared.f32 [%rd4], %f122;
$L__BB0_15:
bar.sync 0;
ld.shared.f32 %f46, [%rd4];
bar.sync 0;
sub.f32 %f47, %f118, %f46;
mov.f32 %f48, 0f3F000000;
mov.f32 %f49, 0f3BBB989D;
fma.rn.f32 %f50, %f47, %f49, %f48;
cvt.sat.f32.f32 %f51, %f50;
mov.f32 %f52, 0f4B400001;
mov.f32 %f53, 0f437C0000;
fma.rm.f32 %f54, %f51, %f53, %f52;
add.f32 %f55, %f54, 0fCB40007F;
neg.f32 %f56, %f55;
mov.f32 %f57, 0f3FB8AA3B;
fma.rn.f32 %f58, %f47, %f57, %f56;
mov.f32 %f59, 0f32A57060;
fma.rn.f32 %f60, %f47, %f59, %f58;
mov.b32 %r76, %f54;
shl.b32 %r77, %r76, 23;
mov.b32 %f61, %r77;
ex2.approx.ftz.f32 %f62, %f60;
mul.f32 %f12, %f62, %f61;
sub.f32 %f63, %f119, %f46;
fma.rn.f32 %f64, %f63, %f49, %f48;
cvt.sat.f32.f32 %f65, %f64;
fma.rm.f32 %f66, %f65, %f53, %f52;
add.f32 %f67, %f66, 0fCB40007F;
neg.f32 %f68, %f67;
fma.rn.f32 %f69, %f63, %f57, %f68;
fma.rn.f32 %f70, %f63, %f59, %f69;
mov.b32 %r78, %f66;
shl.b32 %r79, %r78, 23;
mov.b32 %f71, %r79;
ex2.approx.ftz.f32 %f72, %f70;
mul.f32 %f13, %f72, %f71;
sub.f32 %f73, %f120, %f46;
fma.rn.f32 %f74, %f73, %f49, %f48;
cvt.sat.f32.f32 %f75, %f74;
fma.rm.f32 %f76, %f75, %f53, %f52;
add.f32 %f77, %f76, 0fCB40007F;
neg.f32 %f78, %f77;
fma.rn.f32 %f79, %f73, %f57, %f78;
fma.rn.f32 %f80, %f73, %f59, %f79;
mov.b32 %r80, %f76;
shl.b32 %r81, %r80, 23;
mov.b32 %f81, %r81;
ex2.approx.ftz.f32 %f82, %f80;
mul.f32 %f14, %f82, %f81;
sub.f32 %f83, %f121, %f46;
fma.rn.f32 %f84, %f83, %f49, %f48;
cvt.sat.f32.f32 %f85, %f84;
fma.rm.f32 %f86, %f85, %f53, %f52;
add.f32 %f87, %f86, 0fCB40007F;
neg.f32 %f88, %f87;
fma.rn.f32 %f89, %f83, %f57, %f88;
fma.rn.f32 %f90, %f83, %f59, %f89;
mov.b32 %r82, %f86;
shl.b32 %r83, %r82, 23;
mov.b32 %f91, %r83;
ex2.approx.ftz.f32 %f92, %f90;
mul.f32 %f15, %f92, %f91;
@%p2 bra $L__BB0_16;
bra.uni $L__BB0_17;
$L__BB0_16:
mov.u32 %r84, %ctaid.x;
mad.lo.s32 %r85, %r3, %r84, %r4;
setp.lt.s32 %p32, %r85, 242;
@%p32 bra $L__BB0_18;
bra.uni $L__BB0_17;
$L__BB0_18:
add.f32 %f97, %f12, 0f00000000;
add.f32 %f98, %f97, %f13;
add.f32 %f99, %f98, %f14;
add.f32 %f123, %f99, %f15;
bra.uni $L__BB0_19;
$L__BB0_17:
mov.u32 %r86, %ctaid.x;
mad.lo.s32 %r87, %r3, %r86, %r4;
setp.lt.s32 %p34, %r87, 242;
and.pred %p35, %p2, %p34;
add.f32 %f93, %f12, 0f00000000;
add.f32 %f94, %f93, %f13;
add.f32 %f95, %f94, %f14;
add.f32 %f96, %f95, %f15;
selp.f32 %f123, %f96, 0f00000000, %p35;
$L__BB0_19:
st.shared.f32 [%rd1], %f123;
bar.sync 0;
@%p15 bra $L__BB0_21;
ld.shared.f32 %f100, [%rd2];
ld.shared.f32 %f101, [%rd1];
add.f32 %f102, %f100, %f101;
st.shared.f32 [%rd1], %f102;
$L__BB0_21:
bar.sync 0;
@%p19 bra $L__BB0_25;
$L__BB0_22:
setp.ge.u32 %p38, %r1, %r104;
@%p38 bra $L__BB0_24;
add.s32 %r88, %r104, %r7;
mul.wide.s32 %rd20, %r88, 4;
add.s64 %rd22, %rd11, %rd20;
ld.shared.f32 %f103, [%rd1];
ld.shared.f32 %f104, [%rd22];
add.f32 %f105, %f104, %f103;
st.shared.f32 [%rd1], %f105;
$L__BB0_24:
bar.sync 0;
shr.u32 %r13, %r104, 1;
setp.gt.u32 %p39, %r104, 3;
mov.u32 %r104, %r13;
@%p39 bra $L__BB0_22;
$L__BB0_25:
mov.f32 %f124, 0f00000000;
@%p25 bra $L__BB0_28;
ld.shared.f32 %f107, [%rd1];
add.f32 %f124, %f107, 0f00000000;
setp.lt.u32 %p41, %r6, 2;
@%p41 bra $L__BB0_28;
ld.shared.f32 %f108, [%rd3];
add.f32 %f124, %f124, %f108;
$L__BB0_28:
bar.sync 0;
@%p25 bra $L__BB0_30;
st.shared.f32 [%rd4], %f124;
$L__BB0_30:
setp.gt.s32 %p43, %r1, 0;
bar.sync 0;
ld.shared.f32 %f109, [%rd4];
bar.sync 0;
rcp.rn.f32 %f22, %f109;
@%p43 bra $L__BB0_32;
mov.u32 %r89, %ctaid.x;
mad.lo.s32 %r14, %r3, %r89, %r4;
setp.lt.s32 %p44, %r14, 242;
@%p44 bra $L__BB0_35;
bra.uni $L__BB0_32;
$L__BB0_35:
mul.f32 %f110, %f22, %f12;
mov.b32 %r97, %f110;
mul.f32 %f111, %f22, %f13;
mov.b32 %r98, %f111;
mul.f32 %f112, %f22, %f14;
mov.b32 %r99, %f112;
mul.f32 %f113, %f22, %f15;
mov.b32 %r100, %f113;
add.s32 %r101, %r14, %r1;
shl.b32 %r102, %r101, 2;
mul.wide.s32 %rd26, %r102, 4;
add.s64 %rd25, %rd7, %rd26;
// begin inline asm
st.global.cs.v4.s32 [%rd25], {%r97,%r98,%r99,%r100};
// end inline asm
bra.uni $L__BB0_36;
$L__BB0_32:
mul.f32 %f23, %f22, %f12;
mul.f32 %f24, %f22, %f13;
mul.f32 %f25, %f22, %f14;
mul.f32 %f26, %f22, %f15;
@%p43 bra $L__BB0_36;
mov.u32 %r90, %ctaid.x;
mad.lo.s32 %r15, %r3, %r90, %r4;
setp.gt.s32 %p46, %r15, 241;
@%p46 bra $L__BB0_36;
add.s32 %r95, %r15, %r1;
shl.b32 %r96, %r95, 2;
mul.wide.s32 %rd24, %r96, 4;
add.s64 %rd23, %rd7, %rd24;
mov.b32 %r91, %f23;
mov.b32 %r92, %f24;
mov.b32 %r93, %f25;
mov.b32 %r94, %f26;
// begin inline asm
st.global.cs.v4.s32 [%rd23], {%r91,%r92,%r93,%r94};
// end inline asm
$L__BB0_36:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_160115arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_1601110nvfuser_41ENS_6TensorIfLi5ELi5EEES1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_1601110nvfuser_41ENS_6TensorIfLi5ELi5EEES1__param_0[48],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_1601110nvfuser_41ENS_6TensorIfLi5ELi5EEES1__param_1[48]
)
{
.reg .pred %p<39>;
.reg .f32 %f<107>;
.reg .b32 %r<87>;
.reg .b64 %rd<24>;
ld.param.u64 %rd6, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_1601110nvfuser_41ENS_6TensorIfLi5ELi5EEES1__param_1];
ld.param.u64 %rd5, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_1601110nvfuser_41ENS_6TensorIfLi5ELi5EEES1__param_0];
mov.u32 %r54, %ctaid.x;
mov.u32 %r1, %ntid.y;
mov.u32 %r2, %tid.y;
mad.lo.s32 %r3, %r1, %r54, %r2;
setp.gt.s32 %p2, %r3, 241;
mov.f32 %f101, 0fFF800000;
mov.f32 %f102, %f101;
mov.f32 %f103, %f101;
mov.f32 %f104, %f101;
@%p2 bra $L__BB0_2;
shl.b32 %r59, %r3, 2;
mul.wide.s32 %rd8, %r59, 4;
add.s64 %rd7, %rd5, %rd8;
// begin inline asm
ld.global.cs.v4.u32 {%r55,%r56,%r57,%r58}, [%rd7];
// end inline asm
mov.b32 %f101, %r55;
mov.b32 %f102, %r56;
mov.b32 %f103, %r57;
mov.b32 %f104, %r58;
$L__BB0_2:
setp.gt.f32 %p3, %f101, %f102;
setp.nan.f32 %p4, %f101, %f101;
or.pred %p5, %p4, %p3;
selp.f32 %f24, %f101, %f102, %p5;
setp.nan.f32 %p6, %f24, %f24;
setp.gt.f32 %p7, %f24, %f103;
or.pred %p8, %p6, %p7;
selp.f32 %f25, %f24, %f103, %p8;
setp.nan.f32 %p9, %f25, %f25;
setp.gt.f32 %p10, %f25, %f104;
or.pred %p11, %p9, %p10;
selp.f32 %f26, %f25, %f104, %p11;
mov.u32 %r60, %tid.z;
mad.lo.s32 %r4, %r1, %r60, %r2;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r7, %r4, %r5, %r6;
mul.wide.u32 %rd9, %r7, 4;
mov.u64 %rd10, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_ecac0cda_160115arrayE;
add.s64 %rd1, %rd10, %rd9;
st.shared.f32 [%rd1], %f26;
bar.sync 0;
clz.b32 %r61, %r5;
mov.u32 %r62, 31;
sub.s32 %r63, %r62, %r61;
mov.u32 %r64, 1;
shl.b32 %r8, %r64, %r63;
setp.lt.u32 %p12, %r6, %r8;
add.s32 %r65, %r8, %r6;
setp.lt.u32 %p13, %r65, %r5;
and.pred %p1, %p12, %p13;
add.s32 %r66, %r7, %r8;
mul.wide.s32 %rd11, %r66, 4;
add.s64 %rd2, %rd10, %rd11;
not.pred %p14, %p1;
@%p14 bra $L__BB0_4;
ld.shared.f32 %f27, [%rd2];
ld.shared.f32 %f28, [%rd1];
setp.nan.f32 %p15, %f28, %f28;
setp.gt.f32 %p16, %f28, %f27;
or.pred %p17, %p15, %p16;
selp.f32 %f29, %f28, %f27, %p17;
st.shared.f32 [%rd1], %f29;
$L__BB0_4:
bar.sync 0;
shr.u32 %r67, %r8, 31;
add.s32 %r68, %r8, %r67;
shr.s32 %r86, %r68, 1;
setp.lt.s32 %p18, %r8, 4;
@%p18 bra $L__BB0_9;
mov.u32 %r85, %r86;
$L__BB0_6:
setp.ge.u32 %p19, %r6, %r85;
@%p19 bra $L__BB0_8;
add.s32 %r69, %r85, %r7;
mul.wide.s32 %rd12, %r69, 4;
add.s64 %rd14, %rd10, %rd12;
ld.shared.f32 %f30, [%rd1];
setp.nan.f32 %p20, %f30, %f30;
ld.shared.f32 %f31, [%rd14];
setp.gt.f32 %p21, %f30, %f31;
or.pred %p22, %p20, %p21;
selp.f32 %f32, %f30, %f31, %p22;
st.shared.f32 [%rd1], %f32;
$L__BB0_8:
bar.sync 0;
shr.u32 %r11, %r85, 1;
setp.gt.u32 %p23, %r85, 3;
mov.u32 %r85, %r11;
@%p23 bra $L__BB0_6;
$L__BB0_9:
setp.ne.s32 %p24, %r6, 0;
add.s32 %r70, %r7, 1;
mul.wide.u32 %rd15, %r70, 4;
add.s64 %rd3, %rd10, %rd15;
mov.f32 %f105, 0fFF800000;
@%p24 bra $L__BB0_12;
ld.shared.f32 %f105, [%rd1];
setp.lt.u32 %p25, %r5, 2;
@%p25 bra $L__BB0_12;
ld.shared.f32 %f34, [%rd3];
setp.gt.f32 %p26, %f105, %f34;
setp.nan.f32 %p27, %f105, %f105;
or.pred %p28, %p27, %p26;
selp.f32 %f105, %f105, %f34, %p28;
$L__BB0_12:
bar.sync 0;
mul.wide.s32 %rd17, %r4, 4;
add.s64 %rd4, %rd10, %rd17;
setp.eq.s32 %p29, %r6, 0;
@%p29 bra $L__BB0_13;
bra.uni $L__BB0_14;
$L__BB0_13:
st.shared.f32 [%rd4], %f105;
$L__BB0_14:
setp.lt.s32 %p30, %r3, 242;
bar.sync 0;
ld.shared.f32 %f35, [%rd4];
bar.sync 0;
sub.f32 %f36, %f101, %f35;
mov.f32 %f37, 0f3F000000;
mov.f32 %f38, 0f3BBB989D;
fma.rn.f32 %f39, %f36, %f38, %f37;
cvt.sat.f32.f32 %f40, %f39;
mov.f32 %f41, 0f4B400001;
mov.f32 %f42, 0f437C0000;
fma.rm.f32 %f43, %f40, %f42, %f41;
add.f32 %f44, %f43, 0fCB40007F;
neg.f32 %f45, %f44;
mov.f32 %f46, 0f3FB8AA3B;
fma.rn.f32 %f47, %f36, %f46, %f45;
mov.f32 %f48, 0f32A57060;
fma.rn.f32 %f49, %f36, %f48, %f47;
mov.b32 %r71, %f43;
shl.b32 %r72, %r71, 23;
mov.b32 %f50, %r72;
ex2.approx.ftz.f32 %f51, %f49;
mul.f32 %f12, %f51, %f50;
sub.f32 %f52, %f102, %f35;
fma.rn.f32 %f53, %f52, %f38, %f37;
cvt.sat.f32.f32 %f54, %f53;
fma.rm.f32 %f55, %f54, %f42, %f41;
add.f32 %f56, %f55, 0fCB40007F;
neg.f32 %f57, %f56;
fma.rn.f32 %f58, %f52, %f46, %f57;
fma.rn.f32 %f59, %f52, %f48, %f58;
mov.b32 %r73, %f55;
shl.b32 %r74, %r73, 23;
mov.b32 %f60, %r74;
ex2.approx.ftz.f32 %f61, %f59;
mul.f32 %f13, %f61, %f60;
sub.f32 %f62, %f103, %f35;
fma.rn.f32 %f63, %f62, %f38, %f37;
cvt.sat.f32.f32 %f64, %f63;
fma.rm.f32 %f65, %f64, %f42, %f41;
add.f32 %f66, %f65, 0fCB40007F;
neg.f32 %f67, %f66;
fma.rn.f32 %f68, %f62, %f46, %f67;
fma.rn.f32 %f69, %f62, %f48, %f68;
mov.b32 %r75, %f65;
shl.b32 %r76, %r75, 23;
mov.b32 %f70, %r76;
ex2.approx.ftz.f32 %f71, %f69;
mul.f32 %f14, %f71, %f70;
sub.f32 %f72, %f104, %f35;
fma.rn.f32 %f73, %f72, %f38, %f37;
cvt.sat.f32.f32 %f74, %f73;
fma.rm.f32 %f75, %f74, %f42, %f41;
add.f32 %f76, %f75, 0fCB40007F;
neg.f32 %f77, %f76;
fma.rn.f32 %f78, %f72, %f46, %f77;
fma.rn.f32 %f79, %f72, %f48, %f78;
mov.b32 %r77, %f75;
shl.b32 %r78, %r77, 23;
mov.b32 %f80, %r78;
ex2.approx.ftz.f32 %f81, %f79;
mul.f32 %f15, %f81, %f80;
add.f32 %f82, %f12, 0f00000000;
add.f32 %f83, %f82, %f13;
add.f32 %f84, %f83, %f14;
add.f32 %f85, %f84, %f15;
selp.f32 %f86, %f85, 0f00000000, %p30;
st.shared.f32 [%rd1], %f86;
bar.sync 0;
@%p14 bra $L__BB0_16;
ld.shared.f32 %f87, [%rd2];
ld.shared.f32 %f88, [%rd1];
add.f32 %f89, %f87, %f88;
st.shared.f32 [%rd1], %f89;
$L__BB0_16:
bar.sync 0;
@%p18 bra $L__BB0_20;
$L__BB0_17:
setp.ge.u32 %p33, %r6, %r86;
@%p33 bra $L__BB0_19;
add.s32 %r79, %r86, %r7;
mul.wide.s32 %rd19, %r79, 4;
add.s64 %rd21, %rd10, %rd19;
ld.shared.f32 %f90, [%rd1];
ld.shared.f32 %f91, [%rd21];
add.f32 %f92, %f91, %f90;
st.shared.f32 [%rd1], %f92;
$L__BB0_19:
bar.sync 0;
shr.u32 %r13, %r86, 1;
setp.gt.u32 %p34, %r86, 3;
mov.u32 %r86, %r13;
@%p34 bra $L__BB0_17;
$L__BB0_20:
mov.f32 %f106, 0f00000000;
@%p24 bra $L__BB0_23;
ld.shared.f32 %f94, [%rd1];
add.f32 %f106, %f94, 0f00000000;
setp.lt.u32 %p36, %r5, 2;
@%p36 bra $L__BB0_23;
ld.shared.f32 %f95, [%rd3];
add.f32 %f106, %f106, %f95;
$L__BB0_23:
bar.sync 0;
@%p24 bra $L__BB0_25;
st.shared.f32 [%rd4], %f106;
$L__BB0_25:
bar.sync 0;
ld.shared.f32 %f19, [%rd4];
bar.sync 0;
@%p2 bra $L__BB0_27;
rcp.rn.f32 %f96, %f19;
mul.f32 %f97, %f96, %f12;
mov.b32 %r80, %f97;
mul.f32 %f98, %f96, %f13;
mov.b32 %r81, %f98;
mul.f32 %f99, %f96, %f14;
mov.b32 %r82, %f99;
mul.f32 %f100, %f96, %f15;
mov.b32 %r83, %f100;
shl.b32 %r84, %r3, 2;
mul.wide.s32 %rd23, %r84, 4;
add.s64 %rd22, %rd6, %rd23;
// begin inline asm
st.global.cs.v4.s32 [%rd22], {%r80,%r81,%r82,%r83};
// end inline asm
$L__BB0_27:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -18,353 +18,288 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_0[48],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_1[48]
)
{
- .reg .pred %p<47>;
- .reg .f32 %f<125>;
- .reg .b32 %r<105>;
- .reg .b64 %rd<27>;
-
-
- ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_1];
- ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_0];
- mov.u32 %r1, %tid.x;
- setp.lt.s32 %p2, %r1, 1;
- mov.f32 %f118, 0fFF800000;
- mov.f32 %f119, 0fFF800000;
- mov.f32 %f120, 0fFF800000;
- mov.f32 %f121, 0fFF800000;
- @%p2 bra $L__BB0_1;
- bra.uni $L__BB0_3;
-
-$L__BB0_1:
- mov.u32 %r56, %tid.y;
- mov.u32 %r57, %ctaid.x;
- mov.u32 %r58, %ntid.y;
- mad.lo.s32 %r2, %r58, %r57, %r56;
- setp.gt.s32 %p3, %r2, 241;
- @%p3 bra $L__BB0_3;
-
- add.s32 %r63, %r2, %r1;
- shl.b32 %r64, %r63, 2;
- mul.wide.s32 %rd9, %r64, 4;
- add.s64 %rd8, %rd6, %rd9;
+ .reg .pred %p<39>;
+ .reg .f32 %f<107>;
+ .reg .b32 %r<87>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_1];
+ ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_0];
+ mov.u32 %r54, %ctaid.x;
+ mov.u32 %r1, %ntid.y;
+ mov.u32 %r2, %tid.y;
+ mad.lo.s32 %r3, %r1, %r54, %r2;
+ setp.gt.s32 %p2, %r3, 241;
+ mov.f32 %f101, 0fFF800000;
+ mov.f32 %f102, %f101;
+ mov.f32 %f103, %f101;
+ mov.f32 %f104, %f101;
+ @%p2 bra $L__BB0_2;
+
+ shl.b32 %r59, %r3, 2;
+ mul.wide.s32 %rd8, %r59, 4;
+ add.s64 %rd7, %rd5, %rd8;
- ld.global.cs.v4.u32 {%r59,%r60,%r61,%r62}, [%rd8];
+ ld.global.cs.v4.u32 {%r55,%r56,%r57,%r58}, [%rd7];
- mov.b32 %f118, %r59;
- mov.b32 %f119, %r60;
- mov.b32 %f120, %r61;
- mov.b32 %f121, %r62;
-
-$L__BB0_3:
- setp.gt.f32 %p4, %f118, %f119;
- setp.nan.f32 %p5, %f118, %f118;
- or.pred %p6, %p5, %p4;
- selp.f32 %f35, %f118, %f119, %p6;
- setp.nan.f32 %p7, %f35, %f35;
- setp.gt.f32 %p8, %f35, %f120;
- or.pred %p9, %p7, %p8;
- selp.f32 %f36, %f35, %f120, %p9;
- setp.nan.f32 %p10, %f36, %f36;
- setp.gt.f32 %p11, %f36, %f121;
- or.pred %p12, %p10, %p11;
- selp.f32 %f37, %f36, %f121, %p12;
- mov.u32 %r65, %tid.z;
- mov.u32 %r3, %ntid.y;
- mov.u32 %r4, %tid.y;
- mad.lo.s32 %r5, %r3, %r65, %r4;
- mov.u32 %r6, %ntid.x;
- mad.lo.s32 %r7, %r5, %r6, %r1;
- mul.wide.u32 %rd10, %r7, 4;
- mov.u64 %rd11, _ZN11kernelscope6kernelE;
- add.s64 %rd1, %rd11, %rd10;
- st.shared.f32 [%rd1], %f37;
- bar.sync 0;
- clz.b32 %r66, %r6;
- mov.u32 %r67, 31;
- sub.s32 %r68, %r67, %r66;
- mov.u32 %r69, 1;
- shl.b32 %r8, %r69, %r68;
- setp.lt.u32 %p13, %r1, %r8;
- add.s32 %r70, %r8, %r1;
- setp.lt.u32 %p14, %r70, %r6;
- and.pred %p1, %p13, %p14;
- add.s32 %r71, %r7, %r8;
- mul.wide.s32 %rd12, %r71, 4;
- add.s64 %rd2, %rd11, %rd12;
- not.pred %p15, %p1;
- @%p15 bra $L__BB0_5;
-
- ld.shared.f32 %f38, [%rd2];
- ld.shared.f32 %f39, [%rd1];
- setp.nan.f32 %p16, %f39, %f39;
- setp.gt.f32 %p17, %f39, %f38;
- or.pred %p18, %p16, %p17;
- selp.f32 %f40, %f39, %f38, %p18;
- st.shared.f32 [%rd1], %f40;
-
-$L__BB0_5:
- bar.sync 0;
- shr.u32 %r72, %r8, 31;
- add.s32 %r73, %r8, %r72;
- shr.s32 %r104, %r73, 1;
- setp.lt.s32 %p19, %r8, 4;
- @%p19 bra $L__BB0_10;
-
- mov.u32 %r103, %r104;
-
-$L__BB0_7:
- setp.ge.u32 %p20, %r1, %r103;
- @%p20 bra $L__BB0_9;
-
- add.s32 %r74, %r103, %r7;
- mul.wide.s32 %rd13, %r74, 4;
- add.s64 %rd15, %rd11, %rd13;
- ld.shared.f32 %f41, [%rd1];
- setp.nan.f32 %p21, %f41, %f41;
- ld.shared.f32 %f42, [%rd15];
- setp.gt.f32 %p22, %f41, %f42;
- or.pred %p23, %p21, %p22;
- selp.f32 %f43, %f41, %f42, %p23;
- st.shared.f32 [%rd1], %f43;
+ mov.b32 %f101, %r55;
+ mov.b32 %f102, %r56;
+ mov.b32 %f103, %r57;
+ mov.b32 %f104, %r58;
+
+$L__BB0_2:
+ setp.gt.f32 %p3, %f101, %f102;
+ setp.nan.f32 %p4, %f101, %f101;
+ or.pred %p5, %p4, %p3;
+ selp.f32 %f24, %f101, %f102, %p5;
+ setp.nan.f32 %p6, %f24, %f24;
+ setp.gt.f32 %p7, %f24, %f103;
+ or.pred %p8, %p6, %p7;
+ selp.f32 %f25, %f24, %f103, %p8;
+ setp.nan.f32 %p9, %f25, %f25;
+ setp.gt.f32 %p10, %f25, %f104;
+ or.pred %p11, %p9, %p10;
+ selp.f32 %f26, %f25, %f104, %p11;
+ mov.u32 %r60, %tid.z;
+ mad.lo.s32 %r4, %r1, %r60, %r2;
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %tid.x;
+ mad.lo.s32 %r7, %r4, %r5, %r6;
+ mul.wide.u32 %rd9, %r7, 4;
+ mov.u64 %rd10, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd10, %rd9;
+ st.shared.f32 [%rd1], %f26;
+ bar.sync 0;
+ clz.b32 %r61, %r5;
+ mov.u32 %r62, 31;
+ sub.s32 %r63, %r62, %r61;
+ mov.u32 %r64, 1;
+ shl.b32 %r8, %r64, %r63;
+ setp.lt.u32 %p12, %r6, %r8;
+ add.s32 %r65, %r8, %r6;
+ setp.lt.u32 %p13, %r65, %r5;
+ and.pred %p1, %p12, %p13;
+ add.s32 %r66, %r7, %r8;
+ mul.wide.s32 %rd11, %r66, 4;
+ add.s64 %rd2, %rd10, %rd11;
+ not.pred %p14, %p1;
+ @%p14 bra $L__BB0_4;
+
+ ld.shared.f32 %f27, [%rd2];
+ ld.shared.f32 %f28, [%rd1];
+ setp.nan.f32 %p15, %f28, %f28;
+ setp.gt.f32 %p16, %f28, %f27;
+ or.pred %p17, %p15, %p16;
+ selp.f32 %f29, %f28, %f27, %p17;
+ st.shared.f32 [%rd1], %f29;
+
+$L__BB0_4:
+ bar.sync 0;
+ shr.u32 %r67, %r8, 31;
+ add.s32 %r68, %r8, %r67;
+ shr.s32 %r86, %r68, 1;
+ setp.lt.s32 %p18, %r8, 4;
+ @%p18 bra $L__BB0_9;
+
+ mov.u32 %r85, %r86;
+
+$L__BB0_6:
+ setp.ge.u32 %p19, %r6, %r85;
+ @%p19 bra $L__BB0_8;
+
+ add.s32 %r69, %r85, %r7;
+ mul.wide.s32 %rd12, %r69, 4;
+ add.s64 %rd14, %rd10, %rd12;
+ ld.shared.f32 %f30, [%rd1];
+ setp.nan.f32 %p20, %f30, %f30;
+ ld.shared.f32 %f31, [%rd14];
+ setp.gt.f32 %p21, %f30, %f31;
+ or.pred %p22, %p20, %p21;
+ selp.f32 %f32, %f30, %f31, %p22;
+ st.shared.f32 [%rd1], %f32;
+
+$L__BB0_8:
+ bar.sync 0;
+ shr.u32 %r11, %r85, 1;
+ setp.gt.u32 %p23, %r85, 3;
+ mov.u32 %r85, %r11;
+ @%p23 bra $L__BB0_6;
$L__BB0_9:
- bar.sync 0;
- shr.u32 %r11, %r103, 1;
- setp.gt.u32 %p24, %r103, 3;
- mov.u32 %r103, %r11;
- @%p24 bra $L__BB0_7;
-
-$L__BB0_10:
- setp.ne.s32 %p25, %r1, 0;
- add.s32 %r75, %r7, 1;
- mul.wide.u32 %rd16, %r75, 4;
- add.s64 %rd3, %rd11, %rd16;
- mov.f32 %f122, 0fFF800000;
- @%p25 bra $L__BB0_13;
-
- ld.shared.f32 %f122, [%rd1];
- setp.lt.u32 %p26, %r6, 2;
- @%p26 bra $L__BB0_13;
-
- ld.shared.f32 %f45, [%rd3];
- setp.gt.f32 %p27, %f122, %f45;
- setp.nan.f32 %p28, %f122, %f122;
- or.pred %p29, %p28, %p27;
- selp.f32 %f122, %f122, %f45, %p29;
+ setp.ne.s32 %p24, %r6, 0;
+ add.s32 %r70, %r7, 1;
+ mul.wide.u32 %rd15, %r70, 4;
+ add.s64 %rd3, %rd10, %rd15;
+ mov.f32 %f105, 0fFF800000;
+ @%p24 bra $L__BB0_12;
+
+ ld.shared.f32 %f105, [%rd1];
+ setp.lt.u32 %p25, %r5, 2;
+ @%p25 bra $L__BB0_12;
+
+ ld.shared.f32 %f34, [%rd3];
+ setp.gt.f32 %p26, %f105, %f34;
+ setp.nan.f32 %p27, %f105, %f105;
+ or.pred %p28, %p27, %p26;
+ selp.f32 %f105, %f105, %f34, %p28;
+
+$L__BB0_12:
+ bar.sync 0;
+ mul.wide.s32 %rd17, %r4, 4;
+ add.s64 %rd4, %rd10, %rd17;
+ setp.eq.s32 %p29, %r6, 0;
+ @%p29 bra $L__BB0_13;
+ bra.uni $L__BB0_14;
$L__BB0_13:
- bar.sync 0;
- mul.wide.s32 %rd18, %r5, 4;
- add.s64 %rd4, %rd11, %rd18;
- setp.eq.s32 %p30, %r1, 0;
- @%p30 bra $L__BB0_14;
- bra.uni $L__BB0_15;
+ st.shared.f32 [%rd4], %f105;
$L__BB0_14:
- st.shared.f32 [%rd4], %f122;
-
-$L__BB0_15:
- bar.sync 0;
- ld.shared.f32 %f46, [%rd4];
- bar.sync 0;
- sub.f32 %f47, %f118, %f46;
- mov.f32 %f48, 0f3F000000;
- mov.f32 %f49, 0f3BBB989D;
- fma.rn.f32 %f50, %f47, %f49, %f48;
- cvt.sat.f32.f32 %f51, %f50;
- mov.f32 %f52, 0f4B400001;
- mov.f32 %f53, 0f437C0000;
- fma.rm.f32 %f54, %f51, %f53, %f52;
- add.f32 %f55, %f54, 0fCB40007F;
- neg.f32 %f56, %f55;
- mov.f32 %f57, 0f3FB8AA3B;
- fma.rn.f32 %f58, %f47, %f57, %f56;
- mov.f32 %f59, 0f32A57060;
- fma.rn.f32 %f60, %f47, %f59, %f58;
- mov.b32 %r76, %f54;
- shl.b32 %r77, %r76, 23;
- mov.b32 %f61, %r77;
- ex2.approx.ftz.f32 %f62, %f60;
- mul.f32 %f12, %f62, %f61;
- sub.f32 %f63, %f119, %f46;
- fma.rn.f32 %f64, %f63, %f49, %f48;
- cvt.sat.f32.f32 %f65, %f64;
- fma.rm.f32 %f66, %f65, %f53, %f52;
- add.f32 %f67, %f66, 0fCB40007F;
- neg.f32 %f68, %f67;
- fma.rn.f32 %f69, %f63, %f57, %f68;
- fma.rn.f32 %f70, %f63, %f59, %f69;
- mov.b32 %r78, %f66;
- shl.b32 %r79, %r78, 23;
- mov.b32 %f71, %r79;
- ex2.approx.ftz.f32 %f72, %f70;
- mul.f32 %f13, %f72, %f71;
- sub.f32 %f73, %f120, %f46;
- fma.rn.f32 %f74, %f73, %f49, %f48;
- cvt.sat.f32.f32 %f75, %f74;
- fma.rm.f32 %f76, %f75, %f53, %f52;
- add.f32 %f77, %f76, 0fCB40007F;
- neg.f32 %f78, %f77;
- fma.rn.f32 %f79, %f73, %f57, %f78;
- fma.rn.f32 %f80, %f73, %f59, %f79;
- mov.b32 %r80, %f76;
- shl.b32 %r81, %r80, 23;
- mov.b32 %f81, %r81;
- ex2.approx.ftz.f32 %f82, %f80;
- mul.f32 %f14, %f82, %f81;
- sub.f32 %f83, %f121, %f46;
- fma.rn.f32 %f84, %f83, %f49, %f48;
- cvt.sat.f32.f32 %f85, %f84;
- fma.rm.f32 %f86, %f85, %f53, %f52;
- add.f32 %f87, %f86, 0fCB40007F;
- neg.f32 %f88, %f87;
- fma.rn.f32 %f89, %f83, %f57, %f88;
- fma.rn.f32 %f90, %f83, %f59, %f89;
- mov.b32 %r82, %f86;
- shl.b32 %r83, %r82, 23;
- mov.b32 %f91, %r83;
- ex2.approx.ftz.f32 %f92, %f90;
- mul.f32 %f15, %f92, %f91;
- @%p2 bra $L__BB0_16;
- bra.uni $L__BB0_17;
+ setp.lt.s32 %p30, %r3, 242;
+ bar.sync 0;
+ ld.shared.f32 %f35, [%rd4];
+ bar.sync 0;
+ sub.f32 %f36, %f101, %f35;
+ mov.f32 %f37, 0f3F000000;
+ mov.f32 %f38, 0f3BBB989D;
+ fma.rn.f32 %f39, %f36, %f38, %f37;
+ cvt.sat.f32.f32 %f40, %f39;
+ mov.f32 %f41, 0f4B400001;
+ mov.f32 %f42, 0f437C0000;
+ fma.rm.f32 %f43, %f40, %f42, %f41;
+ add.f32 %f44, %f43, 0fCB40007F;
+ neg.f32 %f45, %f44;
+ mov.f32 %f46, 0f3FB8AA3B;
+ fma.rn.f32 %f47, %f36, %f46, %f45;
+ mov.f32 %f48, 0f32A57060;
+ fma.rn.f32 %f49, %f36, %f48, %f47;
+ mov.b32 %r71, %f43;
+ shl.b32 %r72, %r71, 23;
+ mov.b32 %f50, %r72;
+ ex2.approx.ftz.f32 %f51, %f49;
+ mul.f32 %f12, %f51, %f50;
+ sub.f32 %f52, %f102, %f35;
+ fma.rn.f32 %f53, %f52, %f38, %f37;
+ cvt.sat.f32.f32 %f54, %f53;
+ fma.rm.f32 %f55, %f54, %f42, %f41;
+ add.f32 %f56, %f55, 0fCB40007F;
+ neg.f32 %f57, %f56;
+ fma.rn.f32 %f58, %f52, %f46, %f57;
+ fma.rn.f32 %f59, %f52, %f48, %f58;
+ mov.b32 %r73, %f55;
+ shl.b32 %r74, %r73, 23;
+ mov.b32 %f60, %r74;
+ ex2.approx.ftz.f32 %f61, %f59;
+ mul.f32 %f13, %f61, %f60;
+ sub.f32 %f62, %f103, %f35;
+ fma.rn.f32 %f63, %f62, %f38, %f37;
+ cvt.sat.f32.f32 %f64, %f63;
+ fma.rm.f32 %f65, %f64, %f42, %f41;
+ add.f32 %f66, %f65, 0fCB40007F;
+ neg.f32 %f67, %f66;
+ fma.rn.f32 %f68, %f62, %f46, %f67;
+ fma.rn.f32 %f69, %f62, %f48, %f68;
+ mov.b32 %r75, %f65;
+ shl.b32 %r76, %r75, 23;
+ mov.b32 %f70, %r76;
+ ex2.approx.ftz.f32 %f71, %f69;
+ mul.f32 %f14, %f71, %f70;
+ sub.f32 %f72, %f104, %f35;
+ fma.rn.f32 %f73, %f72, %f38, %f37;
+ cvt.sat.f32.f32 %f74, %f73;
+ fma.rm.f32 %f75, %f74, %f42, %f41;
+ add.f32 %f76, %f75, 0fCB40007F;
+ neg.f32 %f77, %f76;
+ fma.rn.f32 %f78, %f72, %f46, %f77;
+ fma.rn.f32 %f79, %f72, %f48, %f78;
+ mov.b32 %r77, %f75;
+ shl.b32 %r78, %r77, 23;
+ mov.b32 %f80, %r78;
+ ex2.approx.ftz.f32 %f81, %f79;
+ mul.f32 %f15, %f81, %f80;
+ add.f32 %f82, %f12, 0f00000000;
+ add.f32 %f83, %f82, %f13;
+ add.f32 %f84, %f83, %f14;
+ add.f32 %f85, %f84, %f15;
+ selp.f32 %f86, %f85, 0f00000000, %p30;
+ st.shared.f32 [%rd1], %f86;
+ bar.sync 0;
+ @%p14 bra $L__BB0_16;
+
+ ld.shared.f32 %f87, [%rd2];
+ ld.shared.f32 %f88, [%rd1];
+ add.f32 %f89, %f87, %f88;
+ st.shared.f32 [%rd1], %f89;
$L__BB0_16:
- mov.u32 %r84, %ctaid.x;
- mad.lo.s32 %r85, %r3, %r84, %r4;
- setp.lt.s32 %p32, %r85, 242;
- @%p32 bra $L__BB0_18;
- bra.uni $L__BB0_17;
-
-$L__BB0_18:
- add.f32 %f97, %f12, 0f00000000;
- add.f32 %f98, %f97, %f13;
- add.f32 %f99, %f98, %f14;
- add.f32 %f123, %f99, %f15;
- bra.uni $L__BB0_19;
+ bar.sync 0;
+ @%p18 bra $L__BB0_20;
$L__BB0_17:
- mov.u32 %r86, %ctaid.x;
- mad.lo.s32 %r87, %r3, %r86, %r4;
- setp.lt.s32 %p34, %r87, 242;
- and.pred %p35, %p2, %p34;
- add.f32 %f93, %f12, 0f00000000;
- add.f32 %f94, %f93, %f13;
- add.f32 %f95, %f94, %f14;
- add.f32 %f96, %f95, %f15;
- selp.f32 %f123, %f96, 0f00000000, %p35;
+ setp.ge.u32 %p33, %r6, %r86;
+ @%p33 bra $L__BB0_19;
+
+ add.s32 %r79, %r86, %r7;
+ mul.wide.s32 %rd19, %r79, 4;
+ add.s64 %rd21, %rd10, %rd19;
+ ld.shared.f32 %f90, [%rd1];
+ ld.shared.f32 %f91, [%rd21];
+ add.f32 %f92, %f91, %f90;
+ st.shared.f32 [%rd1], %f92;
$L__BB0_19:
- st.shared.f32 [%rd1], %f123;
- bar.sync 0;
- @%p15 bra $L__BB0_21;
-
- ld.shared.f32 %f100, [%rd2];
- ld.shared.f32 %f101, [%rd1];
- add.f32 %f102, %f100, %f101;
- st.shared.f32 [%rd1], %f102;
-
-$L__BB0_21:
- bar.sync 0;
- @%p19 bra $L__BB0_25;
-
-$L__BB0_22:
- setp.ge.u32 %p38, %r1, %r104;
- @%p38 bra $L__BB0_24;
-
- add.s32 %r88, %r104, %r7;
- mul.wide.s32 %rd20, %r88, 4;
- add.s64 %rd22, %rd11, %rd20;
- ld.shared.f32 %f103, [%rd1];
- ld.shared.f32 %f104, [%rd22];
- add.f32 %f105, %f104, %f103;
- st.shared.f32 [%rd1], %f105;
-
-$L__BB0_24:
- bar.sync 0;
- shr.u32 %r13, %r104, 1;
- setp.gt.u32 %p39, %r104, 3;
- mov.u32 %r104, %r13;
- @%p39 bra $L__BB0_22;
+ bar.sync 0;
+ shr.u32 %r13, %r86, 1;
+ setp.gt.u32 %p34, %r86, 3;
+ mov.u32 %r86, %r13;
+ @%p34 bra $L__BB0_17;
+
+$L__BB0_20:
+ mov.f32 %f106, 0f00000000;
+ @%p24 bra $L__BB0_23;
+
+ ld.shared.f32 %f94, [%rd1];
+ add.f32 %f106, %f94, 0f00000000;
+ setp.lt.u32 %p36, %r5, 2;
+ @%p36 bra $L__BB0_23;
+
+ ld.shared.f32 %f95, [%rd3];
+ add.f32 %f106, %f106, %f95;
+
+$L__BB0_23:
+ bar.sync 0;
+ @%p24 bra $L__BB0_25;
+
+ st.shared.f32 [%rd4], %f106;
$L__BB0_25:
- mov.f32 %f124, 0f00000000;
- @%p25 bra $L__BB0_28;
-
- ld.shared.f32 %f107, [%rd1];
- add.f32 %f124, %f107, 0f00000000;
- setp.lt.u32 %p41, %r6, 2;
- @%p41 bra $L__BB0_28;
-
- ld.shared.f32 %f108, [%rd3];
- add.f32 %f124, %f124, %f108;
-
-$L__BB0_28:
- bar.sync 0;
- @%p25 bra $L__BB0_30;
-
- st.shared.f32 [%rd4], %f124;
-
-$L__BB0_30:
- setp.gt.s32 %p43, %r1, 0;
- bar.sync 0;
- ld.shared.f32 %f109, [%rd4];
- bar.sync 0;
- rcp.rn.f32 %f22, %f109;
- @%p43 bra $L__BB0_32;
-
- mov.u32 %r89, %ctaid.x;
- mad.lo.s32 %r14, %r3, %r89, %r4;
- setp.lt.s32 %p44, %r14, 242;
- @%p44 bra $L__BB0_35;
- bra.uni $L__BB0_32;
-
-$L__BB0_35:
- mul.f32 %f110, %f22, %f12;
- mov.b32 %r97, %f110;
- mul.f32 %f111, %f22, %f13;
- mov.b32 %r98, %f111;
- mul.f32 %f112, %f22, %f14;
- mov.b32 %r99, %f112;
- mul.f32 %f113, %f22, %f15;
- mov.b32 %r100, %f113;
- add.s32 %r101, %r14, %r1;
- shl.b32 %r102, %r101, 2;
- mul.wide.s32 %rd26, %r102, 4;
- add.s64 %rd25, %rd7, %rd26;
+ bar.sync 0;
+ ld.shared.f32 %f19, [%rd4];
+ bar.sync 0;
+ @%p2 bra $L__BB0_27;
+
+ rcp.rn.f32 %f96, %f19;
+ mul.f32 %f97, %f96, %f12;
+ mov.b32 %r80, %f97;
+ mul.f32 %f98, %f96, %f13;
+ mov.b32 %r81, %f98;
+ mul.f32 %f99, %f96, %f14;
+ mov.b32 %r82, %f99;
+ mul.f32 %f100, %f96, %f15;
+ mov.b32 %r83, %f100;
+ shl.b32 %r84, %r3, 2;
+ mul.wide.s32 %rd23, %r84, 4;
+ add.s64 %rd22, %rd6, %rd23;
- st.global.cs.v4.s32 [%rd25], {%r97,%r98,%r99,%r100};
+ st.global.cs.v4.s32 [%rd22], {%r80,%r81,%r82,%r83};
- bra.uni $L__BB0_36;
-
-$L__BB0_32:
- mul.f32 %f23, %f22, %f12;
- mul.f32 %f24, %f22, %f13;
- mul.f32 %f25, %f22, %f14;
- mul.f32 %f26, %f22, %f15;
- @%p43 bra $L__BB0_36;
-
- mov.u32 %r90, %ctaid.x;
- mad.lo.s32 %r15, %r3, %r90, %r4;
- setp.gt.s32 %p46, %r15, 241;
- @%p46 bra $L__BB0_36;
-
- add.s32 %r95, %r15, %r1;
- shl.b32 %r96, %r95, 2;
- mul.wide.s32 %rd24, %r96, 4;
- add.s64 %rd23, %rd7, %rd24;
- mov.b32 %r91, %f23;
- mov.b32 %r92, %f24;
- mov.b32 %r93, %f25;
- mov.b32 %r94, %f26;
-
- st.global.cs.v4.s32 [%rd23], {%r91,%r92,%r93,%r94};
-
-
-$L__BB0_36:
+
+$L__BB0_27:
ret;
}
Kernel 42
CUDA
PTX
53997da5d
Diff
03a1b695e
-8
+8 index type: int
registers: 14
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 5, 5> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 4, 4> T13;
T13.set(float(NEG_INFINITY));
if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T15[0] = fmax(
T15[0],
T13[i0]);
}
Array<float, 1, 1> T5;
T5[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T5[0], T15[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T8 = T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
Array<float, 1, 1> T7;
T7[0]
= T13[i1]
- T6[0];
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
if ((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
T16[0]
= T16[0]
+ T8[i2];
}
}
}
Array<float, 1, 1> T9;
T9[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T9[0], T16[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
if ((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 5, 5> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 4, 4> T13;
T13.set(float(NEG_INFINITY));
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T15[0] = fmax(
T15[0],
T13[i0]);
}
Array<float, 1, 1> T5;
T5[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T5[0], T15[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T8 = T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
Array<float, 1, 1> T7;
T7[0]
= T13[i1]
- T6[0];
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
T16[0]
= T16[0]
+ T8[i2];
}
}
}
Array<float, 1, 1> T9;
T9[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T9[0], T16[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,12 +1,12 @@
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 5, 5> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 4, 4> T13;
T13.set(float(NEG_INFINITY));
- if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
- loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
+ loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
@@ -30,21 +30,21 @@
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
- if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
- if ((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
T16[0]
= T16[0]
+ T8[i2];
}
}
@@ -55,27 +55,27 @@
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
- if (((((((nvfuser_index_t)threadIdx.x) * 4) + 3) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
- loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
+ loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
- if ((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < 4) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
- loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[(((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
+ loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T12[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_191105arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_1911010nvfuser_47ENS_6TensorIfLi5ELi5EEES1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_1911010nvfuser_47ENS_6TensorIfLi5ELi5EEES1__param_0[48],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_1911010nvfuser_47ENS_6TensorIfLi5ELi5EEES1__param_1[48]
)
{
.reg .pred %p<47>;
.reg .f32 %f<125>;
.reg .b32 %r<105>;
.reg .b64 %rd<27>;
ld.param.u64 %rd7, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_1911010nvfuser_47ENS_6TensorIfLi5ELi5EEES1__param_1];
ld.param.u64 %rd6, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_1911010nvfuser_47ENS_6TensorIfLi5ELi5EEES1__param_0];
mov.u32 %r1, %tid.x;
setp.lt.s32 %p2, %r1, 1;
mov.f32 %f118, 0fFF800000;
mov.f32 %f119, 0fFF800000;
mov.f32 %f120, 0fFF800000;
mov.f32 %f121, 0fFF800000;
@%p2 bra $L__BB0_1;
bra.uni $L__BB0_3;
$L__BB0_1:
mov.u32 %r56, %tid.y;
mov.u32 %r57, %ctaid.x;
mov.u32 %r58, %ntid.y;
mad.lo.s32 %r2, %r58, %r57, %r56;
setp.gt.s32 %p3, %r2, 241;
@%p3 bra $L__BB0_3;
add.s32 %r63, %r2, %r1;
shl.b32 %r64, %r63, 2;
mul.wide.s32 %rd9, %r64, 4;
add.s64 %rd8, %rd6, %rd9;
// begin inline asm
ld.global.cs.v4.u32 {%r59,%r60,%r61,%r62}, [%rd8];
// end inline asm
mov.b32 %f118, %r59;
mov.b32 %f119, %r60;
mov.b32 %f120, %r61;
mov.b32 %f121, %r62;
$L__BB0_3:
setp.gt.f32 %p4, %f118, %f119;
setp.nan.f32 %p5, %f118, %f118;
or.pred %p6, %p5, %p4;
selp.f32 %f35, %f118, %f119, %p6;
setp.nan.f32 %p7, %f35, %f35;
setp.gt.f32 %p8, %f35, %f120;
or.pred %p9, %p7, %p8;
selp.f32 %f36, %f35, %f120, %p9;
setp.nan.f32 %p10, %f36, %f36;
setp.gt.f32 %p11, %f36, %f121;
or.pred %p12, %p10, %p11;
selp.f32 %f37, %f36, %f121, %p12;
mov.u32 %r65, %tid.z;
mov.u32 %r3, %ntid.y;
mov.u32 %r4, %tid.y;
mad.lo.s32 %r5, %r3, %r65, %r4;
mov.u32 %r6, %ntid.x;
mad.lo.s32 %r7, %r5, %r6, %r1;
mul.wide.u32 %rd10, %r7, 4;
mov.u64 %rd11, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_4c66540e_191105arrayE;
add.s64 %rd1, %rd11, %rd10;
st.shared.f32 [%rd1], %f37;
bar.sync 0;
clz.b32 %r66, %r6;
mov.u32 %r67, 31;
sub.s32 %r68, %r67, %r66;
mov.u32 %r69, 1;
shl.b32 %r8, %r69, %r68;
setp.lt.u32 %p13, %r1, %r8;
add.s32 %r70, %r8, %r1;
setp.lt.u32 %p14, %r70, %r6;
and.pred %p1, %p13, %p14;
add.s32 %r71, %r7, %r8;
mul.wide.s32 %rd12, %r71, 4;
add.s64 %rd2, %rd11, %rd12;
not.pred %p15, %p1;
@%p15 bra $L__BB0_5;
ld.shared.f32 %f38, [%rd2];
ld.shared.f32 %f39, [%rd1];
setp.nan.f32 %p16, %f39, %f39;
setp.gt.f32 %p17, %f39, %f38;
or.pred %p18, %p16, %p17;
selp.f32 %f40, %f39, %f38, %p18;
st.shared.f32 [%rd1], %f40;
$L__BB0_5:
bar.sync 0;
shr.u32 %r72, %r8, 31;
add.s32 %r73, %r8, %r72;
shr.s32 %r104, %r73, 1;
setp.lt.s32 %p19, %r8, 4;
@%p19 bra $L__BB0_10;
mov.u32 %r103, %r104;
$L__BB0_7:
setp.ge.u32 %p20, %r1, %r103;
@%p20 bra $L__BB0_9;
add.s32 %r74, %r103, %r7;
mul.wide.s32 %rd13, %r74, 4;
add.s64 %rd15, %rd11, %rd13;
ld.shared.f32 %f41, [%rd1];
setp.nan.f32 %p21, %f41, %f41;
ld.shared.f32 %f42, [%rd15];
setp.gt.f32 %p22, %f41, %f42;
or.pred %p23, %p21, %p22;
selp.f32 %f43, %f41, %f42, %p23;
st.shared.f32 [%rd1], %f43;
$L__BB0_9:
bar.sync 0;
shr.u32 %r11, %r103, 1;
setp.gt.u32 %p24, %r103, 3;
mov.u32 %r103, %r11;
@%p24 bra $L__BB0_7;
$L__BB0_10:
setp.ne.s32 %p25, %r1, 0;
add.s32 %r75, %r7, 1;
mul.wide.u32 %rd16, %r75, 4;
add.s64 %rd3, %rd11, %rd16;
mov.f32 %f122, 0fFF800000;
@%p25 bra $L__BB0_13;
ld.shared.f32 %f122, [%rd1];
setp.lt.u32 %p26, %r6, 2;
@%p26 bra $L__BB0_13;
ld.shared.f32 %f45, [%rd3];
setp.gt.f32 %p27, %f122, %f45;
setp.nan.f32 %p28, %f122, %f122;
or.pred %p29, %p28, %p27;
selp.f32 %f122, %f122, %f45, %p29;
$L__BB0_13:
bar.sync 0;
mul.wide.s32 %rd18, %r5, 4;
add.s64 %rd4, %rd11, %rd18;
setp.eq.s32 %p30, %r1, 0;
@%p30 bra $L__BB0_14;
bra.uni $L__BB0_15;
$L__BB0_14:
st.shared.f32 [%rd4], %f122;
$L__BB0_15:
bar.sync 0;
ld.shared.f32 %f46, [%rd4];
bar.sync 0;
sub.f32 %f47, %f118, %f46;
mov.f32 %f48, 0f3F000000;
mov.f32 %f49, 0f3BBB989D;
fma.rn.f32 %f50, %f47, %f49, %f48;
cvt.sat.f32.f32 %f51, %f50;
mov.f32 %f52, 0f4B400001;
mov.f32 %f53, 0f437C0000;
fma.rm.f32 %f54, %f51, %f53, %f52;
add.f32 %f55, %f54, 0fCB40007F;
neg.f32 %f56, %f55;
mov.f32 %f57, 0f3FB8AA3B;
fma.rn.f32 %f58, %f47, %f57, %f56;
mov.f32 %f59, 0f32A57060;
fma.rn.f32 %f60, %f47, %f59, %f58;
mov.b32 %r76, %f54;
shl.b32 %r77, %r76, 23;
mov.b32 %f61, %r77;
ex2.approx.ftz.f32 %f62, %f60;
mul.f32 %f12, %f62, %f61;
sub.f32 %f63, %f119, %f46;
fma.rn.f32 %f64, %f63, %f49, %f48;
cvt.sat.f32.f32 %f65, %f64;
fma.rm.f32 %f66, %f65, %f53, %f52;
add.f32 %f67, %f66, 0fCB40007F;
neg.f32 %f68, %f67;
fma.rn.f32 %f69, %f63, %f57, %f68;
fma.rn.f32 %f70, %f63, %f59, %f69;
mov.b32 %r78, %f66;
shl.b32 %r79, %r78, 23;
mov.b32 %f71, %r79;
ex2.approx.ftz.f32 %f72, %f70;
mul.f32 %f13, %f72, %f71;
sub.f32 %f73, %f120, %f46;
fma.rn.f32 %f74, %f73, %f49, %f48;
cvt.sat.f32.f32 %f75, %f74;
fma.rm.f32 %f76, %f75, %f53, %f52;
add.f32 %f77, %f76, 0fCB40007F;
neg.f32 %f78, %f77;
fma.rn.f32 %f79, %f73, %f57, %f78;
fma.rn.f32 %f80, %f73, %f59, %f79;
mov.b32 %r80, %f76;
shl.b32 %r81, %r80, 23;
mov.b32 %f81, %r81;
ex2.approx.ftz.f32 %f82, %f80;
mul.f32 %f14, %f82, %f81;
sub.f32 %f83, %f121, %f46;
fma.rn.f32 %f84, %f83, %f49, %f48;
cvt.sat.f32.f32 %f85, %f84;
fma.rm.f32 %f86, %f85, %f53, %f52;
add.f32 %f87, %f86, 0fCB40007F;
neg.f32 %f88, %f87;
fma.rn.f32 %f89, %f83, %f57, %f88;
fma.rn.f32 %f90, %f83, %f59, %f89;
mov.b32 %r82, %f86;
shl.b32 %r83, %r82, 23;
mov.b32 %f91, %r83;
ex2.approx.ftz.f32 %f92, %f90;
mul.f32 %f15, %f92, %f91;
@%p2 bra $L__BB0_16;
bra.uni $L__BB0_17;
$L__BB0_16:
mov.u32 %r84, %ctaid.x;
mad.lo.s32 %r85, %r3, %r84, %r4;
setp.lt.s32 %p32, %r85, 242;
@%p32 bra $L__BB0_18;
bra.uni $L__BB0_17;
$L__BB0_18:
add.f32 %f97, %f12, 0f00000000;
add.f32 %f98, %f97, %f13;
add.f32 %f99, %f98, %f14;
add.f32 %f123, %f99, %f15;
bra.uni $L__BB0_19;
$L__BB0_17:
mov.u32 %r86, %ctaid.x;
mad.lo.s32 %r87, %r3, %r86, %r4;
setp.lt.s32 %p34, %r87, 242;
and.pred %p35, %p2, %p34;
add.f32 %f93, %f12, 0f00000000;
add.f32 %f94, %f93, %f13;
add.f32 %f95, %f94, %f14;
add.f32 %f96, %f95, %f15;
selp.f32 %f123, %f96, 0f00000000, %p35;
$L__BB0_19:
st.shared.f32 [%rd1], %f123;
bar.sync 0;
@%p15 bra $L__BB0_21;
ld.shared.f32 %f100, [%rd2];
ld.shared.f32 %f101, [%rd1];
add.f32 %f102, %f100, %f101;
st.shared.f32 [%rd1], %f102;
$L__BB0_21:
bar.sync 0;
@%p19 bra $L__BB0_25;
$L__BB0_22:
setp.ge.u32 %p38, %r1, %r104;
@%p38 bra $L__BB0_24;
add.s32 %r88, %r104, %r7;
mul.wide.s32 %rd20, %r88, 4;
add.s64 %rd22, %rd11, %rd20;
ld.shared.f32 %f103, [%rd1];
ld.shared.f32 %f104, [%rd22];
add.f32 %f105, %f104, %f103;
st.shared.f32 [%rd1], %f105;
$L__BB0_24:
bar.sync 0;
shr.u32 %r13, %r104, 1;
setp.gt.u32 %p39, %r104, 3;
mov.u32 %r104, %r13;
@%p39 bra $L__BB0_22;
$L__BB0_25:
mov.f32 %f124, 0f00000000;
@%p25 bra $L__BB0_28;
ld.shared.f32 %f107, [%rd1];
add.f32 %f124, %f107, 0f00000000;
setp.lt.u32 %p41, %r6, 2;
@%p41 bra $L__BB0_28;
ld.shared.f32 %f108, [%rd3];
add.f32 %f124, %f124, %f108;
$L__BB0_28:
bar.sync 0;
@%p25 bra $L__BB0_30;
st.shared.f32 [%rd4], %f124;
$L__BB0_30:
setp.gt.s32 %p43, %r1, 0;
bar.sync 0;
ld.shared.f32 %f109, [%rd4];
bar.sync 0;
rcp.rn.f32 %f22, %f109;
@%p43 bra $L__BB0_32;
mov.u32 %r89, %ctaid.x;
mad.lo.s32 %r14, %r3, %r89, %r4;
setp.lt.s32 %p44, %r14, 242;
@%p44 bra $L__BB0_35;
bra.uni $L__BB0_32;
$L__BB0_35:
mul.f32 %f110, %f22, %f12;
mov.b32 %r97, %f110;
mul.f32 %f111, %f22, %f13;
mov.b32 %r98, %f111;
mul.f32 %f112, %f22, %f14;
mov.b32 %r99, %f112;
mul.f32 %f113, %f22, %f15;
mov.b32 %r100, %f113;
add.s32 %r101, %r14, %r1;
shl.b32 %r102, %r101, 2;
mul.wide.s32 %rd26, %r102, 4;
add.s64 %rd25, %rd7, %rd26;
// begin inline asm
st.global.cs.v4.s32 [%rd25], {%r97,%r98,%r99,%r100};
// end inline asm
bra.uni $L__BB0_36;
$L__BB0_32:
mul.f32 %f23, %f22, %f12;
mul.f32 %f24, %f22, %f13;
mul.f32 %f25, %f22, %f14;
mul.f32 %f26, %f22, %f15;
@%p43 bra $L__BB0_36;
mov.u32 %r90, %ctaid.x;
mad.lo.s32 %r15, %r3, %r90, %r4;
setp.gt.s32 %p46, %r15, 241;
@%p46 bra $L__BB0_36;
add.s32 %r95, %r15, %r1;
shl.b32 %r96, %r95, 2;
mul.wide.s32 %rd24, %r96, 4;
add.s64 %rd23, %rd7, %rd24;
mov.b32 %r91, %f23;
mov.b32 %r92, %f24;
mov.b32 %r93, %f25;
mov.b32 %r94, %f26;
// begin inline asm
st.global.cs.v4.s32 [%rd23], {%r91,%r92,%r93,%r94};
// end inline asm
$L__BB0_36:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_160115arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_1601110nvfuser_47ENS_6TensorIfLi5ELi5EEES1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_1601110nvfuser_47ENS_6TensorIfLi5ELi5EEES1__param_0[48],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_1601110nvfuser_47ENS_6TensorIfLi5ELi5EEES1__param_1[48]
)
{
.reg .pred %p<39>;
.reg .f32 %f<107>;
.reg .b32 %r<87>;
.reg .b64 %rd<24>;
ld.param.u64 %rd6, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_1601110nvfuser_47ENS_6TensorIfLi5ELi5EEES1__param_1];
ld.param.u64 %rd5, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_1601110nvfuser_47ENS_6TensorIfLi5ELi5EEES1__param_0];
mov.u32 %r54, %ctaid.x;
mov.u32 %r1, %ntid.y;
mov.u32 %r2, %tid.y;
mad.lo.s32 %r3, %r1, %r54, %r2;
setp.gt.s32 %p2, %r3, 241;
mov.f32 %f101, 0fFF800000;
mov.f32 %f102, %f101;
mov.f32 %f103, %f101;
mov.f32 %f104, %f101;
@%p2 bra $L__BB0_2;
shl.b32 %r59, %r3, 2;
mul.wide.s32 %rd8, %r59, 4;
add.s64 %rd7, %rd5, %rd8;
// begin inline asm
ld.global.cs.v4.u32 {%r55,%r56,%r57,%r58}, [%rd7];
// end inline asm
mov.b32 %f101, %r55;
mov.b32 %f102, %r56;
mov.b32 %f103, %r57;
mov.b32 %f104, %r58;
$L__BB0_2:
setp.gt.f32 %p3, %f101, %f102;
setp.nan.f32 %p4, %f101, %f101;
or.pred %p5, %p4, %p3;
selp.f32 %f24, %f101, %f102, %p5;
setp.nan.f32 %p6, %f24, %f24;
setp.gt.f32 %p7, %f24, %f103;
or.pred %p8, %p6, %p7;
selp.f32 %f25, %f24, %f103, %p8;
setp.nan.f32 %p9, %f25, %f25;
setp.gt.f32 %p10, %f25, %f104;
or.pred %p11, %p9, %p10;
selp.f32 %f26, %f25, %f104, %p11;
mov.u32 %r60, %tid.z;
mad.lo.s32 %r4, %r1, %r60, %r2;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r7, %r4, %r5, %r6;
mul.wide.u32 %rd9, %r7, 4;
mov.u64 %rd10, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_47_cu_a6357bae_160115arrayE;
add.s64 %rd1, %rd10, %rd9;
st.shared.f32 [%rd1], %f26;
bar.sync 0;
clz.b32 %r61, %r5;
mov.u32 %r62, 31;
sub.s32 %r63, %r62, %r61;
mov.u32 %r64, 1;
shl.b32 %r8, %r64, %r63;
setp.lt.u32 %p12, %r6, %r8;
add.s32 %r65, %r8, %r6;
setp.lt.u32 %p13, %r65, %r5;
and.pred %p1, %p12, %p13;
add.s32 %r66, %r7, %r8;
mul.wide.s32 %rd11, %r66, 4;
add.s64 %rd2, %rd10, %rd11;
not.pred %p14, %p1;
@%p14 bra $L__BB0_4;
ld.shared.f32 %f27, [%rd2];
ld.shared.f32 %f28, [%rd1];
setp.nan.f32 %p15, %f28, %f28;
setp.gt.f32 %p16, %f28, %f27;
or.pred %p17, %p15, %p16;
selp.f32 %f29, %f28, %f27, %p17;
st.shared.f32 [%rd1], %f29;
$L__BB0_4:
bar.sync 0;
shr.u32 %r67, %r8, 31;
add.s32 %r68, %r8, %r67;
shr.s32 %r86, %r68, 1;
setp.lt.s32 %p18, %r8, 4;
@%p18 bra $L__BB0_9;
mov.u32 %r85, %r86;
$L__BB0_6:
setp.ge.u32 %p19, %r6, %r85;
@%p19 bra $L__BB0_8;
add.s32 %r69, %r85, %r7;
mul.wide.s32 %rd12, %r69, 4;
add.s64 %rd14, %rd10, %rd12;
ld.shared.f32 %f30, [%rd1];
setp.nan.f32 %p20, %f30, %f30;
ld.shared.f32 %f31, [%rd14];
setp.gt.f32 %p21, %f30, %f31;
or.pred %p22, %p20, %p21;
selp.f32 %f32, %f30, %f31, %p22;
st.shared.f32 [%rd1], %f32;
$L__BB0_8:
bar.sync 0;
shr.u32 %r11, %r85, 1;
setp.gt.u32 %p23, %r85, 3;
mov.u32 %r85, %r11;
@%p23 bra $L__BB0_6;
$L__BB0_9:
setp.ne.s32 %p24, %r6, 0;
add.s32 %r70, %r7, 1;
mul.wide.u32 %rd15, %r70, 4;
add.s64 %rd3, %rd10, %rd15;
mov.f32 %f105, 0fFF800000;
@%p24 bra $L__BB0_12;
ld.shared.f32 %f105, [%rd1];
setp.lt.u32 %p25, %r5, 2;
@%p25 bra $L__BB0_12;
ld.shared.f32 %f34, [%rd3];
setp.gt.f32 %p26, %f105, %f34;
setp.nan.f32 %p27, %f105, %f105;
or.pred %p28, %p27, %p26;
selp.f32 %f105, %f105, %f34, %p28;
$L__BB0_12:
bar.sync 0;
mul.wide.s32 %rd17, %r4, 4;
add.s64 %rd4, %rd10, %rd17;
setp.eq.s32 %p29, %r6, 0;
@%p29 bra $L__BB0_13;
bra.uni $L__BB0_14;
$L__BB0_13:
st.shared.f32 [%rd4], %f105;
$L__BB0_14:
setp.lt.s32 %p30, %r3, 242;
bar.sync 0;
ld.shared.f32 %f35, [%rd4];
bar.sync 0;
sub.f32 %f36, %f101, %f35;
mov.f32 %f37, 0f3F000000;
mov.f32 %f38, 0f3BBB989D;
fma.rn.f32 %f39, %f36, %f38, %f37;
cvt.sat.f32.f32 %f40, %f39;
mov.f32 %f41, 0f4B400001;
mov.f32 %f42, 0f437C0000;
fma.rm.f32 %f43, %f40, %f42, %f41;
add.f32 %f44, %f43, 0fCB40007F;
neg.f32 %f45, %f44;
mov.f32 %f46, 0f3FB8AA3B;
fma.rn.f32 %f47, %f36, %f46, %f45;
mov.f32 %f48, 0f32A57060;
fma.rn.f32 %f49, %f36, %f48, %f47;
mov.b32 %r71, %f43;
shl.b32 %r72, %r71, 23;
mov.b32 %f50, %r72;
ex2.approx.ftz.f32 %f51, %f49;
mul.f32 %f12, %f51, %f50;
sub.f32 %f52, %f102, %f35;
fma.rn.f32 %f53, %f52, %f38, %f37;
cvt.sat.f32.f32 %f54, %f53;
fma.rm.f32 %f55, %f54, %f42, %f41;
add.f32 %f56, %f55, 0fCB40007F;
neg.f32 %f57, %f56;
fma.rn.f32 %f58, %f52, %f46, %f57;
fma.rn.f32 %f59, %f52, %f48, %f58;
mov.b32 %r73, %f55;
shl.b32 %r74, %r73, 23;
mov.b32 %f60, %r74;
ex2.approx.ftz.f32 %f61, %f59;
mul.f32 %f13, %f61, %f60;
sub.f32 %f62, %f103, %f35;
fma.rn.f32 %f63, %f62, %f38, %f37;
cvt.sat.f32.f32 %f64, %f63;
fma.rm.f32 %f65, %f64, %f42, %f41;
add.f32 %f66, %f65, 0fCB40007F;
neg.f32 %f67, %f66;
fma.rn.f32 %f68, %f62, %f46, %f67;
fma.rn.f32 %f69, %f62, %f48, %f68;
mov.b32 %r75, %f65;
shl.b32 %r76, %r75, 23;
mov.b32 %f70, %r76;
ex2.approx.ftz.f32 %f71, %f69;
mul.f32 %f14, %f71, %f70;
sub.f32 %f72, %f104, %f35;
fma.rn.f32 %f73, %f72, %f38, %f37;
cvt.sat.f32.f32 %f74, %f73;
fma.rm.f32 %f75, %f74, %f42, %f41;
add.f32 %f76, %f75, 0fCB40007F;
neg.f32 %f77, %f76;
fma.rn.f32 %f78, %f72, %f46, %f77;
fma.rn.f32 %f79, %f72, %f48, %f78;
mov.b32 %r77, %f75;
shl.b32 %r78, %r77, 23;
mov.b32 %f80, %r78;
ex2.approx.ftz.f32 %f81, %f79;
mul.f32 %f15, %f81, %f80;
add.f32 %f82, %f12, 0f00000000;
add.f32 %f83, %f82, %f13;
add.f32 %f84, %f83, %f14;
add.f32 %f85, %f84, %f15;
selp.f32 %f86, %f85, 0f00000000, %p30;
st.shared.f32 [%rd1], %f86;
bar.sync 0;
@%p14 bra $L__BB0_16;
ld.shared.f32 %f87, [%rd2];
ld.shared.f32 %f88, [%rd1];
add.f32 %f89, %f87, %f88;
st.shared.f32 [%rd1], %f89;
$L__BB0_16:
bar.sync 0;
@%p18 bra $L__BB0_20;
$L__BB0_17:
setp.ge.u32 %p33, %r6, %r86;
@%p33 bra $L__BB0_19;
add.s32 %r79, %r86, %r7;
mul.wide.s32 %rd19, %r79, 4;
add.s64 %rd21, %rd10, %rd19;
ld.shared.f32 %f90, [%rd1];
ld.shared.f32 %f91, [%rd21];
add.f32 %f92, %f91, %f90;
st.shared.f32 [%rd1], %f92;
$L__BB0_19:
bar.sync 0;
shr.u32 %r13, %r86, 1;
setp.gt.u32 %p34, %r86, 3;
mov.u32 %r86, %r13;
@%p34 bra $L__BB0_17;
$L__BB0_20:
mov.f32 %f106, 0f00000000;
@%p24 bra $L__BB0_23;
ld.shared.f32 %f94, [%rd1];
add.f32 %f106, %f94, 0f00000000;
setp.lt.u32 %p36, %r5, 2;
@%p36 bra $L__BB0_23;
ld.shared.f32 %f95, [%rd3];
add.f32 %f106, %f106, %f95;
$L__BB0_23:
bar.sync 0;
@%p24 bra $L__BB0_25;
st.shared.f32 [%rd4], %f106;
$L__BB0_25:
bar.sync 0;
ld.shared.f32 %f19, [%rd4];
bar.sync 0;
@%p2 bra $L__BB0_27;
rcp.rn.f32 %f96, %f19;
mul.f32 %f97, %f96, %f12;
mov.b32 %r80, %f97;
mul.f32 %f98, %f96, %f13;
mov.b32 %r81, %f98;
mul.f32 %f99, %f96, %f14;
mov.b32 %r82, %f99;
mul.f32 %f100, %f96, %f15;
mov.b32 %r83, %f100;
shl.b32 %r84, %r3, 2;
mul.wide.s32 %rd23, %r84, 4;
add.s64 %rd22, %rd6, %rd23;
// begin inline asm
st.global.cs.v4.s32 [%rd22], {%r80,%r81,%r82,%r83};
// end inline asm
$L__BB0_27:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -18,353 +18,288 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_0[48],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_1[48]
)
{
- .reg .pred %p<47>;
- .reg .f32 %f<125>;
- .reg .b32 %r<105>;
- .reg .b64 %rd<27>;
-
-
- ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_1];
- ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_0];
- mov.u32 %r1, %tid.x;
- setp.lt.s32 %p2, %r1, 1;
- mov.f32 %f118, 0fFF800000;
- mov.f32 %f119, 0fFF800000;
- mov.f32 %f120, 0fFF800000;
- mov.f32 %f121, 0fFF800000;
- @%p2 bra $L__BB0_1;
- bra.uni $L__BB0_3;
-
-$L__BB0_1:
- mov.u32 %r56, %tid.y;
- mov.u32 %r57, %ctaid.x;
- mov.u32 %r58, %ntid.y;
- mad.lo.s32 %r2, %r58, %r57, %r56;
- setp.gt.s32 %p3, %r2, 241;
- @%p3 bra $L__BB0_3;
-
- add.s32 %r63, %r2, %r1;
- shl.b32 %r64, %r63, 2;
- mul.wide.s32 %rd9, %r64, 4;
- add.s64 %rd8, %rd6, %rd9;
+ .reg .pred %p<39>;
+ .reg .f32 %f<107>;
+ .reg .b32 %r<87>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_1];
+ ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1__param_0];
+ mov.u32 %r54, %ctaid.x;
+ mov.u32 %r1, %ntid.y;
+ mov.u32 %r2, %tid.y;
+ mad.lo.s32 %r3, %r1, %r54, %r2;
+ setp.gt.s32 %p2, %r3, 241;
+ mov.f32 %f101, 0fFF800000;
+ mov.f32 %f102, %f101;
+ mov.f32 %f103, %f101;
+ mov.f32 %f104, %f101;
+ @%p2 bra $L__BB0_2;
+
+ shl.b32 %r59, %r3, 2;
+ mul.wide.s32 %rd8, %r59, 4;
+ add.s64 %rd7, %rd5, %rd8;
- ld.global.cs.v4.u32 {%r59,%r60,%r61,%r62}, [%rd8];
+ ld.global.cs.v4.u32 {%r55,%r56,%r57,%r58}, [%rd7];
- mov.b32 %f118, %r59;
- mov.b32 %f119, %r60;
- mov.b32 %f120, %r61;
- mov.b32 %f121, %r62;
-
-$L__BB0_3:
- setp.gt.f32 %p4, %f118, %f119;
- setp.nan.f32 %p5, %f118, %f118;
- or.pred %p6, %p5, %p4;
- selp.f32 %f35, %f118, %f119, %p6;
- setp.nan.f32 %p7, %f35, %f35;
- setp.gt.f32 %p8, %f35, %f120;
- or.pred %p9, %p7, %p8;
- selp.f32 %f36, %f35, %f120, %p9;
- setp.nan.f32 %p10, %f36, %f36;
- setp.gt.f32 %p11, %f36, %f121;
- or.pred %p12, %p10, %p11;
- selp.f32 %f37, %f36, %f121, %p12;
- mov.u32 %r65, %tid.z;
- mov.u32 %r3, %ntid.y;
- mov.u32 %r4, %tid.y;
- mad.lo.s32 %r5, %r3, %r65, %r4;
- mov.u32 %r6, %ntid.x;
- mad.lo.s32 %r7, %r5, %r6, %r1;
- mul.wide.u32 %rd10, %r7, 4;
- mov.u64 %rd11, _ZN11kernelscope6kernelE;
- add.s64 %rd1, %rd11, %rd10;
- st.shared.f32 [%rd1], %f37;
- bar.sync 0;
- clz.b32 %r66, %r6;
- mov.u32 %r67, 31;
- sub.s32 %r68, %r67, %r66;
- mov.u32 %r69, 1;
- shl.b32 %r8, %r69, %r68;
- setp.lt.u32 %p13, %r1, %r8;
- add.s32 %r70, %r8, %r1;
- setp.lt.u32 %p14, %r70, %r6;
- and.pred %p1, %p13, %p14;
- add.s32 %r71, %r7, %r8;
- mul.wide.s32 %rd12, %r71, 4;
- add.s64 %rd2, %rd11, %rd12;
- not.pred %p15, %p1;
- @%p15 bra $L__BB0_5;
-
- ld.shared.f32 %f38, [%rd2];
- ld.shared.f32 %f39, [%rd1];
- setp.nan.f32 %p16, %f39, %f39;
- setp.gt.f32 %p17, %f39, %f38;
- or.pred %p18, %p16, %p17;
- selp.f32 %f40, %f39, %f38, %p18;
- st.shared.f32 [%rd1], %f40;
-
-$L__BB0_5:
- bar.sync 0;
- shr.u32 %r72, %r8, 31;
- add.s32 %r73, %r8, %r72;
- shr.s32 %r104, %r73, 1;
- setp.lt.s32 %p19, %r8, 4;
- @%p19 bra $L__BB0_10;
-
- mov.u32 %r103, %r104;
-
-$L__BB0_7:
- setp.ge.u32 %p20, %r1, %r103;
- @%p20 bra $L__BB0_9;
-
- add.s32 %r74, %r103, %r7;
- mul.wide.s32 %rd13, %r74, 4;
- add.s64 %rd15, %rd11, %rd13;
- ld.shared.f32 %f41, [%rd1];
- setp.nan.f32 %p21, %f41, %f41;
- ld.shared.f32 %f42, [%rd15];
- setp.gt.f32 %p22, %f41, %f42;
- or.pred %p23, %p21, %p22;
- selp.f32 %f43, %f41, %f42, %p23;
- st.shared.f32 [%rd1], %f43;
+ mov.b32 %f101, %r55;
+ mov.b32 %f102, %r56;
+ mov.b32 %f103, %r57;
+ mov.b32 %f104, %r58;
+
+$L__BB0_2:
+ setp.gt.f32 %p3, %f101, %f102;
+ setp.nan.f32 %p4, %f101, %f101;
+ or.pred %p5, %p4, %p3;
+ selp.f32 %f24, %f101, %f102, %p5;
+ setp.nan.f32 %p6, %f24, %f24;
+ setp.gt.f32 %p7, %f24, %f103;
+ or.pred %p8, %p6, %p7;
+ selp.f32 %f25, %f24, %f103, %p8;
+ setp.nan.f32 %p9, %f25, %f25;
+ setp.gt.f32 %p10, %f25, %f104;
+ or.pred %p11, %p9, %p10;
+ selp.f32 %f26, %f25, %f104, %p11;
+ mov.u32 %r60, %tid.z;
+ mad.lo.s32 %r4, %r1, %r60, %r2;
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %tid.x;
+ mad.lo.s32 %r7, %r4, %r5, %r6;
+ mul.wide.u32 %rd9, %r7, 4;
+ mov.u64 %rd10, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd10, %rd9;
+ st.shared.f32 [%rd1], %f26;
+ bar.sync 0;
+ clz.b32 %r61, %r5;
+ mov.u32 %r62, 31;
+ sub.s32 %r63, %r62, %r61;
+ mov.u32 %r64, 1;
+ shl.b32 %r8, %r64, %r63;
+ setp.lt.u32 %p12, %r6, %r8;
+ add.s32 %r65, %r8, %r6;
+ setp.lt.u32 %p13, %r65, %r5;
+ and.pred %p1, %p12, %p13;
+ add.s32 %r66, %r7, %r8;
+ mul.wide.s32 %rd11, %r66, 4;
+ add.s64 %rd2, %rd10, %rd11;
+ not.pred %p14, %p1;
+ @%p14 bra $L__BB0_4;
+
+ ld.shared.f32 %f27, [%rd2];
+ ld.shared.f32 %f28, [%rd1];
+ setp.nan.f32 %p15, %f28, %f28;
+ setp.gt.f32 %p16, %f28, %f27;
+ or.pred %p17, %p15, %p16;
+ selp.f32 %f29, %f28, %f27, %p17;
+ st.shared.f32 [%rd1], %f29;
+
+$L__BB0_4:
+ bar.sync 0;
+ shr.u32 %r67, %r8, 31;
+ add.s32 %r68, %r8, %r67;
+ shr.s32 %r86, %r68, 1;
+ setp.lt.s32 %p18, %r8, 4;
+ @%p18 bra $L__BB0_9;
+
+ mov.u32 %r85, %r86;
+
+$L__BB0_6:
+ setp.ge.u32 %p19, %r6, %r85;
+ @%p19 bra $L__BB0_8;
+
+ add.s32 %r69, %r85, %r7;
+ mul.wide.s32 %rd12, %r69, 4;
+ add.s64 %rd14, %rd10, %rd12;
+ ld.shared.f32 %f30, [%rd1];
+ setp.nan.f32 %p20, %f30, %f30;
+ ld.shared.f32 %f31, [%rd14];
+ setp.gt.f32 %p21, %f30, %f31;
+ or.pred %p22, %p20, %p21;
+ selp.f32 %f32, %f30, %f31, %p22;
+ st.shared.f32 [%rd1], %f32;
+
+$L__BB0_8:
+ bar.sync 0;
+ shr.u32 %r11, %r85, 1;
+ setp.gt.u32 %p23, %r85, 3;
+ mov.u32 %r85, %r11;
+ @%p23 bra $L__BB0_6;
$L__BB0_9:
- bar.sync 0;
- shr.u32 %r11, %r103, 1;
- setp.gt.u32 %p24, %r103, 3;
- mov.u32 %r103, %r11;
- @%p24 bra $L__BB0_7;
-
-$L__BB0_10:
- setp.ne.s32 %p25, %r1, 0;
- add.s32 %r75, %r7, 1;
- mul.wide.u32 %rd16, %r75, 4;
- add.s64 %rd3, %rd11, %rd16;
- mov.f32 %f122, 0fFF800000;
- @%p25 bra $L__BB0_13;
-
- ld.shared.f32 %f122, [%rd1];
- setp.lt.u32 %p26, %r6, 2;
- @%p26 bra $L__BB0_13;
-
- ld.shared.f32 %f45, [%rd3];
- setp.gt.f32 %p27, %f122, %f45;
- setp.nan.f32 %p28, %f122, %f122;
- or.pred %p29, %p28, %p27;
- selp.f32 %f122, %f122, %f45, %p29;
+ setp.ne.s32 %p24, %r6, 0;
+ add.s32 %r70, %r7, 1;
+ mul.wide.u32 %rd15, %r70, 4;
+ add.s64 %rd3, %rd10, %rd15;
+ mov.f32 %f105, 0fFF800000;
+ @%p24 bra $L__BB0_12;
+
+ ld.shared.f32 %f105, [%rd1];
+ setp.lt.u32 %p25, %r5, 2;
+ @%p25 bra $L__BB0_12;
+
+ ld.shared.f32 %f34, [%rd3];
+ setp.gt.f32 %p26, %f105, %f34;
+ setp.nan.f32 %p27, %f105, %f105;
+ or.pred %p28, %p27, %p26;
+ selp.f32 %f105, %f105, %f34, %p28;
+
+$L__BB0_12:
+ bar.sync 0;
+ mul.wide.s32 %rd17, %r4, 4;
+ add.s64 %rd4, %rd10, %rd17;
+ setp.eq.s32 %p29, %r6, 0;
+ @%p29 bra $L__BB0_13;
+ bra.uni $L__BB0_14;
$L__BB0_13:
- bar.sync 0;
- mul.wide.s32 %rd18, %r5, 4;
- add.s64 %rd4, %rd11, %rd18;
- setp.eq.s32 %p30, %r1, 0;
- @%p30 bra $L__BB0_14;
- bra.uni $L__BB0_15;
+ st.shared.f32 [%rd4], %f105;
$L__BB0_14:
- st.shared.f32 [%rd4], %f122;
-
-$L__BB0_15:
- bar.sync 0;
- ld.shared.f32 %f46, [%rd4];
- bar.sync 0;
- sub.f32 %f47, %f118, %f46;
- mov.f32 %f48, 0f3F000000;
- mov.f32 %f49, 0f3BBB989D;
- fma.rn.f32 %f50, %f47, %f49, %f48;
- cvt.sat.f32.f32 %f51, %f50;
- mov.f32 %f52, 0f4B400001;
- mov.f32 %f53, 0f437C0000;
- fma.rm.f32 %f54, %f51, %f53, %f52;
- add.f32 %f55, %f54, 0fCB40007F;
- neg.f32 %f56, %f55;
- mov.f32 %f57, 0f3FB8AA3B;
- fma.rn.f32 %f58, %f47, %f57, %f56;
- mov.f32 %f59, 0f32A57060;
- fma.rn.f32 %f60, %f47, %f59, %f58;
- mov.b32 %r76, %f54;
- shl.b32 %r77, %r76, 23;
- mov.b32 %f61, %r77;
- ex2.approx.ftz.f32 %f62, %f60;
- mul.f32 %f12, %f62, %f61;
- sub.f32 %f63, %f119, %f46;
- fma.rn.f32 %f64, %f63, %f49, %f48;
- cvt.sat.f32.f32 %f65, %f64;
- fma.rm.f32 %f66, %f65, %f53, %f52;
- add.f32 %f67, %f66, 0fCB40007F;
- neg.f32 %f68, %f67;
- fma.rn.f32 %f69, %f63, %f57, %f68;
- fma.rn.f32 %f70, %f63, %f59, %f69;
- mov.b32 %r78, %f66;
- shl.b32 %r79, %r78, 23;
- mov.b32 %f71, %r79;
- ex2.approx.ftz.f32 %f72, %f70;
- mul.f32 %f13, %f72, %f71;
- sub.f32 %f73, %f120, %f46;
- fma.rn.f32 %f74, %f73, %f49, %f48;
- cvt.sat.f32.f32 %f75, %f74;
- fma.rm.f32 %f76, %f75, %f53, %f52;
- add.f32 %f77, %f76, 0fCB40007F;
- neg.f32 %f78, %f77;
- fma.rn.f32 %f79, %f73, %f57, %f78;
- fma.rn.f32 %f80, %f73, %f59, %f79;
- mov.b32 %r80, %f76;
- shl.b32 %r81, %r80, 23;
- mov.b32 %f81, %r81;
- ex2.approx.ftz.f32 %f82, %f80;
- mul.f32 %f14, %f82, %f81;
- sub.f32 %f83, %f121, %f46;
- fma.rn.f32 %f84, %f83, %f49, %f48;
- cvt.sat.f32.f32 %f85, %f84;
- fma.rm.f32 %f86, %f85, %f53, %f52;
- add.f32 %f87, %f86, 0fCB40007F;
- neg.f32 %f88, %f87;
- fma.rn.f32 %f89, %f83, %f57, %f88;
- fma.rn.f32 %f90, %f83, %f59, %f89;
- mov.b32 %r82, %f86;
- shl.b32 %r83, %r82, 23;
- mov.b32 %f91, %r83;
- ex2.approx.ftz.f32 %f92, %f90;
- mul.f32 %f15, %f92, %f91;
- @%p2 bra $L__BB0_16;
- bra.uni $L__BB0_17;
+ setp.lt.s32 %p30, %r3, 242;
+ bar.sync 0;
+ ld.shared.f32 %f35, [%rd4];
+ bar.sync 0;
+ sub.f32 %f36, %f101, %f35;
+ mov.f32 %f37, 0f3F000000;
+ mov.f32 %f38, 0f3BBB989D;
+ fma.rn.f32 %f39, %f36, %f38, %f37;
+ cvt.sat.f32.f32 %f40, %f39;
+ mov.f32 %f41, 0f4B400001;
+ mov.f32 %f42, 0f437C0000;
+ fma.rm.f32 %f43, %f40, %f42, %f41;
+ add.f32 %f44, %f43, 0fCB40007F;
+ neg.f32 %f45, %f44;
+ mov.f32 %f46, 0f3FB8AA3B;
+ fma.rn.f32 %f47, %f36, %f46, %f45;
+ mov.f32 %f48, 0f32A57060;
+ fma.rn.f32 %f49, %f36, %f48, %f47;
+ mov.b32 %r71, %f43;
+ shl.b32 %r72, %r71, 23;
+ mov.b32 %f50, %r72;
+ ex2.approx.ftz.f32 %f51, %f49;
+ mul.f32 %f12, %f51, %f50;
+ sub.f32 %f52, %f102, %f35;
+ fma.rn.f32 %f53, %f52, %f38, %f37;
+ cvt.sat.f32.f32 %f54, %f53;
+ fma.rm.f32 %f55, %f54, %f42, %f41;
+ add.f32 %f56, %f55, 0fCB40007F;
+ neg.f32 %f57, %f56;
+ fma.rn.f32 %f58, %f52, %f46, %f57;
+ fma.rn.f32 %f59, %f52, %f48, %f58;
+ mov.b32 %r73, %f55;
+ shl.b32 %r74, %r73, 23;
+ mov.b32 %f60, %r74;
+ ex2.approx.ftz.f32 %f61, %f59;
+ mul.f32 %f13, %f61, %f60;
+ sub.f32 %f62, %f103, %f35;
+ fma.rn.f32 %f63, %f62, %f38, %f37;
+ cvt.sat.f32.f32 %f64, %f63;
+ fma.rm.f32 %f65, %f64, %f42, %f41;
+ add.f32 %f66, %f65, 0fCB40007F;
+ neg.f32 %f67, %f66;
+ fma.rn.f32 %f68, %f62, %f46, %f67;
+ fma.rn.f32 %f69, %f62, %f48, %f68;
+ mov.b32 %r75, %f65;
+ shl.b32 %r76, %r75, 23;
+ mov.b32 %f70, %r76;
+ ex2.approx.ftz.f32 %f71, %f69;
+ mul.f32 %f14, %f71, %f70;
+ sub.f32 %f72, %f104, %f35;
+ fma.rn.f32 %f73, %f72, %f38, %f37;
+ cvt.sat.f32.f32 %f74, %f73;
+ fma.rm.f32 %f75, %f74, %f42, %f41;
+ add.f32 %f76, %f75, 0fCB40007F;
+ neg.f32 %f77, %f76;
+ fma.rn.f32 %f78, %f72, %f46, %f77;
+ fma.rn.f32 %f79, %f72, %f48, %f78;
+ mov.b32 %r77, %f75;
+ shl.b32 %r78, %r77, 23;
+ mov.b32 %f80, %r78;
+ ex2.approx.ftz.f32 %f81, %f79;
+ mul.f32 %f15, %f81, %f80;
+ add.f32 %f82, %f12, 0f00000000;
+ add.f32 %f83, %f82, %f13;
+ add.f32 %f84, %f83, %f14;
+ add.f32 %f85, %f84, %f15;
+ selp.f32 %f86, %f85, 0f00000000, %p30;
+ st.shared.f32 [%rd1], %f86;
+ bar.sync 0;
+ @%p14 bra $L__BB0_16;
+
+ ld.shared.f32 %f87, [%rd2];
+ ld.shared.f32 %f88, [%rd1];
+ add.f32 %f89, %f87, %f88;
+ st.shared.f32 [%rd1], %f89;
$L__BB0_16:
- mov.u32 %r84, %ctaid.x;
- mad.lo.s32 %r85, %r3, %r84, %r4;
- setp.lt.s32 %p32, %r85, 242;
- @%p32 bra $L__BB0_18;
- bra.uni $L__BB0_17;
-
-$L__BB0_18:
- add.f32 %f97, %f12, 0f00000000;
- add.f32 %f98, %f97, %f13;
- add.f32 %f99, %f98, %f14;
- add.f32 %f123, %f99, %f15;
- bra.uni $L__BB0_19;
+ bar.sync 0;
+ @%p18 bra $L__BB0_20;
$L__BB0_17:
- mov.u32 %r86, %ctaid.x;
- mad.lo.s32 %r87, %r3, %r86, %r4;
- setp.lt.s32 %p34, %r87, 242;
- and.pred %p35, %p2, %p34;
- add.f32 %f93, %f12, 0f00000000;
- add.f32 %f94, %f93, %f13;
- add.f32 %f95, %f94, %f14;
- add.f32 %f96, %f95, %f15;
- selp.f32 %f123, %f96, 0f00000000, %p35;
+ setp.ge.u32 %p33, %r6, %r86;
+ @%p33 bra $L__BB0_19;
+
+ add.s32 %r79, %r86, %r7;
+ mul.wide.s32 %rd19, %r79, 4;
+ add.s64 %rd21, %rd10, %rd19;
+ ld.shared.f32 %f90, [%rd1];
+ ld.shared.f32 %f91, [%rd21];
+ add.f32 %f92, %f91, %f90;
+ st.shared.f32 [%rd1], %f92;
$L__BB0_19:
- st.shared.f32 [%rd1], %f123;
- bar.sync 0;
- @%p15 bra $L__BB0_21;
-
- ld.shared.f32 %f100, [%rd2];
- ld.shared.f32 %f101, [%rd1];
- add.f32 %f102, %f100, %f101;
- st.shared.f32 [%rd1], %f102;
-
-$L__BB0_21:
- bar.sync 0;
- @%p19 bra $L__BB0_25;
-
-$L__BB0_22:
- setp.ge.u32 %p38, %r1, %r104;
- @%p38 bra $L__BB0_24;
-
- add.s32 %r88, %r104, %r7;
- mul.wide.s32 %rd20, %r88, 4;
- add.s64 %rd22, %rd11, %rd20;
- ld.shared.f32 %f103, [%rd1];
- ld.shared.f32 %f104, [%rd22];
- add.f32 %f105, %f104, %f103;
- st.shared.f32 [%rd1], %f105;
-
-$L__BB0_24:
- bar.sync 0;
- shr.u32 %r13, %r104, 1;
- setp.gt.u32 %p39, %r104, 3;
- mov.u32 %r104, %r13;
- @%p39 bra $L__BB0_22;
+ bar.sync 0;
+ shr.u32 %r13, %r86, 1;
+ setp.gt.u32 %p34, %r86, 3;
+ mov.u32 %r86, %r13;
+ @%p34 bra $L__BB0_17;
+
+$L__BB0_20:
+ mov.f32 %f106, 0f00000000;
+ @%p24 bra $L__BB0_23;
+
+ ld.shared.f32 %f94, [%rd1];
+ add.f32 %f106, %f94, 0f00000000;
+ setp.lt.u32 %p36, %r5, 2;
+ @%p36 bra $L__BB0_23;
+
+ ld.shared.f32 %f95, [%rd3];
+ add.f32 %f106, %f106, %f95;
+
+$L__BB0_23:
+ bar.sync 0;
+ @%p24 bra $L__BB0_25;
+
+ st.shared.f32 [%rd4], %f106;
$L__BB0_25:
- mov.f32 %f124, 0f00000000;
- @%p25 bra $L__BB0_28;
-
- ld.shared.f32 %f107, [%rd1];
- add.f32 %f124, %f107, 0f00000000;
- setp.lt.u32 %p41, %r6, 2;
- @%p41 bra $L__BB0_28;
-
- ld.shared.f32 %f108, [%rd3];
- add.f32 %f124, %f124, %f108;
-
-$L__BB0_28:
- bar.sync 0;
- @%p25 bra $L__BB0_30;
-
- st.shared.f32 [%rd4], %f124;
-
-$L__BB0_30:
- setp.gt.s32 %p43, %r1, 0;
- bar.sync 0;
- ld.shared.f32 %f109, [%rd4];
- bar.sync 0;
- rcp.rn.f32 %f22, %f109;
- @%p43 bra $L__BB0_32;
-
- mov.u32 %r89, %ctaid.x;
- mad.lo.s32 %r14, %r3, %r89, %r4;
- setp.lt.s32 %p44, %r14, 242;
- @%p44 bra $L__BB0_35;
- bra.uni $L__BB0_32;
-
-$L__BB0_35:
- mul.f32 %f110, %f22, %f12;
- mov.b32 %r97, %f110;
- mul.f32 %f111, %f22, %f13;
- mov.b32 %r98, %f111;
- mul.f32 %f112, %f22, %f14;
- mov.b32 %r99, %f112;
- mul.f32 %f113, %f22, %f15;
- mov.b32 %r100, %f113;
- add.s32 %r101, %r14, %r1;
- shl.b32 %r102, %r101, 2;
- mul.wide.s32 %rd26, %r102, 4;
- add.s64 %rd25, %rd7, %rd26;
+ bar.sync 0;
+ ld.shared.f32 %f19, [%rd4];
+ bar.sync 0;
+ @%p2 bra $L__BB0_27;
+
+ rcp.rn.f32 %f96, %f19;
+ mul.f32 %f97, %f96, %f12;
+ mov.b32 %r80, %f97;
+ mul.f32 %f98, %f96, %f13;
+ mov.b32 %r81, %f98;
+ mul.f32 %f99, %f96, %f14;
+ mov.b32 %r82, %f99;
+ mul.f32 %f100, %f96, %f15;
+ mov.b32 %r83, %f100;
+ shl.b32 %r84, %r3, 2;
+ mul.wide.s32 %rd23, %r84, 4;
+ add.s64 %rd22, %rd6, %rd23;
- st.global.cs.v4.s32 [%rd25], {%r97,%r98,%r99,%r100};
+ st.global.cs.v4.s32 [%rd22], {%r80,%r81,%r82,%r83};
- bra.uni $L__BB0_36;
-
-$L__BB0_32:
- mul.f32 %f23, %f22, %f12;
- mul.f32 %f24, %f22, %f13;
- mul.f32 %f25, %f22, %f14;
- mul.f32 %f26, %f22, %f15;
- @%p43 bra $L__BB0_36;
-
- mov.u32 %r90, %ctaid.x;
- mad.lo.s32 %r15, %r3, %r90, %r4;
- setp.gt.s32 %p46, %r15, 241;
- @%p46 bra $L__BB0_36;
-
- add.s32 %r95, %r15, %r1;
- shl.b32 %r96, %r95, 2;
- mul.wide.s32 %rd24, %r96, 4;
- add.s64 %rd23, %rd7, %rd24;
- mov.b32 %r91, %f23;
- mov.b32 %r92, %f24;
- mov.b32 %r93, %f25;
- mov.b32 %r94, %f26;
-
- st.global.cs.v4.s32 [%rd23], {%r91,%r92,%r93,%r94};
-
-
-$L__BB0_36:
+
+$L__BB0_27:
ret;
}
Kernel 71
CUDA
PTX
53997da5d
Diff
03a1b695e
-8
+8 index type: int
registers: 0
gmem: 0
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 3, 3> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 2, 2> T13;
T13.set(float(NEG_INFINITY));
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T15[0] = fmax(
T15[0],
T13[i0]);
}
Array<float, 1, 1> T5;
T5[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T5[0], T15[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T8 = T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
Array<float, 1, 1> T7;
T7[0]
= T13[i1]
- T6[0];
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T16[0]
= T16[0]
+ T8[i2];
}
}
}
Array<float, 1, 1> T9;
T9[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T9[0], T16[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
// NVFuser codegen output — machine-generated; fix issues in the code generator,
// not by hand-editing this dump.
//
// Softmax-style computation: each thread owns a 2-element vector slice of T4 and
// the block cooperates to produce T12 = expf(T4 - max) * reciprocal(sum(expf(...))).
// Pipeline visible below:
//   1) vectorized (vec_size=2) streaming load of T4 into registers (T13)
//   2) per-thread max (T15), block-wide max reduce (T5), broadcast (T6)
//   3) T8 = expf(T13 - T6), reusing T13's registers (alias T8 = T13)
//   4) per-thread sum (T16), block-wide sum reduce (T9), broadcast (T10)
//   5) T14 = T8 * reciprocal(T10), vectorized store to T12
//
// 27454 is the baked-in extent of the row index
// threadIdx.y + blockDim.y * blockIdx.x; every global access is guarded by it.
// NOTE(review): grid/block shape is chosen by the scheduler and not visible
// here — confirm launch configuration against the fusion definition.
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 3, 3> T12) {
// Dynamic shared memory; used as scratch by blockReduce / blockBroadcast below.
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
// Per-thread register buffer for the 2 input elements. Initialized to
// NEG_INFINITY (the identity of fmax) so out-of-bounds threads do not
// perturb the max reduction.
Array<float, 2, 2> T13;
T13.set(float(NEG_INFINITY));
// Bounds guard on the row index; only in-range rows load real data.
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
// 2-wide vectorized load with a streaming cache hint (input read once).
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
// Serial per-thread max over this thread's 2 elements.
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T15[0] = fmax(
T15[0],
T13[i0]);
}
// Block-wide max reduction into T5 (template flags select which thread dims
// participate; NEG_INFINITY is passed as the reduction identity).
Array<float, 1, 1> T5;
T5[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T5[0], T15[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
// Broadcast the block max back to every thread as T6.
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
// T8 reuses T13's registers: T13[i1] is fully consumed before T8[i1] is
// written in each iteration, so the in-place overwrite is safe.
auto& T8 = T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
Array<float, 1, 1> T7;
// Subtract the block max (numerical-stability shift), then exponentiate.
T7[0]
= T13[i1]
- T6[0];
T8[i1]
= expf(T7[0]);
}
// Per-thread partial sum of the exponentials. The else-branch repeats the
// identical predicate inside the loop — a codegen unswitching artifact; both
// branches accumulate the same value (0 for out-of-range rows).
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T16[0]
= T16[0]
+ T8[i2];
}
}
}
// Block-wide sum reduction into T9 (identity 0), then broadcast as T10.
Array<float, 1, 1> T9;
T9[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T9[0], T16[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// One reciprocal per thread instead of 2 divisions in the loop below.
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
// Normalize and store. Same unswitching pattern as the sum above: the
// else-branch guards the store per-statement with the identical predicate,
// and the store predicate matches the load predicate at the top.
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
// 2-wide vectorized store of the normalized values.
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,12 +1,12 @@
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 3, 3> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 2, 2> T13;
T13.set(float(NEG_INFINITY));
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
- loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
+ loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
@@ -30,21 +30,21 @@
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T16[0]
= T16[0]
+ T8[i2];
}
}
@@ -55,27 +55,27 @@
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_191105arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_1911010nvfuser_76ENS_6TensorIfLi3ELi3EEES1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_1911010nvfuser_76ENS_6TensorIfLi3ELi3EEES1__param_0[32],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_1911010nvfuser_76ENS_6TensorIfLi3ELi3EEES1__param_1[32]
)
{
.reg .pred %p<41>;
.reg .f32 %f<81>;
.reg .b32 %r<79>;
.reg .b64 %rd<27>;
ld.param.u64 %rd7, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_1911010nvfuser_76ENS_6TensorIfLi3ELi3EEES1__param_1];
ld.param.u64 %rd6, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_1911010nvfuser_76ENS_6TensorIfLi3ELi3EEES1__param_0];
mov.u32 %r1, %tid.x;
setp.lt.s32 %p2, %r1, 1;
mov.f32 %f76, 0fFF800000;
mov.f32 %f77, 0fFF800000;
@%p2 bra $L__BB0_1;
bra.uni $L__BB0_3;
$L__BB0_1:
mov.u32 %r40, %tid.y;
mov.u32 %r41, %ctaid.x;
mov.u32 %r42, %ntid.y;
mad.lo.s32 %r2, %r42, %r41, %r40;
setp.gt.s32 %p3, %r2, 27453;
@%p3 bra $L__BB0_3;
add.s32 %r45, %r2, %r1;
shl.b32 %r46, %r45, 1;
mul.wide.s32 %rd9, %r46, 4;
add.s64 %rd8, %rd6, %rd9;
// begin inline asm
ld.global.cs.v2.u32 {%r43,%r44}, [%rd8];
// end inline asm
mov.b32 %f76, %r43;
mov.b32 %f77, %r44;
$L__BB0_3:
setp.gt.f32 %p4, %f76, %f77;
setp.nan.f32 %p5, %f76, %f76;
or.pred %p6, %p5, %p4;
selp.f32 %f23, %f76, %f77, %p6;
mov.u32 %r47, %tid.z;
mov.u32 %r3, %ntid.y;
mov.u32 %r4, %tid.y;
mad.lo.s32 %r5, %r3, %r47, %r4;
mov.u32 %r6, %ntid.x;
mad.lo.s32 %r7, %r5, %r6, %r1;
mul.wide.u32 %rd10, %r7, 4;
mov.u64 %rd11, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_012eb9a7_191105arrayE;
add.s64 %rd1, %rd11, %rd10;
st.shared.f32 [%rd1], %f23;
bar.sync 0;
clz.b32 %r48, %r6;
mov.u32 %r49, 31;
sub.s32 %r50, %r49, %r48;
mov.u32 %r51, 1;
shl.b32 %r8, %r51, %r50;
setp.lt.u32 %p7, %r1, %r8;
add.s32 %r52, %r8, %r1;
setp.lt.u32 %p8, %r52, %r6;
and.pred %p1, %p7, %p8;
add.s32 %r53, %r7, %r8;
mul.wide.s32 %rd12, %r53, 4;
add.s64 %rd2, %rd11, %rd12;
not.pred %p9, %p1;
@%p9 bra $L__BB0_5;
ld.shared.f32 %f24, [%rd2];
ld.shared.f32 %f25, [%rd1];
setp.nan.f32 %p10, %f25, %f25;
setp.gt.f32 %p11, %f25, %f24;
or.pred %p12, %p10, %p11;
selp.f32 %f26, %f25, %f24, %p12;
st.shared.f32 [%rd1], %f26;
$L__BB0_5:
bar.sync 0;
shr.u32 %r54, %r8, 31;
add.s32 %r55, %r8, %r54;
shr.s32 %r78, %r55, 1;
setp.lt.s32 %p13, %r8, 4;
@%p13 bra $L__BB0_10;
mov.u32 %r77, %r78;
$L__BB0_7:
setp.ge.u32 %p14, %r1, %r77;
@%p14 bra $L__BB0_9;
add.s32 %r56, %r77, %r7;
mul.wide.s32 %rd13, %r56, 4;
add.s64 %rd15, %rd11, %rd13;
ld.shared.f32 %f27, [%rd1];
setp.nan.f32 %p15, %f27, %f27;
ld.shared.f32 %f28, [%rd15];
setp.gt.f32 %p16, %f27, %f28;
or.pred %p17, %p15, %p16;
selp.f32 %f29, %f27, %f28, %p17;
st.shared.f32 [%rd1], %f29;
$L__BB0_9:
bar.sync 0;
shr.u32 %r11, %r77, 1;
setp.gt.u32 %p18, %r77, 3;
mov.u32 %r77, %r11;
@%p18 bra $L__BB0_7;
$L__BB0_10:
setp.ne.s32 %p19, %r1, 0;
add.s32 %r57, %r7, 1;
mul.wide.u32 %rd16, %r57, 4;
add.s64 %rd3, %rd11, %rd16;
mov.f32 %f78, 0fFF800000;
@%p19 bra $L__BB0_13;
ld.shared.f32 %f78, [%rd1];
setp.lt.u32 %p20, %r6, 2;
@%p20 bra $L__BB0_13;
ld.shared.f32 %f31, [%rd3];
setp.gt.f32 %p21, %f78, %f31;
setp.nan.f32 %p22, %f78, %f78;
or.pred %p23, %p22, %p21;
selp.f32 %f78, %f78, %f31, %p23;
$L__BB0_13:
bar.sync 0;
mul.wide.s32 %rd18, %r5, 4;
add.s64 %rd4, %rd11, %rd18;
setp.eq.s32 %p24, %r1, 0;
@%p24 bra $L__BB0_14;
bra.uni $L__BB0_15;
$L__BB0_14:
st.shared.f32 [%rd4], %f78;
$L__BB0_15:
bar.sync 0;
ld.shared.f32 %f32, [%rd4];
bar.sync 0;
sub.f32 %f33, %f76, %f32;
mov.f32 %f34, 0f3F000000;
mov.f32 %f35, 0f3BBB989D;
fma.rn.f32 %f36, %f33, %f35, %f34;
cvt.sat.f32.f32 %f37, %f36;
mov.f32 %f38, 0f4B400001;
mov.f32 %f39, 0f437C0000;
fma.rm.f32 %f40, %f37, %f39, %f38;
add.f32 %f41, %f40, 0fCB40007F;
neg.f32 %f42, %f41;
mov.f32 %f43, 0f3FB8AA3B;
fma.rn.f32 %f44, %f33, %f43, %f42;
mov.f32 %f45, 0f32A57060;
fma.rn.f32 %f46, %f33, %f45, %f44;
mov.b32 %r58, %f40;
shl.b32 %r59, %r58, 23;
mov.b32 %f47, %r59;
ex2.approx.ftz.f32 %f48, %f46;
mul.f32 %f8, %f48, %f47;
sub.f32 %f49, %f77, %f32;
fma.rn.f32 %f50, %f49, %f35, %f34;
cvt.sat.f32.f32 %f51, %f50;
fma.rm.f32 %f52, %f51, %f39, %f38;
add.f32 %f53, %f52, 0fCB40007F;
neg.f32 %f54, %f53;
fma.rn.f32 %f55, %f49, %f43, %f54;
fma.rn.f32 %f56, %f49, %f45, %f55;
mov.b32 %r60, %f52;
shl.b32 %r61, %r60, 23;
mov.b32 %f57, %r61;
ex2.approx.ftz.f32 %f58, %f56;
mul.f32 %f9, %f58, %f57;
@%p2 bra $L__BB0_16;
bra.uni $L__BB0_17;
$L__BB0_16:
mov.u32 %r62, %ctaid.x;
mad.lo.s32 %r63, %r3, %r62, %r4;
setp.lt.s32 %p26, %r63, 27454;
@%p26 bra $L__BB0_18;
bra.uni $L__BB0_17;
$L__BB0_18:
add.f32 %f61, %f8, 0f00000000;
add.f32 %f79, %f61, %f9;
bra.uni $L__BB0_19;
$L__BB0_17:
mov.u32 %r64, %ctaid.x;
mad.lo.s32 %r65, %r3, %r64, %r4;
setp.lt.s32 %p28, %r65, 27454;
and.pred %p29, %p2, %p28;
add.f32 %f59, %f8, 0f00000000;
add.f32 %f60, %f59, %f9;
selp.f32 %f79, %f60, 0f00000000, %p29;
$L__BB0_19:
st.shared.f32 [%rd1], %f79;
bar.sync 0;
@%p9 bra $L__BB0_21;
ld.shared.f32 %f62, [%rd2];
ld.shared.f32 %f63, [%rd1];
add.f32 %f64, %f62, %f63;
st.shared.f32 [%rd1], %f64;
$L__BB0_21:
bar.sync 0;
@%p13 bra $L__BB0_25;
$L__BB0_22:
setp.ge.u32 %p32, %r1, %r78;
@%p32 bra $L__BB0_24;
add.s32 %r66, %r78, %r7;
mul.wide.s32 %rd20, %r66, 4;
add.s64 %rd22, %rd11, %rd20;
ld.shared.f32 %f65, [%rd1];
ld.shared.f32 %f66, [%rd22];
add.f32 %f67, %f66, %f65;
st.shared.f32 [%rd1], %f67;
$L__BB0_24:
bar.sync 0;
shr.u32 %r13, %r78, 1;
setp.gt.u32 %p33, %r78, 3;
mov.u32 %r78, %r13;
@%p33 bra $L__BB0_22;
$L__BB0_25:
mov.f32 %f80, 0f00000000;
@%p19 bra $L__BB0_28;
ld.shared.f32 %f69, [%rd1];
add.f32 %f80, %f69, 0f00000000;
setp.lt.u32 %p35, %r6, 2;
@%p35 bra $L__BB0_28;
ld.shared.f32 %f70, [%rd3];
add.f32 %f80, %f80, %f70;
$L__BB0_28:
bar.sync 0;
@%p19 bra $L__BB0_30;
st.shared.f32 [%rd4], %f80;
$L__BB0_30:
setp.gt.s32 %p37, %r1, 0;
bar.sync 0;
ld.shared.f32 %f71, [%rd4];
bar.sync 0;
rcp.rn.f32 %f16, %f71;
@%p37 bra $L__BB0_32;
mov.u32 %r67, %ctaid.x;
mad.lo.s32 %r14, %r3, %r67, %r4;
setp.lt.s32 %p38, %r14, 27454;
@%p38 bra $L__BB0_35;
bra.uni $L__BB0_32;
$L__BB0_35:
mul.f32 %f72, %f16, %f8;
mov.b32 %r73, %f72;
mul.f32 %f73, %f16, %f9;
mov.b32 %r74, %f73;
add.s32 %r75, %r14, %r1;
shl.b32 %r76, %r75, 1;
mul.wide.s32 %rd26, %r76, 4;
add.s64 %rd25, %rd7, %rd26;
// begin inline asm
st.global.cs.v2.s32 [%rd25], {%r73,%r74};
// end inline asm
bra.uni $L__BB0_36;
$L__BB0_32:
mul.f32 %f17, %f16, %f8;
mul.f32 %f18, %f16, %f9;
@%p37 bra $L__BB0_36;
mov.u32 %r68, %ctaid.x;
mad.lo.s32 %r15, %r3, %r68, %r4;
setp.gt.s32 %p40, %r15, 27453;
@%p40 bra $L__BB0_36;
add.s32 %r71, %r15, %r1;
shl.b32 %r72, %r71, 1;
mul.wide.s32 %rd24, %r72, 4;
add.s64 %rd23, %rd7, %rd24;
mov.b32 %r69, %f17;
mov.b32 %r70, %f18;
// begin inline asm
st.global.cs.v2.s32 [%rd23], {%r69,%r70};
// end inline asm
$L__BB0_36:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_160115arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_1601110nvfuser_76ENS_6TensorIfLi3ELi3EEES1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_1601110nvfuser_76ENS_6TensorIfLi3ELi3EEES1__param_0[32],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_1601110nvfuser_76ENS_6TensorIfLi3ELi3EEES1__param_1[32]
)
{
.reg .pred %p<33>;
.reg .f32 %f<71>;
.reg .b32 %r<63>;
.reg .b64 %rd<24>;
ld.param.u64 %rd6, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_1601110nvfuser_76ENS_6TensorIfLi3ELi3EEES1__param_1];
ld.param.u64 %rd5, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_1601110nvfuser_76ENS_6TensorIfLi3ELi3EEES1__param_0];
mov.u32 %r38, %ctaid.x;
mov.u32 %r1, %ntid.y;
mov.u32 %r2, %tid.y;
mad.lo.s32 %r3, %r1, %r38, %r2;
setp.gt.s32 %p2, %r3, 27453;
mov.f32 %f67, 0fFF800000;
mov.f32 %f68, %f67;
@%p2 bra $L__BB0_2;
shl.b32 %r41, %r3, 1;
mul.wide.s32 %rd8, %r41, 4;
add.s64 %rd7, %rd5, %rd8;
// begin inline asm
ld.global.cs.v2.u32 {%r39,%r40}, [%rd7];
// end inline asm
mov.b32 %f67, %r39;
mov.b32 %f68, %r40;
$L__BB0_2:
setp.gt.f32 %p3, %f67, %f68;
setp.nan.f32 %p4, %f67, %f67;
or.pred %p5, %p4, %p3;
selp.f32 %f16, %f67, %f68, %p5;
mov.u32 %r42, %tid.z;
mad.lo.s32 %r4, %r1, %r42, %r2;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r7, %r4, %r5, %r6;
mul.wide.u32 %rd9, %r7, 4;
mov.u64 %rd10, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_76_cu_386ea3a4_160115arrayE;
add.s64 %rd1, %rd10, %rd9;
st.shared.f32 [%rd1], %f16;
bar.sync 0;
clz.b32 %r43, %r5;
mov.u32 %r44, 31;
sub.s32 %r45, %r44, %r43;
mov.u32 %r46, 1;
shl.b32 %r8, %r46, %r45;
setp.lt.u32 %p6, %r6, %r8;
add.s32 %r47, %r8, %r6;
setp.lt.u32 %p7, %r47, %r5;
and.pred %p1, %p6, %p7;
add.s32 %r48, %r7, %r8;
mul.wide.s32 %rd11, %r48, 4;
add.s64 %rd2, %rd10, %rd11;
not.pred %p8, %p1;
@%p8 bra $L__BB0_4;
ld.shared.f32 %f17, [%rd2];
ld.shared.f32 %f18, [%rd1];
setp.nan.f32 %p9, %f18, %f18;
setp.gt.f32 %p10, %f18, %f17;
or.pred %p11, %p9, %p10;
selp.f32 %f19, %f18, %f17, %p11;
st.shared.f32 [%rd1], %f19;
$L__BB0_4:
bar.sync 0;
shr.u32 %r49, %r8, 31;
add.s32 %r50, %r8, %r49;
shr.s32 %r62, %r50, 1;
setp.lt.s32 %p12, %r8, 4;
@%p12 bra $L__BB0_9;
mov.u32 %r61, %r62;
$L__BB0_6:
setp.ge.u32 %p13, %r6, %r61;
@%p13 bra $L__BB0_8;
add.s32 %r51, %r61, %r7;
mul.wide.s32 %rd12, %r51, 4;
add.s64 %rd14, %rd10, %rd12;
ld.shared.f32 %f20, [%rd1];
setp.nan.f32 %p14, %f20, %f20;
ld.shared.f32 %f21, [%rd14];
setp.gt.f32 %p15, %f20, %f21;
or.pred %p16, %p14, %p15;
selp.f32 %f22, %f20, %f21, %p16;
st.shared.f32 [%rd1], %f22;
$L__BB0_8:
bar.sync 0;
shr.u32 %r11, %r61, 1;
setp.gt.u32 %p17, %r61, 3;
mov.u32 %r61, %r11;
@%p17 bra $L__BB0_6;
$L__BB0_9:
setp.ne.s32 %p18, %r6, 0;
add.s32 %r52, %r7, 1;
mul.wide.u32 %rd15, %r52, 4;
add.s64 %rd3, %rd10, %rd15;
mov.f32 %f69, 0fFF800000;
@%p18 bra $L__BB0_12;
ld.shared.f32 %f69, [%rd1];
setp.lt.u32 %p19, %r5, 2;
@%p19 bra $L__BB0_12;
ld.shared.f32 %f24, [%rd3];
setp.gt.f32 %p20, %f69, %f24;
setp.nan.f32 %p21, %f69, %f69;
or.pred %p22, %p21, %p20;
selp.f32 %f69, %f69, %f24, %p22;
$L__BB0_12:
bar.sync 0;
mul.wide.s32 %rd17, %r4, 4;
add.s64 %rd4, %rd10, %rd17;
setp.eq.s32 %p23, %r6, 0;
@%p23 bra $L__BB0_13;
bra.uni $L__BB0_14;
$L__BB0_13:
st.shared.f32 [%rd4], %f69;
$L__BB0_14:
setp.lt.s32 %p24, %r3, 27454;
bar.sync 0;
ld.shared.f32 %f25, [%rd4];
bar.sync 0;
sub.f32 %f26, %f67, %f25;
mov.f32 %f27, 0f3F000000;
mov.f32 %f28, 0f3BBB989D;
fma.rn.f32 %f29, %f26, %f28, %f27;
cvt.sat.f32.f32 %f30, %f29;
mov.f32 %f31, 0f4B400001;
mov.f32 %f32, 0f437C0000;
fma.rm.f32 %f33, %f30, %f32, %f31;
add.f32 %f34, %f33, 0fCB40007F;
neg.f32 %f35, %f34;
mov.f32 %f36, 0f3FB8AA3B;
fma.rn.f32 %f37, %f26, %f36, %f35;
mov.f32 %f38, 0f32A57060;
fma.rn.f32 %f39, %f26, %f38, %f37;
mov.b32 %r53, %f33;
shl.b32 %r54, %r53, 23;
mov.b32 %f40, %r54;
ex2.approx.ftz.f32 %f41, %f39;
mul.f32 %f8, %f41, %f40;
sub.f32 %f42, %f68, %f25;
fma.rn.f32 %f43, %f42, %f28, %f27;
cvt.sat.f32.f32 %f44, %f43;
fma.rm.f32 %f45, %f44, %f32, %f31;
add.f32 %f46, %f45, 0fCB40007F;
neg.f32 %f47, %f46;
fma.rn.f32 %f48, %f42, %f36, %f47;
fma.rn.f32 %f49, %f42, %f38, %f48;
mov.b32 %r55, %f45;
shl.b32 %r56, %r55, 23;
mov.b32 %f50, %r56;
ex2.approx.ftz.f32 %f51, %f49;
mul.f32 %f9, %f51, %f50;
add.f32 %f52, %f8, 0f00000000;
add.f32 %f53, %f52, %f9;
selp.f32 %f54, %f53, 0f00000000, %p24;
st.shared.f32 [%rd1], %f54;
bar.sync 0;
@%p8 bra $L__BB0_16;
ld.shared.f32 %f55, [%rd2];
ld.shared.f32 %f56, [%rd1];
add.f32 %f57, %f55, %f56;
st.shared.f32 [%rd1], %f57;
$L__BB0_16:
bar.sync 0;
@%p12 bra $L__BB0_20;
$L__BB0_17:
setp.ge.u32 %p27, %r6, %r62;
@%p27 bra $L__BB0_19;
add.s32 %r57, %r62, %r7;
mul.wide.s32 %rd19, %r57, 4;
add.s64 %rd21, %rd10, %rd19;
ld.shared.f32 %f58, [%rd1];
ld.shared.f32 %f59, [%rd21];
add.f32 %f60, %f59, %f58;
st.shared.f32 [%rd1], %f60;
$L__BB0_19:
bar.sync 0;
shr.u32 %r13, %r62, 1;
setp.gt.u32 %p28, %r62, 3;
mov.u32 %r62, %r13;
@%p28 bra $L__BB0_17;
$L__BB0_20:
mov.f32 %f70, 0f00000000;
@%p18 bra $L__BB0_23;
ld.shared.f32 %f62, [%rd1];
add.f32 %f70, %f62, 0f00000000;
setp.lt.u32 %p30, %r5, 2;
@%p30 bra $L__BB0_23;
ld.shared.f32 %f63, [%rd3];
add.f32 %f70, %f70, %f63;
$L__BB0_23:
bar.sync 0;
@%p18 bra $L__BB0_25;
st.shared.f32 [%rd4], %f70;
$L__BB0_25:
bar.sync 0;
ld.shared.f32 %f13, [%rd4];
bar.sync 0;
@%p2 bra $L__BB0_27;
rcp.rn.f32 %f64, %f13;
mul.f32 %f65, %f64, %f8;
mov.b32 %r58, %f65;
mul.f32 %f66, %f64, %f9;
mov.b32 %r59, %f66;
shl.b32 %r60, %r3, 1;
mul.wide.s32 %rd23, %r60, 4;
add.s64 %rd22, %rd6, %rd23;
// begin inline asm
st.global.cs.v2.s32 [%rd22], {%r58,%r59};
// end inline asm
$L__BB0_27:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -18,303 +18,244 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_0[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_1[32]
)
{
- .reg .pred %p<41>;
- .reg .f32 %f<81>;
- .reg .b32 %r<79>;
- .reg .b64 %rd<27>;
-
-
- ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_1];
- ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_0];
- mov.u32 %r1, %tid.x;
- setp.lt.s32 %p2, %r1, 1;
- mov.f32 %f76, 0fFF800000;
- mov.f32 %f77, 0fFF800000;
- @%p2 bra $L__BB0_1;
- bra.uni $L__BB0_3;
-
-$L__BB0_1:
- mov.u32 %r40, %tid.y;
- mov.u32 %r41, %ctaid.x;
- mov.u32 %r42, %ntid.y;
- mad.lo.s32 %r2, %r42, %r41, %r40;
- setp.gt.s32 %p3, %r2, 27453;
- @%p3 bra $L__BB0_3;
-
- add.s32 %r45, %r2, %r1;
- shl.b32 %r46, %r45, 1;
- mul.wide.s32 %rd9, %r46, 4;
- add.s64 %rd8, %rd6, %rd9;
-
- ld.global.cs.v2.u32 {%r43,%r44}, [%rd8];
-
- mov.b32 %f76, %r43;
- mov.b32 %f77, %r44;
-
-$L__BB0_3:
- setp.gt.f32 %p4, %f76, %f77;
- setp.nan.f32 %p5, %f76, %f76;
- or.pred %p6, %p5, %p4;
- selp.f32 %f23, %f76, %f77, %p6;
- mov.u32 %r47, %tid.z;
- mov.u32 %r3, %ntid.y;
- mov.u32 %r4, %tid.y;
- mad.lo.s32 %r5, %r3, %r47, %r4;
- mov.u32 %r6, %ntid.x;
- mad.lo.s32 %r7, %r5, %r6, %r1;
- mul.wide.u32 %rd10, %r7, 4;
- mov.u64 %rd11, _ZN11kernelscope6kernelE;
- add.s64 %rd1, %rd11, %rd10;
- st.shared.f32 [%rd1], %f23;
- bar.sync 0;
- clz.b32 %r48, %r6;
- mov.u32 %r49, 31;
- sub.s32 %r50, %r49, %r48;
- mov.u32 %r51, 1;
- shl.b32 %r8, %r51, %r50;
- setp.lt.u32 %p7, %r1, %r8;
- add.s32 %r52, %r8, %r1;
- setp.lt.u32 %p8, %r52, %r6;
- and.pred %p1, %p7, %p8;
- add.s32 %r53, %r7, %r8;
- mul.wide.s32 %rd12, %r53, 4;
- add.s64 %rd2, %rd11, %rd12;
- not.pred %p9, %p1;
- @%p9 bra $L__BB0_5;
-
- ld.shared.f32 %f24, [%rd2];
- ld.shared.f32 %f25, [%rd1];
- setp.nan.f32 %p10, %f25, %f25;
- setp.gt.f32 %p11, %f25, %f24;
- or.pred %p12, %p10, %p11;
- selp.f32 %f26, %f25, %f24, %p12;
- st.shared.f32 [%rd1], %f26;
-
-$L__BB0_5:
- bar.sync 0;
- shr.u32 %r54, %r8, 31;
- add.s32 %r55, %r8, %r54;
- shr.s32 %r78, %r55, 1;
- setp.lt.s32 %p13, %r8, 4;
- @%p13 bra $L__BB0_10;
-
- mov.u32 %r77, %r78;
-
-$L__BB0_7:
- setp.ge.u32 %p14, %r1, %r77;
- @%p14 bra $L__BB0_9;
-
- add.s32 %r56, %r77, %r7;
- mul.wide.s32 %rd13, %r56, 4;
- add.s64 %rd15, %rd11, %rd13;
- ld.shared.f32 %f27, [%rd1];
- setp.nan.f32 %p15, %f27, %f27;
- ld.shared.f32 %f28, [%rd15];
- setp.gt.f32 %p16, %f27, %f28;
- or.pred %p17, %p15, %p16;
- selp.f32 %f29, %f27, %f28, %p17;
- st.shared.f32 [%rd1], %f29;
+ .reg .pred %p<33>;
+ .reg .f32 %f<71>;
+ .reg .b32 %r<63>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_1];
+ ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_0];
+ mov.u32 %r38, %ctaid.x;
+ mov.u32 %r1, %ntid.y;
+ mov.u32 %r2, %tid.y;
+ mad.lo.s32 %r3, %r1, %r38, %r2;
+ setp.gt.s32 %p2, %r3, 27453;
+ mov.f32 %f67, 0fFF800000;
+ mov.f32 %f68, %f67;
+ @%p2 bra $L__BB0_2;
+
+ shl.b32 %r41, %r3, 1;
+ mul.wide.s32 %rd8, %r41, 4;
+ add.s64 %rd7, %rd5, %rd8;
+
+ ld.global.cs.v2.u32 {%r39,%r40}, [%rd7];
+
+ mov.b32 %f67, %r39;
+ mov.b32 %f68, %r40;
+
+$L__BB0_2:
+ setp.gt.f32 %p3, %f67, %f68;
+ setp.nan.f32 %p4, %f67, %f67;
+ or.pred %p5, %p4, %p3;
+ selp.f32 %f16, %f67, %f68, %p5;
+ mov.u32 %r42, %tid.z;
+ mad.lo.s32 %r4, %r1, %r42, %r2;
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %tid.x;
+ mad.lo.s32 %r7, %r4, %r5, %r6;
+ mul.wide.u32 %rd9, %r7, 4;
+ mov.u64 %rd10, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd10, %rd9;
+ st.shared.f32 [%rd1], %f16;
+ bar.sync 0;
+ clz.b32 %r43, %r5;
+ mov.u32 %r44, 31;
+ sub.s32 %r45, %r44, %r43;
+ mov.u32 %r46, 1;
+ shl.b32 %r8, %r46, %r45;
+ setp.lt.u32 %p6, %r6, %r8;
+ add.s32 %r47, %r8, %r6;
+ setp.lt.u32 %p7, %r47, %r5;
+ and.pred %p1, %p6, %p7;
+ add.s32 %r48, %r7, %r8;
+ mul.wide.s32 %rd11, %r48, 4;
+ add.s64 %rd2, %rd10, %rd11;
+ not.pred %p8, %p1;
+ @%p8 bra $L__BB0_4;
+
+ ld.shared.f32 %f17, [%rd2];
+ ld.shared.f32 %f18, [%rd1];
+ setp.nan.f32 %p9, %f18, %f18;
+ setp.gt.f32 %p10, %f18, %f17;
+ or.pred %p11, %p9, %p10;
+ selp.f32 %f19, %f18, %f17, %p11;
+ st.shared.f32 [%rd1], %f19;
+
+$L__BB0_4:
+ bar.sync 0;
+ shr.u32 %r49, %r8, 31;
+ add.s32 %r50, %r8, %r49;
+ shr.s32 %r62, %r50, 1;
+ setp.lt.s32 %p12, %r8, 4;
+ @%p12 bra $L__BB0_9;
+
+ mov.u32 %r61, %r62;
+
+$L__BB0_6:
+ setp.ge.u32 %p13, %r6, %r61;
+ @%p13 bra $L__BB0_8;
+
+ add.s32 %r51, %r61, %r7;
+ mul.wide.s32 %rd12, %r51, 4;
+ add.s64 %rd14, %rd10, %rd12;
+ ld.shared.f32 %f20, [%rd1];
+ setp.nan.f32 %p14, %f20, %f20;
+ ld.shared.f32 %f21, [%rd14];
+ setp.gt.f32 %p15, %f20, %f21;
+ or.pred %p16, %p14, %p15;
+ selp.f32 %f22, %f20, %f21, %p16;
+ st.shared.f32 [%rd1], %f22;
+
+$L__BB0_8:
+ bar.sync 0;
+ shr.u32 %r11, %r61, 1;
+ setp.gt.u32 %p17, %r61, 3;
+ mov.u32 %r61, %r11;
+ @%p17 bra $L__BB0_6;
$L__BB0_9:
- bar.sync 0;
- shr.u32 %r11, %r77, 1;
- setp.gt.u32 %p18, %r77, 3;
- mov.u32 %r77, %r11;
- @%p18 bra $L__BB0_7;
-
-$L__BB0_10:
- setp.ne.s32 %p19, %r1, 0;
- add.s32 %r57, %r7, 1;
- mul.wide.u32 %rd16, %r57, 4;
- add.s64 %rd3, %rd11, %rd16;
- mov.f32 %f78, 0fFF800000;
- @%p19 bra $L__BB0_13;
-
- ld.shared.f32 %f78, [%rd1];
- setp.lt.u32 %p20, %r6, 2;
- @%p20 bra $L__BB0_13;
-
- ld.shared.f32 %f31, [%rd3];
- setp.gt.f32 %p21, %f78, %f31;
- setp.nan.f32 %p22, %f78, %f78;
- or.pred %p23, %p22, %p21;
- selp.f32 %f78, %f78, %f31, %p23;
+ setp.ne.s32 %p18, %r6, 0;
+ add.s32 %r52, %r7, 1;
+ mul.wide.u32 %rd15, %r52, 4;
+ add.s64 %rd3, %rd10, %rd15;
+ mov.f32 %f69, 0fFF800000;
+ @%p18 bra $L__BB0_12;
+
+ ld.shared.f32 %f69, [%rd1];
+ setp.lt.u32 %p19, %r5, 2;
+ @%p19 bra $L__BB0_12;
+
+ ld.shared.f32 %f24, [%rd3];
+ setp.gt.f32 %p20, %f69, %f24;
+ setp.nan.f32 %p21, %f69, %f69;
+ or.pred %p22, %p21, %p20;
+ selp.f32 %f69, %f69, %f24, %p22;
+
+$L__BB0_12:
+ bar.sync 0;
+ mul.wide.s32 %rd17, %r4, 4;
+ add.s64 %rd4, %rd10, %rd17;
+ setp.eq.s32 %p23, %r6, 0;
+ @%p23 bra $L__BB0_13;
+ bra.uni $L__BB0_14;
$L__BB0_13:
- bar.sync 0;
- mul.wide.s32 %rd18, %r5, 4;
- add.s64 %rd4, %rd11, %rd18;
- setp.eq.s32 %p24, %r1, 0;
- @%p24 bra $L__BB0_14;
- bra.uni $L__BB0_15;
+ st.shared.f32 [%rd4], %f69;
$L__BB0_14:
- st.shared.f32 [%rd4], %f78;
-
-$L__BB0_15:
- bar.sync 0;
- ld.shared.f32 %f32, [%rd4];
- bar.sync 0;
- sub.f32 %f33, %f76, %f32;
- mov.f32 %f34, 0f3F000000;
- mov.f32 %f35, 0f3BBB989D;
- fma.rn.f32 %f36, %f33, %f35, %f34;
- cvt.sat.f32.f32 %f37, %f36;
- mov.f32 %f38, 0f4B400001;
- mov.f32 %f39, 0f437C0000;
- fma.rm.f32 %f40, %f37, %f39, %f38;
- add.f32 %f41, %f40, 0fCB40007F;
- neg.f32 %f42, %f41;
- mov.f32 %f43, 0f3FB8AA3B;
- fma.rn.f32 %f44, %f33, %f43, %f42;
- mov.f32 %f45, 0f32A57060;
- fma.rn.f32 %f46, %f33, %f45, %f44;
- mov.b32 %r58, %f40;
- shl.b32 %r59, %r58, 23;
- mov.b32 %f47, %r59;
- ex2.approx.ftz.f32 %f48, %f46;
- mul.f32 %f8, %f48, %f47;
- sub.f32 %f49, %f77, %f32;
- fma.rn.f32 %f50, %f49, %f35, %f34;
- cvt.sat.f32.f32 %f51, %f50;
- fma.rm.f32 %f52, %f51, %f39, %f38;
- add.f32 %f53, %f52, 0fCB40007F;
- neg.f32 %f54, %f53;
- fma.rn.f32 %f55, %f49, %f43, %f54;
- fma.rn.f32 %f56, %f49, %f45, %f55;
- mov.b32 %r60, %f52;
- shl.b32 %r61, %r60, 23;
- mov.b32 %f57, %r61;
- ex2.approx.ftz.f32 %f58, %f56;
- mul.f32 %f9, %f58, %f57;
- @%p2 bra $L__BB0_16;
- bra.uni $L__BB0_17;
+ setp.lt.s32 %p24, %r3, 27454;
+ bar.sync 0;
+ ld.shared.f32 %f25, [%rd4];
+ bar.sync 0;
+ sub.f32 %f26, %f67, %f25;
+ mov.f32 %f27, 0f3F000000;
+ mov.f32 %f28, 0f3BBB989D;
+ fma.rn.f32 %f29, %f26, %f28, %f27;
+ cvt.sat.f32.f32 %f30, %f29;
+ mov.f32 %f31, 0f4B400001;
+ mov.f32 %f32, 0f437C0000;
+ fma.rm.f32 %f33, %f30, %f32, %f31;
+ add.f32 %f34, %f33, 0fCB40007F;
+ neg.f32 %f35, %f34;
+ mov.f32 %f36, 0f3FB8AA3B;
+ fma.rn.f32 %f37, %f26, %f36, %f35;
+ mov.f32 %f38, 0f32A57060;
+ fma.rn.f32 %f39, %f26, %f38, %f37;
+ mov.b32 %r53, %f33;
+ shl.b32 %r54, %r53, 23;
+ mov.b32 %f40, %r54;
+ ex2.approx.ftz.f32 %f41, %f39;
+ mul.f32 %f8, %f41, %f40;
+ sub.f32 %f42, %f68, %f25;
+ fma.rn.f32 %f43, %f42, %f28, %f27;
+ cvt.sat.f32.f32 %f44, %f43;
+ fma.rm.f32 %f45, %f44, %f32, %f31;
+ add.f32 %f46, %f45, 0fCB40007F;
+ neg.f32 %f47, %f46;
+ fma.rn.f32 %f48, %f42, %f36, %f47;
+ fma.rn.f32 %f49, %f42, %f38, %f48;
+ mov.b32 %r55, %f45;
+ shl.b32 %r56, %r55, 23;
+ mov.b32 %f50, %r56;
+ ex2.approx.ftz.f32 %f51, %f49;
+ mul.f32 %f9, %f51, %f50;
+ add.f32 %f52, %f8, 0f00000000;
+ add.f32 %f53, %f52, %f9;
+ selp.f32 %f54, %f53, 0f00000000, %p24;
+ st.shared.f32 [%rd1], %f54;
+ bar.sync 0;
+ @%p8 bra $L__BB0_16;
+
+ ld.shared.f32 %f55, [%rd2];
+ ld.shared.f32 %f56, [%rd1];
+ add.f32 %f57, %f55, %f56;
+ st.shared.f32 [%rd1], %f57;
$L__BB0_16:
- mov.u32 %r62, %ctaid.x;
- mad.lo.s32 %r63, %r3, %r62, %r4;
- setp.lt.s32 %p26, %r63, 27454;
- @%p26 bra $L__BB0_18;
- bra.uni $L__BB0_17;
-
-$L__BB0_18:
- add.f32 %f61, %f8, 0f00000000;
- add.f32 %f79, %f61, %f9;
- bra.uni $L__BB0_19;
+ bar.sync 0;
+ @%p12 bra $L__BB0_20;
$L__BB0_17:
- mov.u32 %r64, %ctaid.x;
- mad.lo.s32 %r65, %r3, %r64, %r4;
- setp.lt.s32 %p28, %r65, 27454;
- and.pred %p29, %p2, %p28;
- add.f32 %f59, %f8, 0f00000000;
- add.f32 %f60, %f59, %f9;
- selp.f32 %f79, %f60, 0f00000000, %p29;
+ setp.ge.u32 %p27, %r6, %r62;
+ @%p27 bra $L__BB0_19;
+
+ add.s32 %r57, %r62, %r7;
+ mul.wide.s32 %rd19, %r57, 4;
+ add.s64 %rd21, %rd10, %rd19;
+ ld.shared.f32 %f58, [%rd1];
+ ld.shared.f32 %f59, [%rd21];
+ add.f32 %f60, %f59, %f58;
+ st.shared.f32 [%rd1], %f60;
$L__BB0_19:
- st.shared.f32 [%rd1], %f79;
- bar.sync 0;
- @%p9 bra $L__BB0_21;
-
- ld.shared.f32 %f62, [%rd2];
- ld.shared.f32 %f63, [%rd1];
- add.f32 %f64, %f62, %f63;
- st.shared.f32 [%rd1], %f64;
-
-$L__BB0_21:
- bar.sync 0;
- @%p13 bra $L__BB0_25;
-
-$L__BB0_22:
- setp.ge.u32 %p32, %r1, %r78;
- @%p32 bra $L__BB0_24;
-
- add.s32 %r66, %r78, %r7;
- mul.wide.s32 %rd20, %r66, 4;
- add.s64 %rd22, %rd11, %rd20;
- ld.shared.f32 %f65, [%rd1];
- ld.shared.f32 %f66, [%rd22];
- add.f32 %f67, %f66, %f65;
- st.shared.f32 [%rd1], %f67;
-
-$L__BB0_24:
- bar.sync 0;
- shr.u32 %r13, %r78, 1;
- setp.gt.u32 %p33, %r78, 3;
- mov.u32 %r78, %r13;
- @%p33 bra $L__BB0_22;
+ bar.sync 0;
+ shr.u32 %r13, %r62, 1;
+ setp.gt.u32 %p28, %r62, 3;
+ mov.u32 %r62, %r13;
+ @%p28 bra $L__BB0_17;
+
+$L__BB0_20:
+ mov.f32 %f70, 0f00000000;
+ @%p18 bra $L__BB0_23;
+
+ ld.shared.f32 %f62, [%rd1];
+ add.f32 %f70, %f62, 0f00000000;
+ setp.lt.u32 %p30, %r5, 2;
+ @%p30 bra $L__BB0_23;
+
+ ld.shared.f32 %f63, [%rd3];
+ add.f32 %f70, %f70, %f63;
+
+$L__BB0_23:
+ bar.sync 0;
+ @%p18 bra $L__BB0_25;
+
+ st.shared.f32 [%rd4], %f70;
$L__BB0_25:
- mov.f32 %f80, 0f00000000;
- @%p19 bra $L__BB0_28;
-
- ld.shared.f32 %f69, [%rd1];
- add.f32 %f80, %f69, 0f00000000;
- setp.lt.u32 %p35, %r6, 2;
- @%p35 bra $L__BB0_28;
-
- ld.shared.f32 %f70, [%rd3];
- add.f32 %f80, %f80, %f70;
-
-$L__BB0_28:
- bar.sync 0;
- @%p19 bra $L__BB0_30;
-
- st.shared.f32 [%rd4], %f80;
-
-$L__BB0_30:
- setp.gt.s32 %p37, %r1, 0;
- bar.sync 0;
- ld.shared.f32 %f71, [%rd4];
- bar.sync 0;
- rcp.rn.f32 %f16, %f71;
- @%p37 bra $L__BB0_32;
-
- mov.u32 %r67, %ctaid.x;
- mad.lo.s32 %r14, %r3, %r67, %r4;
- setp.lt.s32 %p38, %r14, 27454;
- @%p38 bra $L__BB0_35;
- bra.uni $L__BB0_32;
-
-$L__BB0_35:
- mul.f32 %f72, %f16, %f8;
- mov.b32 %r73, %f72;
- mul.f32 %f73, %f16, %f9;
- mov.b32 %r74, %f73;
- add.s32 %r75, %r14, %r1;
- shl.b32 %r76, %r75, 1;
- mul.wide.s32 %rd26, %r76, 4;
- add.s64 %rd25, %rd7, %rd26;
-
- st.global.cs.v2.s32 [%rd25], {%r73,%r74};
-
- bra.uni $L__BB0_36;
-
-$L__BB0_32:
- mul.f32 %f17, %f16, %f8;
- mul.f32 %f18, %f16, %f9;
- @%p37 bra $L__BB0_36;
-
- mov.u32 %r68, %ctaid.x;
- mad.lo.s32 %r15, %r3, %r68, %r4;
- setp.gt.s32 %p40, %r15, 27453;
- @%p40 bra $L__BB0_36;
-
- add.s32 %r71, %r15, %r1;
- shl.b32 %r72, %r71, 1;
- mul.wide.s32 %rd24, %r72, 4;
- add.s64 %rd23, %rd7, %rd24;
- mov.b32 %r69, %f17;
- mov.b32 %r70, %f18;
-
- st.global.cs.v2.s32 [%rd23], {%r69,%r70};
-
-
-$L__BB0_36:
+ bar.sync 0;
+ ld.shared.f32 %f13, [%rd4];
+ bar.sync 0;
+ @%p2 bra $L__BB0_27;
+
+ rcp.rn.f32 %f64, %f13;
+ mul.f32 %f65, %f64, %f8;
+ mov.b32 %r58, %f65;
+ mul.f32 %f66, %f64, %f9;
+ mov.b32 %r59, %f66;
+ shl.b32 %r60, %r3, 1;
+ mul.wide.s32 %rd23, %r60, 4;
+ add.s64 %rd22, %rd6, %rd23;
+
+ st.global.cs.v2.s32 [%rd22], {%r58,%r59};
+
+
+$L__BB0_27:
ret;
}
Kernel 79
CUDA
PTX
53997da5d
Diff
03a1b695e
-8
+8 index type: int
registers: 0
gmem: 0
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 3, 3> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 2, 2> T13;
T13.set(float(NEG_INFINITY));
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T15[0] = fmax(
T15[0],
T13[i0]);
}
Array<float, 1, 1> T5;
T5[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T5[0], T15[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T8 = T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
Array<float, 1, 1> T7;
T7[0]
= T13[i1]
- T6[0];
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T16[0]
= T16[0]
+ T8[i2];
}
}
}
Array<float, 1, 1> T9;
T9[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T9[0], T16[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 3, 3> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 2, 2> T13;
T13.set(float(NEG_INFINITY));
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T15[0] = fmax(
T15[0],
T13[i0]);
}
Array<float, 1, 1> T5;
T5[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T5[0], T15[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T8 = T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
Array<float, 1, 1> T7;
T7[0]
= T13[i1]
- T6[0];
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T16[0]
= T16[0]
+ T8[i2];
}
}
}
Array<float, 1, 1> T9;
T9[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T9[0], T16[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,12 +1,12 @@
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 3, 3> T12) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 2, 2> T13;
T13.set(float(NEG_INFINITY));
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
- loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
+ loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T13[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T15;
T15[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
@@ -30,21 +30,21 @@
T8[i1]
= expf(T7[0]);
}
Array<float, 1, 1> T16;
T16[0] = 0.000000000e+00f;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
T16[0]
= T16[0]
+ T8[i2];
}
} else {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T16[0]
= T16[0]
+ T8[i2];
}
}
@@ -55,27 +55,27 @@
Array<float, 1, 1> T10;
broadcast::blockBroadcast<true, false, false, true>(T10[0], T9[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T10[0]);
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
} else {
Array<float, 2, 2> T14;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T14[i3]
= T8[i3]
* T11[0];
}
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T12[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T14[0]);
}
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_191105arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_1911010nvfuser_84ENS_6TensorIfLi3ELi3EEES1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_1911010nvfuser_84ENS_6TensorIfLi3ELi3EEES1__param_0[32],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_1911010nvfuser_84ENS_6TensorIfLi3ELi3EEES1__param_1[32]
)
{
.reg .pred %p<41>;
.reg .f32 %f<81>;
.reg .b32 %r<79>;
.reg .b64 %rd<27>;
ld.param.u64 %rd7, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_1911010nvfuser_84ENS_6TensorIfLi3ELi3EEES1__param_1];
ld.param.u64 %rd6, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_1911010nvfuser_84ENS_6TensorIfLi3ELi3EEES1__param_0];
mov.u32 %r1, %tid.x;
setp.lt.s32 %p2, %r1, 1;
mov.f32 %f76, 0fFF800000;
mov.f32 %f77, 0fFF800000;
@%p2 bra $L__BB0_1;
bra.uni $L__BB0_3;
$L__BB0_1:
mov.u32 %r40, %tid.y;
mov.u32 %r41, %ctaid.x;
mov.u32 %r42, %ntid.y;
mad.lo.s32 %r2, %r42, %r41, %r40;
setp.gt.s32 %p3, %r2, 27453;
@%p3 bra $L__BB0_3;
add.s32 %r45, %r2, %r1;
shl.b32 %r46, %r45, 1;
mul.wide.s32 %rd9, %r46, 4;
add.s64 %rd8, %rd6, %rd9;
// begin inline asm
ld.global.cs.v2.u32 {%r43,%r44}, [%rd8];
// end inline asm
mov.b32 %f76, %r43;
mov.b32 %f77, %r44;
$L__BB0_3:
setp.gt.f32 %p4, %f76, %f77;
setp.nan.f32 %p5, %f76, %f76;
or.pred %p6, %p5, %p4;
selp.f32 %f23, %f76, %f77, %p6;
mov.u32 %r47, %tid.z;
mov.u32 %r3, %ntid.y;
mov.u32 %r4, %tid.y;
mad.lo.s32 %r5, %r3, %r47, %r4;
mov.u32 %r6, %ntid.x;
mad.lo.s32 %r7, %r5, %r6, %r1;
mul.wide.u32 %rd10, %r7, 4;
mov.u64 %rd11, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_4bb7ced3_191105arrayE;
add.s64 %rd1, %rd11, %rd10;
st.shared.f32 [%rd1], %f23;
bar.sync 0;
clz.b32 %r48, %r6;
mov.u32 %r49, 31;
sub.s32 %r50, %r49, %r48;
mov.u32 %r51, 1;
shl.b32 %r8, %r51, %r50;
setp.lt.u32 %p7, %r1, %r8;
add.s32 %r52, %r8, %r1;
setp.lt.u32 %p8, %r52, %r6;
and.pred %p1, %p7, %p8;
add.s32 %r53, %r7, %r8;
mul.wide.s32 %rd12, %r53, 4;
add.s64 %rd2, %rd11, %rd12;
not.pred %p9, %p1;
@%p9 bra $L__BB0_5;
ld.shared.f32 %f24, [%rd2];
ld.shared.f32 %f25, [%rd1];
setp.nan.f32 %p10, %f25, %f25;
setp.gt.f32 %p11, %f25, %f24;
or.pred %p12, %p10, %p11;
selp.f32 %f26, %f25, %f24, %p12;
st.shared.f32 [%rd1], %f26;
$L__BB0_5:
bar.sync 0;
shr.u32 %r54, %r8, 31;
add.s32 %r55, %r8, %r54;
shr.s32 %r78, %r55, 1;
setp.lt.s32 %p13, %r8, 4;
@%p13 bra $L__BB0_10;
mov.u32 %r77, %r78;
$L__BB0_7:
setp.ge.u32 %p14, %r1, %r77;
@%p14 bra $L__BB0_9;
add.s32 %r56, %r77, %r7;
mul.wide.s32 %rd13, %r56, 4;
add.s64 %rd15, %rd11, %rd13;
ld.shared.f32 %f27, [%rd1];
setp.nan.f32 %p15, %f27, %f27;
ld.shared.f32 %f28, [%rd15];
setp.gt.f32 %p16, %f27, %f28;
or.pred %p17, %p15, %p16;
selp.f32 %f29, %f27, %f28, %p17;
st.shared.f32 [%rd1], %f29;
$L__BB0_9:
bar.sync 0;
shr.u32 %r11, %r77, 1;
setp.gt.u32 %p18, %r77, 3;
mov.u32 %r77, %r11;
@%p18 bra $L__BB0_7;
$L__BB0_10:
setp.ne.s32 %p19, %r1, 0;
add.s32 %r57, %r7, 1;
mul.wide.u32 %rd16, %r57, 4;
add.s64 %rd3, %rd11, %rd16;
mov.f32 %f78, 0fFF800000;
@%p19 bra $L__BB0_13;
ld.shared.f32 %f78, [%rd1];
setp.lt.u32 %p20, %r6, 2;
@%p20 bra $L__BB0_13;
ld.shared.f32 %f31, [%rd3];
setp.gt.f32 %p21, %f78, %f31;
setp.nan.f32 %p22, %f78, %f78;
or.pred %p23, %p22, %p21;
selp.f32 %f78, %f78, %f31, %p23;
$L__BB0_13:
bar.sync 0;
mul.wide.s32 %rd18, %r5, 4;
add.s64 %rd4, %rd11, %rd18;
setp.eq.s32 %p24, %r1, 0;
@%p24 bra $L__BB0_14;
bra.uni $L__BB0_15;
$L__BB0_14:
st.shared.f32 [%rd4], %f78;
$L__BB0_15:
bar.sync 0;
ld.shared.f32 %f32, [%rd4];
bar.sync 0;
sub.f32 %f33, %f76, %f32;
mov.f32 %f34, 0f3F000000;
mov.f32 %f35, 0f3BBB989D;
fma.rn.f32 %f36, %f33, %f35, %f34;
cvt.sat.f32.f32 %f37, %f36;
mov.f32 %f38, 0f4B400001;
mov.f32 %f39, 0f437C0000;
fma.rm.f32 %f40, %f37, %f39, %f38;
add.f32 %f41, %f40, 0fCB40007F;
neg.f32 %f42, %f41;
mov.f32 %f43, 0f3FB8AA3B;
fma.rn.f32 %f44, %f33, %f43, %f42;
mov.f32 %f45, 0f32A57060;
fma.rn.f32 %f46, %f33, %f45, %f44;
mov.b32 %r58, %f40;
shl.b32 %r59, %r58, 23;
mov.b32 %f47, %r59;
ex2.approx.ftz.f32 %f48, %f46;
mul.f32 %f8, %f48, %f47;
sub.f32 %f49, %f77, %f32;
fma.rn.f32 %f50, %f49, %f35, %f34;
cvt.sat.f32.f32 %f51, %f50;
fma.rm.f32 %f52, %f51, %f39, %f38;
add.f32 %f53, %f52, 0fCB40007F;
neg.f32 %f54, %f53;
fma.rn.f32 %f55, %f49, %f43, %f54;
fma.rn.f32 %f56, %f49, %f45, %f55;
mov.b32 %r60, %f52;
shl.b32 %r61, %r60, 23;
mov.b32 %f57, %r61;
ex2.approx.ftz.f32 %f58, %f56;
mul.f32 %f9, %f58, %f57;
@%p2 bra $L__BB0_16;
bra.uni $L__BB0_17;
$L__BB0_16:
mov.u32 %r62, %ctaid.x;
mad.lo.s32 %r63, %r3, %r62, %r4;
setp.lt.s32 %p26, %r63, 27454;
@%p26 bra $L__BB0_18;
bra.uni $L__BB0_17;
$L__BB0_18:
add.f32 %f61, %f8, 0f00000000;
add.f32 %f79, %f61, %f9;
bra.uni $L__BB0_19;
$L__BB0_17:
mov.u32 %r64, %ctaid.x;
mad.lo.s32 %r65, %r3, %r64, %r4;
setp.lt.s32 %p28, %r65, 27454;
and.pred %p29, %p2, %p28;
add.f32 %f59, %f8, 0f00000000;
add.f32 %f60, %f59, %f9;
selp.f32 %f79, %f60, 0f00000000, %p29;
$L__BB0_19:
st.shared.f32 [%rd1], %f79;
bar.sync 0;
@%p9 bra $L__BB0_21;
ld.shared.f32 %f62, [%rd2];
ld.shared.f32 %f63, [%rd1];
add.f32 %f64, %f62, %f63;
st.shared.f32 [%rd1], %f64;
$L__BB0_21:
bar.sync 0;
@%p13 bra $L__BB0_25;
$L__BB0_22:
setp.ge.u32 %p32, %r1, %r78;
@%p32 bra $L__BB0_24;
add.s32 %r66, %r78, %r7;
mul.wide.s32 %rd20, %r66, 4;
add.s64 %rd22, %rd11, %rd20;
ld.shared.f32 %f65, [%rd1];
ld.shared.f32 %f66, [%rd22];
add.f32 %f67, %f66, %f65;
st.shared.f32 [%rd1], %f67;
$L__BB0_24:
bar.sync 0;
shr.u32 %r13, %r78, 1;
setp.gt.u32 %p33, %r78, 3;
mov.u32 %r78, %r13;
@%p33 bra $L__BB0_22;
$L__BB0_25:
mov.f32 %f80, 0f00000000;
@%p19 bra $L__BB0_28;
ld.shared.f32 %f69, [%rd1];
add.f32 %f80, %f69, 0f00000000;
setp.lt.u32 %p35, %r6, 2;
@%p35 bra $L__BB0_28;
ld.shared.f32 %f70, [%rd3];
add.f32 %f80, %f80, %f70;
$L__BB0_28:
bar.sync 0;
@%p19 bra $L__BB0_30;
st.shared.f32 [%rd4], %f80;
$L__BB0_30:
setp.gt.s32 %p37, %r1, 0;
bar.sync 0;
ld.shared.f32 %f71, [%rd4];
bar.sync 0;
rcp.rn.f32 %f16, %f71;
@%p37 bra $L__BB0_32;
mov.u32 %r67, %ctaid.x;
mad.lo.s32 %r14, %r3, %r67, %r4;
setp.lt.s32 %p38, %r14, 27454;
@%p38 bra $L__BB0_35;
bra.uni $L__BB0_32;
$L__BB0_35:
mul.f32 %f72, %f16, %f8;
mov.b32 %r73, %f72;
mul.f32 %f73, %f16, %f9;
mov.b32 %r74, %f73;
add.s32 %r75, %r14, %r1;
shl.b32 %r76, %r75, 1;
mul.wide.s32 %rd26, %r76, 4;
add.s64 %rd25, %rd7, %rd26;
// begin inline asm
st.global.cs.v2.s32 [%rd25], {%r73,%r74};
// end inline asm
bra.uni $L__BB0_36;
$L__BB0_32:
mul.f32 %f17, %f16, %f8;
mul.f32 %f18, %f16, %f9;
@%p37 bra $L__BB0_36;
mov.u32 %r68, %ctaid.x;
mad.lo.s32 %r15, %r3, %r68, %r4;
setp.gt.s32 %p40, %r15, 27453;
@%p40 bra $L__BB0_36;
add.s32 %r71, %r15, %r1;
shl.b32 %r72, %r71, 1;
mul.wide.s32 %rd24, %r72, 4;
add.s64 %rd23, %rd7, %rd24;
mov.b32 %r69, %f17;
mov.b32 %r70, %f18;
// begin inline asm
st.global.cs.v2.s32 [%rd23], {%r69,%r70};
// end inline asm
$L__BB0_36:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_160115arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_1601110nvfuser_84ENS_6TensorIfLi3ELi3EEES1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_1601110nvfuser_84ENS_6TensorIfLi3ELi3EEES1__param_0[32],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_1601110nvfuser_84ENS_6TensorIfLi3ELi3EEES1__param_1[32]
)
{
.reg .pred %p<33>;
.reg .f32 %f<71>;
.reg .b32 %r<63>;
.reg .b64 %rd<24>;
ld.param.u64 %rd6, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_1601110nvfuser_84ENS_6TensorIfLi3ELi3EEES1__param_1];
ld.param.u64 %rd5, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_1601110nvfuser_84ENS_6TensorIfLi3ELi3EEES1__param_0];
mov.u32 %r38, %ctaid.x;
mov.u32 %r1, %ntid.y;
mov.u32 %r2, %tid.y;
mad.lo.s32 %r3, %r1, %r38, %r2;
setp.gt.s32 %p2, %r3, 27453;
mov.f32 %f67, 0fFF800000;
mov.f32 %f68, %f67;
@%p2 bra $L__BB0_2;
shl.b32 %r41, %r3, 1;
mul.wide.s32 %rd8, %r41, 4;
add.s64 %rd7, %rd5, %rd8;
// begin inline asm
ld.global.cs.v2.u32 {%r39,%r40}, [%rd7];
// end inline asm
mov.b32 %f67, %r39;
mov.b32 %f68, %r40;
$L__BB0_2:
setp.gt.f32 %p3, %f67, %f68;
setp.nan.f32 %p4, %f67, %f67;
or.pred %p5, %p4, %p3;
selp.f32 %f16, %f67, %f68, %p5;
mov.u32 %r42, %tid.z;
mad.lo.s32 %r4, %r1, %r42, %r2;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r7, %r4, %r5, %r6;
mul.wide.u32 %rd9, %r7, 4;
mov.u64 %rd10, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_84_cu_72f7d4d0_160115arrayE;
add.s64 %rd1, %rd10, %rd9;
st.shared.f32 [%rd1], %f16;
bar.sync 0;
clz.b32 %r43, %r5;
mov.u32 %r44, 31;
sub.s32 %r45, %r44, %r43;
mov.u32 %r46, 1;
shl.b32 %r8, %r46, %r45;
setp.lt.u32 %p6, %r6, %r8;
add.s32 %r47, %r8, %r6;
setp.lt.u32 %p7, %r47, %r5;
and.pred %p1, %p6, %p7;
add.s32 %r48, %r7, %r8;
mul.wide.s32 %rd11, %r48, 4;
add.s64 %rd2, %rd10, %rd11;
not.pred %p8, %p1;
@%p8 bra $L__BB0_4;
ld.shared.f32 %f17, [%rd2];
ld.shared.f32 %f18, [%rd1];
setp.nan.f32 %p9, %f18, %f18;
setp.gt.f32 %p10, %f18, %f17;
or.pred %p11, %p9, %p10;
selp.f32 %f19, %f18, %f17, %p11;
st.shared.f32 [%rd1], %f19;
$L__BB0_4:
bar.sync 0;
shr.u32 %r49, %r8, 31;
add.s32 %r50, %r8, %r49;
shr.s32 %r62, %r50, 1;
setp.lt.s32 %p12, %r8, 4;
@%p12 bra $L__BB0_9;
mov.u32 %r61, %r62;
$L__BB0_6:
setp.ge.u32 %p13, %r6, %r61;
@%p13 bra $L__BB0_8;
add.s32 %r51, %r61, %r7;
mul.wide.s32 %rd12, %r51, 4;
add.s64 %rd14, %rd10, %rd12;
ld.shared.f32 %f20, [%rd1];
setp.nan.f32 %p14, %f20, %f20;
ld.shared.f32 %f21, [%rd14];
setp.gt.f32 %p15, %f20, %f21;
or.pred %p16, %p14, %p15;
selp.f32 %f22, %f20, %f21, %p16;
st.shared.f32 [%rd1], %f22;
$L__BB0_8:
bar.sync 0;
shr.u32 %r11, %r61, 1;
setp.gt.u32 %p17, %r61, 3;
mov.u32 %r61, %r11;
@%p17 bra $L__BB0_6;
$L__BB0_9:
setp.ne.s32 %p18, %r6, 0;
add.s32 %r52, %r7, 1;
mul.wide.u32 %rd15, %r52, 4;
add.s64 %rd3, %rd10, %rd15;
mov.f32 %f69, 0fFF800000;
@%p18 bra $L__BB0_12;
ld.shared.f32 %f69, [%rd1];
setp.lt.u32 %p19, %r5, 2;
@%p19 bra $L__BB0_12;
ld.shared.f32 %f24, [%rd3];
setp.gt.f32 %p20, %f69, %f24;
setp.nan.f32 %p21, %f69, %f69;
or.pred %p22, %p21, %p20;
selp.f32 %f69, %f69, %f24, %p22;
$L__BB0_12:
bar.sync 0;
mul.wide.s32 %rd17, %r4, 4;
add.s64 %rd4, %rd10, %rd17;
setp.eq.s32 %p23, %r6, 0;
@%p23 bra $L__BB0_13;
bra.uni $L__BB0_14;
$L__BB0_13:
st.shared.f32 [%rd4], %f69;
$L__BB0_14:
setp.lt.s32 %p24, %r3, 27454;
bar.sync 0;
ld.shared.f32 %f25, [%rd4];
bar.sync 0;
sub.f32 %f26, %f67, %f25;
mov.f32 %f27, 0f3F000000;
mov.f32 %f28, 0f3BBB989D;
fma.rn.f32 %f29, %f26, %f28, %f27;
cvt.sat.f32.f32 %f30, %f29;
mov.f32 %f31, 0f4B400001;
mov.f32 %f32, 0f437C0000;
fma.rm.f32 %f33, %f30, %f32, %f31;
add.f32 %f34, %f33, 0fCB40007F;
neg.f32 %f35, %f34;
mov.f32 %f36, 0f3FB8AA3B;
fma.rn.f32 %f37, %f26, %f36, %f35;
mov.f32 %f38, 0f32A57060;
fma.rn.f32 %f39, %f26, %f38, %f37;
mov.b32 %r53, %f33;
shl.b32 %r54, %r53, 23;
mov.b32 %f40, %r54;
ex2.approx.ftz.f32 %f41, %f39;
mul.f32 %f8, %f41, %f40;
sub.f32 %f42, %f68, %f25;
fma.rn.f32 %f43, %f42, %f28, %f27;
cvt.sat.f32.f32 %f44, %f43;
fma.rm.f32 %f45, %f44, %f32, %f31;
add.f32 %f46, %f45, 0fCB40007F;
neg.f32 %f47, %f46;
fma.rn.f32 %f48, %f42, %f36, %f47;
fma.rn.f32 %f49, %f42, %f38, %f48;
mov.b32 %r55, %f45;
shl.b32 %r56, %r55, 23;
mov.b32 %f50, %r56;
ex2.approx.ftz.f32 %f51, %f49;
mul.f32 %f9, %f51, %f50;
add.f32 %f52, %f8, 0f00000000;
add.f32 %f53, %f52, %f9;
selp.f32 %f54, %f53, 0f00000000, %p24;
st.shared.f32 [%rd1], %f54;
bar.sync 0;
@%p8 bra $L__BB0_16;
ld.shared.f32 %f55, [%rd2];
ld.shared.f32 %f56, [%rd1];
add.f32 %f57, %f55, %f56;
st.shared.f32 [%rd1], %f57;
$L__BB0_16:
bar.sync 0;
@%p12 bra $L__BB0_20;
$L__BB0_17:
setp.ge.u32 %p27, %r6, %r62;
@%p27 bra $L__BB0_19;
add.s32 %r57, %r62, %r7;
mul.wide.s32 %rd19, %r57, 4;
add.s64 %rd21, %rd10, %rd19;
ld.shared.f32 %f58, [%rd1];
ld.shared.f32 %f59, [%rd21];
add.f32 %f60, %f59, %f58;
st.shared.f32 [%rd1], %f60;
$L__BB0_19:
bar.sync 0;
shr.u32 %r13, %r62, 1;
setp.gt.u32 %p28, %r62, 3;
mov.u32 %r62, %r13;
@%p28 bra $L__BB0_17;
$L__BB0_20:
mov.f32 %f70, 0f00000000;
@%p18 bra $L__BB0_23;
ld.shared.f32 %f62, [%rd1];
add.f32 %f70, %f62, 0f00000000;
setp.lt.u32 %p30, %r5, 2;
@%p30 bra $L__BB0_23;
ld.shared.f32 %f63, [%rd3];
add.f32 %f70, %f70, %f63;
$L__BB0_23:
bar.sync 0;
@%p18 bra $L__BB0_25;
st.shared.f32 [%rd4], %f70;
$L__BB0_25:
bar.sync 0;
ld.shared.f32 %f13, [%rd4];
bar.sync 0;
@%p2 bra $L__BB0_27;
rcp.rn.f32 %f64, %f13;
mul.f32 %f65, %f64, %f8;
mov.b32 %r58, %f65;
mul.f32 %f66, %f64, %f9;
mov.b32 %r59, %f66;
shl.b32 %r60, %r3, 1;
mul.wide.s32 %rd23, %r60, 4;
add.s64 %rd22, %rd6, %rd23;
// begin inline asm
st.global.cs.v2.s32 [%rd22], {%r58,%r59};
// end inline asm
$L__BB0_27:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -18,303 +18,244 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_0[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_1[32]
)
{
- .reg .pred %p<41>;
- .reg .f32 %f<81>;
- .reg .b32 %r<79>;
- .reg .b64 %rd<27>;
-
-
- ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_1];
- ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_0];
- mov.u32 %r1, %tid.x;
- setp.lt.s32 %p2, %r1, 1;
- mov.f32 %f76, 0fFF800000;
- mov.f32 %f77, 0fFF800000;
- @%p2 bra $L__BB0_1;
- bra.uni $L__BB0_3;
-
-$L__BB0_1:
- mov.u32 %r40, %tid.y;
- mov.u32 %r41, %ctaid.x;
- mov.u32 %r42, %ntid.y;
- mad.lo.s32 %r2, %r42, %r41, %r40;
- setp.gt.s32 %p3, %r2, 27453;
- @%p3 bra $L__BB0_3;
-
- add.s32 %r45, %r2, %r1;
- shl.b32 %r46, %r45, 1;
- mul.wide.s32 %rd9, %r46, 4;
- add.s64 %rd8, %rd6, %rd9;
-
- ld.global.cs.v2.u32 {%r43,%r44}, [%rd8];
-
- mov.b32 %f76, %r43;
- mov.b32 %f77, %r44;
-
-$L__BB0_3:
- setp.gt.f32 %p4, %f76, %f77;
- setp.nan.f32 %p5, %f76, %f76;
- or.pred %p6, %p5, %p4;
- selp.f32 %f23, %f76, %f77, %p6;
- mov.u32 %r47, %tid.z;
- mov.u32 %r3, %ntid.y;
- mov.u32 %r4, %tid.y;
- mad.lo.s32 %r5, %r3, %r47, %r4;
- mov.u32 %r6, %ntid.x;
- mad.lo.s32 %r7, %r5, %r6, %r1;
- mul.wide.u32 %rd10, %r7, 4;
- mov.u64 %rd11, _ZN11kernelscope6kernelE;
- add.s64 %rd1, %rd11, %rd10;
- st.shared.f32 [%rd1], %f23;
- bar.sync 0;
- clz.b32 %r48, %r6;
- mov.u32 %r49, 31;
- sub.s32 %r50, %r49, %r48;
- mov.u32 %r51, 1;
- shl.b32 %r8, %r51, %r50;
- setp.lt.u32 %p7, %r1, %r8;
- add.s32 %r52, %r8, %r1;
- setp.lt.u32 %p8, %r52, %r6;
- and.pred %p1, %p7, %p8;
- add.s32 %r53, %r7, %r8;
- mul.wide.s32 %rd12, %r53, 4;
- add.s64 %rd2, %rd11, %rd12;
- not.pred %p9, %p1;
- @%p9 bra $L__BB0_5;
-
- ld.shared.f32 %f24, [%rd2];
- ld.shared.f32 %f25, [%rd1];
- setp.nan.f32 %p10, %f25, %f25;
- setp.gt.f32 %p11, %f25, %f24;
- or.pred %p12, %p10, %p11;
- selp.f32 %f26, %f25, %f24, %p12;
- st.shared.f32 [%rd1], %f26;
-
-$L__BB0_5:
- bar.sync 0;
- shr.u32 %r54, %r8, 31;
- add.s32 %r55, %r8, %r54;
- shr.s32 %r78, %r55, 1;
- setp.lt.s32 %p13, %r8, 4;
- @%p13 bra $L__BB0_10;
-
- mov.u32 %r77, %r78;
-
-$L__BB0_7:
- setp.ge.u32 %p14, %r1, %r77;
- @%p14 bra $L__BB0_9;
-
- add.s32 %r56, %r77, %r7;
- mul.wide.s32 %rd13, %r56, 4;
- add.s64 %rd15, %rd11, %rd13;
- ld.shared.f32 %f27, [%rd1];
- setp.nan.f32 %p15, %f27, %f27;
- ld.shared.f32 %f28, [%rd15];
- setp.gt.f32 %p16, %f27, %f28;
- or.pred %p17, %p15, %p16;
- selp.f32 %f29, %f27, %f28, %p17;
- st.shared.f32 [%rd1], %f29;
+ .reg .pred %p<33>;
+ .reg .f32 %f<71>;
+ .reg .b32 %r<63>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_1];
+ ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1__param_0];
+ mov.u32 %r38, %ctaid.x;
+ mov.u32 %r1, %ntid.y;
+ mov.u32 %r2, %tid.y;
+ mad.lo.s32 %r3, %r1, %r38, %r2;
+ setp.gt.s32 %p2, %r3, 27453;
+ mov.f32 %f67, 0fFF800000;
+ mov.f32 %f68, %f67;
+ @%p2 bra $L__BB0_2;
+
+ shl.b32 %r41, %r3, 1;
+ mul.wide.s32 %rd8, %r41, 4;
+ add.s64 %rd7, %rd5, %rd8;
+
+ ld.global.cs.v2.u32 {%r39,%r40}, [%rd7];
+
+ mov.b32 %f67, %r39;
+ mov.b32 %f68, %r40;
+
+$L__BB0_2:
+ setp.gt.f32 %p3, %f67, %f68;
+ setp.nan.f32 %p4, %f67, %f67;
+ or.pred %p5, %p4, %p3;
+ selp.f32 %f16, %f67, %f68, %p5;
+ mov.u32 %r42, %tid.z;
+ mad.lo.s32 %r4, %r1, %r42, %r2;
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %tid.x;
+ mad.lo.s32 %r7, %r4, %r5, %r6;
+ mul.wide.u32 %rd9, %r7, 4;
+ mov.u64 %rd10, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd10, %rd9;
+ st.shared.f32 [%rd1], %f16;
+ bar.sync 0;
+ clz.b32 %r43, %r5;
+ mov.u32 %r44, 31;
+ sub.s32 %r45, %r44, %r43;
+ mov.u32 %r46, 1;
+ shl.b32 %r8, %r46, %r45;
+ setp.lt.u32 %p6, %r6, %r8;
+ add.s32 %r47, %r8, %r6;
+ setp.lt.u32 %p7, %r47, %r5;
+ and.pred %p1, %p6, %p7;
+ add.s32 %r48, %r7, %r8;
+ mul.wide.s32 %rd11, %r48, 4;
+ add.s64 %rd2, %rd10, %rd11;
+ not.pred %p8, %p1;
+ @%p8 bra $L__BB0_4;
+
+ ld.shared.f32 %f17, [%rd2];
+ ld.shared.f32 %f18, [%rd1];
+ setp.nan.f32 %p9, %f18, %f18;
+ setp.gt.f32 %p10, %f18, %f17;
+ or.pred %p11, %p9, %p10;
+ selp.f32 %f19, %f18, %f17, %p11;
+ st.shared.f32 [%rd1], %f19;
+
+$L__BB0_4:
+ bar.sync 0;
+ shr.u32 %r49, %r8, 31;
+ add.s32 %r50, %r8, %r49;
+ shr.s32 %r62, %r50, 1;
+ setp.lt.s32 %p12, %r8, 4;
+ @%p12 bra $L__BB0_9;
+
+ mov.u32 %r61, %r62;
+
+$L__BB0_6:
+ setp.ge.u32 %p13, %r6, %r61;
+ @%p13 bra $L__BB0_8;
+
+ add.s32 %r51, %r61, %r7;
+ mul.wide.s32 %rd12, %r51, 4;
+ add.s64 %rd14, %rd10, %rd12;
+ ld.shared.f32 %f20, [%rd1];
+ setp.nan.f32 %p14, %f20, %f20;
+ ld.shared.f32 %f21, [%rd14];
+ setp.gt.f32 %p15, %f20, %f21;
+ or.pred %p16, %p14, %p15;
+ selp.f32 %f22, %f20, %f21, %p16;
+ st.shared.f32 [%rd1], %f22;
+
+$L__BB0_8:
+ bar.sync 0;
+ shr.u32 %r11, %r61, 1;
+ setp.gt.u32 %p17, %r61, 3;
+ mov.u32 %r61, %r11;
+ @%p17 bra $L__BB0_6;
$L__BB0_9:
- bar.sync 0;
- shr.u32 %r11, %r77, 1;
- setp.gt.u32 %p18, %r77, 3;
- mov.u32 %r77, %r11;
- @%p18 bra $L__BB0_7;
-
-$L__BB0_10:
- setp.ne.s32 %p19, %r1, 0;
- add.s32 %r57, %r7, 1;
- mul.wide.u32 %rd16, %r57, 4;
- add.s64 %rd3, %rd11, %rd16;
- mov.f32 %f78, 0fFF800000;
- @%p19 bra $L__BB0_13;
-
- ld.shared.f32 %f78, [%rd1];
- setp.lt.u32 %p20, %r6, 2;
- @%p20 bra $L__BB0_13;
-
- ld.shared.f32 %f31, [%rd3];
- setp.gt.f32 %p21, %f78, %f31;
- setp.nan.f32 %p22, %f78, %f78;
- or.pred %p23, %p22, %p21;
- selp.f32 %f78, %f78, %f31, %p23;
+ setp.ne.s32 %p18, %r6, 0;
+ add.s32 %r52, %r7, 1;
+ mul.wide.u32 %rd15, %r52, 4;
+ add.s64 %rd3, %rd10, %rd15;
+ mov.f32 %f69, 0fFF800000;
+ @%p18 bra $L__BB0_12;
+
+ ld.shared.f32 %f69, [%rd1];
+ setp.lt.u32 %p19, %r5, 2;
+ @%p19 bra $L__BB0_12;
+
+ ld.shared.f32 %f24, [%rd3];
+ setp.gt.f32 %p20, %f69, %f24;
+ setp.nan.f32 %p21, %f69, %f69;
+ or.pred %p22, %p21, %p20;
+ selp.f32 %f69, %f69, %f24, %p22;
+
+$L__BB0_12:
+ bar.sync 0;
+ mul.wide.s32 %rd17, %r4, 4;
+ add.s64 %rd4, %rd10, %rd17;
+ setp.eq.s32 %p23, %r6, 0;
+ @%p23 bra $L__BB0_13;
+ bra.uni $L__BB0_14;
$L__BB0_13:
- bar.sync 0;
- mul.wide.s32 %rd18, %r5, 4;
- add.s64 %rd4, %rd11, %rd18;
- setp.eq.s32 %p24, %r1, 0;
- @%p24 bra $L__BB0_14;
- bra.uni $L__BB0_15;
+ st.shared.f32 [%rd4], %f69;
$L__BB0_14:
- st.shared.f32 [%rd4], %f78;
-
-$L__BB0_15:
- bar.sync 0;
- ld.shared.f32 %f32, [%rd4];
- bar.sync 0;
- sub.f32 %f33, %f76, %f32;
- mov.f32 %f34, 0f3F000000;
- mov.f32 %f35, 0f3BBB989D;
- fma.rn.f32 %f36, %f33, %f35, %f34;
- cvt.sat.f32.f32 %f37, %f36;
- mov.f32 %f38, 0f4B400001;
- mov.f32 %f39, 0f437C0000;
- fma.rm.f32 %f40, %f37, %f39, %f38;
- add.f32 %f41, %f40, 0fCB40007F;
- neg.f32 %f42, %f41;
- mov.f32 %f43, 0f3FB8AA3B;
- fma.rn.f32 %f44, %f33, %f43, %f42;
- mov.f32 %f45, 0f32A57060;
- fma.rn.f32 %f46, %f33, %f45, %f44;
- mov.b32 %r58, %f40;
- shl.b32 %r59, %r58, 23;
- mov.b32 %f47, %r59;
- ex2.approx.ftz.f32 %f48, %f46;
- mul.f32 %f8, %f48, %f47;
- sub.f32 %f49, %f77, %f32;
- fma.rn.f32 %f50, %f49, %f35, %f34;
- cvt.sat.f32.f32 %f51, %f50;
- fma.rm.f32 %f52, %f51, %f39, %f38;
- add.f32 %f53, %f52, 0fCB40007F;
- neg.f32 %f54, %f53;
- fma.rn.f32 %f55, %f49, %f43, %f54;
- fma.rn.f32 %f56, %f49, %f45, %f55;
- mov.b32 %r60, %f52;
- shl.b32 %r61, %r60, 23;
- mov.b32 %f57, %r61;
- ex2.approx.ftz.f32 %f58, %f56;
- mul.f32 %f9, %f58, %f57;
- @%p2 bra $L__BB0_16;
- bra.uni $L__BB0_17;
+ setp.lt.s32 %p24, %r3, 27454;
+ bar.sync 0;
+ ld.shared.f32 %f25, [%rd4];
+ bar.sync 0;
+ sub.f32 %f26, %f67, %f25;
+ mov.f32 %f27, 0f3F000000;
+ mov.f32 %f28, 0f3BBB989D;
+ fma.rn.f32 %f29, %f26, %f28, %f27;
+ cvt.sat.f32.f32 %f30, %f29;
+ mov.f32 %f31, 0f4B400001;
+ mov.f32 %f32, 0f437C0000;
+ fma.rm.f32 %f33, %f30, %f32, %f31;
+ add.f32 %f34, %f33, 0fCB40007F;
+ neg.f32 %f35, %f34;
+ mov.f32 %f36, 0f3FB8AA3B;
+ fma.rn.f32 %f37, %f26, %f36, %f35;
+ mov.f32 %f38, 0f32A57060;
+ fma.rn.f32 %f39, %f26, %f38, %f37;
+ mov.b32 %r53, %f33;
+ shl.b32 %r54, %r53, 23;
+ mov.b32 %f40, %r54;
+ ex2.approx.ftz.f32 %f41, %f39;
+ mul.f32 %f8, %f41, %f40;
+ sub.f32 %f42, %f68, %f25;
+ fma.rn.f32 %f43, %f42, %f28, %f27;
+ cvt.sat.f32.f32 %f44, %f43;
+ fma.rm.f32 %f45, %f44, %f32, %f31;
+ add.f32 %f46, %f45, 0fCB40007F;
+ neg.f32 %f47, %f46;
+ fma.rn.f32 %f48, %f42, %f36, %f47;
+ fma.rn.f32 %f49, %f42, %f38, %f48;
+ mov.b32 %r55, %f45;
+ shl.b32 %r56, %r55, 23;
+ mov.b32 %f50, %r56;
+ ex2.approx.ftz.f32 %f51, %f49;
+ mul.f32 %f9, %f51, %f50;
+ add.f32 %f52, %f8, 0f00000000;
+ add.f32 %f53, %f52, %f9;
+ selp.f32 %f54, %f53, 0f00000000, %p24;
+ st.shared.f32 [%rd1], %f54;
+ bar.sync 0;
+ @%p8 bra $L__BB0_16;
+
+ ld.shared.f32 %f55, [%rd2];
+ ld.shared.f32 %f56, [%rd1];
+ add.f32 %f57, %f55, %f56;
+ st.shared.f32 [%rd1], %f57;
$L__BB0_16:
- mov.u32 %r62, %ctaid.x;
- mad.lo.s32 %r63, %r3, %r62, %r4;
- setp.lt.s32 %p26, %r63, 27454;
- @%p26 bra $L__BB0_18;
- bra.uni $L__BB0_17;
-
-$L__BB0_18:
- add.f32 %f61, %f8, 0f00000000;
- add.f32 %f79, %f61, %f9;
- bra.uni $L__BB0_19;
+ bar.sync 0;
+ @%p12 bra $L__BB0_20;
$L__BB0_17:
- mov.u32 %r64, %ctaid.x;
- mad.lo.s32 %r65, %r3, %r64, %r4;
- setp.lt.s32 %p28, %r65, 27454;
- and.pred %p29, %p2, %p28;
- add.f32 %f59, %f8, 0f00000000;
- add.f32 %f60, %f59, %f9;
- selp.f32 %f79, %f60, 0f00000000, %p29;
+ setp.ge.u32 %p27, %r6, %r62;
+ @%p27 bra $L__BB0_19;
+
+ add.s32 %r57, %r62, %r7;
+ mul.wide.s32 %rd19, %r57, 4;
+ add.s64 %rd21, %rd10, %rd19;
+ ld.shared.f32 %f58, [%rd1];
+ ld.shared.f32 %f59, [%rd21];
+ add.f32 %f60, %f59, %f58;
+ st.shared.f32 [%rd1], %f60;
$L__BB0_19:
- st.shared.f32 [%rd1], %f79;
- bar.sync 0;
- @%p9 bra $L__BB0_21;
-
- ld.shared.f32 %f62, [%rd2];
- ld.shared.f32 %f63, [%rd1];
- add.f32 %f64, %f62, %f63;
- st.shared.f32 [%rd1], %f64;
-
-$L__BB0_21:
- bar.sync 0;
- @%p13 bra $L__BB0_25;
-
-$L__BB0_22:
- setp.ge.u32 %p32, %r1, %r78;
- @%p32 bra $L__BB0_24;
-
- add.s32 %r66, %r78, %r7;
- mul.wide.s32 %rd20, %r66, 4;
- add.s64 %rd22, %rd11, %rd20;
- ld.shared.f32 %f65, [%rd1];
- ld.shared.f32 %f66, [%rd22];
- add.f32 %f67, %f66, %f65;
- st.shared.f32 [%rd1], %f67;
-
-$L__BB0_24:
- bar.sync 0;
- shr.u32 %r13, %r78, 1;
- setp.gt.u32 %p33, %r78, 3;
- mov.u32 %r78, %r13;
- @%p33 bra $L__BB0_22;
+ bar.sync 0;
+ shr.u32 %r13, %r62, 1;
+ setp.gt.u32 %p28, %r62, 3;
+ mov.u32 %r62, %r13;
+ @%p28 bra $L__BB0_17;
+
+$L__BB0_20:
+ mov.f32 %f70, 0f00000000;
+ @%p18 bra $L__BB0_23;
+
+ ld.shared.f32 %f62, [%rd1];
+ add.f32 %f70, %f62, 0f00000000;
+ setp.lt.u32 %p30, %r5, 2;
+ @%p30 bra $L__BB0_23;
+
+ ld.shared.f32 %f63, [%rd3];
+ add.f32 %f70, %f70, %f63;
+
+$L__BB0_23:
+ bar.sync 0;
+ @%p18 bra $L__BB0_25;
+
+ st.shared.f32 [%rd4], %f70;
$L__BB0_25:
- mov.f32 %f80, 0f00000000;
- @%p19 bra $L__BB0_28;
-
- ld.shared.f32 %f69, [%rd1];
- add.f32 %f80, %f69, 0f00000000;
- setp.lt.u32 %p35, %r6, 2;
- @%p35 bra $L__BB0_28;
-
- ld.shared.f32 %f70, [%rd3];
- add.f32 %f80, %f80, %f70;
-
-$L__BB0_28:
- bar.sync 0;
- @%p19 bra $L__BB0_30;
-
- st.shared.f32 [%rd4], %f80;
-
-$L__BB0_30:
- setp.gt.s32 %p37, %r1, 0;
- bar.sync 0;
- ld.shared.f32 %f71, [%rd4];
- bar.sync 0;
- rcp.rn.f32 %f16, %f71;
- @%p37 bra $L__BB0_32;
-
- mov.u32 %r67, %ctaid.x;
- mad.lo.s32 %r14, %r3, %r67, %r4;
- setp.lt.s32 %p38, %r14, 27454;
- @%p38 bra $L__BB0_35;
- bra.uni $L__BB0_32;
-
-$L__BB0_35:
- mul.f32 %f72, %f16, %f8;
- mov.b32 %r73, %f72;
- mul.f32 %f73, %f16, %f9;
- mov.b32 %r74, %f73;
- add.s32 %r75, %r14, %r1;
- shl.b32 %r76, %r75, 1;
- mul.wide.s32 %rd26, %r76, 4;
- add.s64 %rd25, %rd7, %rd26;
-
- st.global.cs.v2.s32 [%rd25], {%r73,%r74};
-
- bra.uni $L__BB0_36;
-
-$L__BB0_32:
- mul.f32 %f17, %f16, %f8;
- mul.f32 %f18, %f16, %f9;
- @%p37 bra $L__BB0_36;
-
- mov.u32 %r68, %ctaid.x;
- mad.lo.s32 %r15, %r3, %r68, %r4;
- setp.gt.s32 %p40, %r15, 27453;
- @%p40 bra $L__BB0_36;
-
- add.s32 %r71, %r15, %r1;
- shl.b32 %r72, %r71, 1;
- mul.wide.s32 %rd24, %r72, 4;
- add.s64 %rd23, %rd7, %rd24;
- mov.b32 %r69, %f17;
- mov.b32 %r70, %f18;
-
- st.global.cs.v2.s32 [%rd23], {%r69,%r70};
-
-
-$L__BB0_36:
+ bar.sync 0;
+ ld.shared.f32 %f13, [%rd4];
+ bar.sync 0;
+ @%p2 bra $L__BB0_27;
+
+ rcp.rn.f32 %f64, %f13;
+ mul.f32 %f65, %f64, %f8;
+ mov.b32 %r58, %f65;
+ mul.f32 %f66, %f64, %f9;
+ mov.b32 %r59, %f66;
+ shl.b32 %r60, %r3, 1;
+ mul.wide.s32 %rd23, %r60, 4;
+ add.s64 %rd22, %rd6, %rd23;
+
+ st.global.cs.v2.s32 [%rd22], {%r58,%r59};
+
+
+$L__BB0_27:
ret;
}
Kernel 90
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 18
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T12) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T13;
T13[0] = 0;
T13[0]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T13[0]
+ T14[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T7;
T7[0]
= T4[0]
- T4[0];
Array<float, 1, 1> T8;
T8[0]
= expf(T7[0]);
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T8[0]);
Array<float, 1, 1> T15;
T15[0]
= T8[0]
* T11[0];
T12[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T15[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T12) {
if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
= T1[(((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T13;
T13[0] = 0;
T13[0]
= T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T13[0]
+ T14[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T7;
T7[0]
= T4[0]
- T4[0];
Array<float, 1, 1> T8;
T8[0]
= expf(T7[0]);
Array<float, 1, 1> T11;
T11[0]
= reciprocal(T8[0]);
Array<float, 1, 1> T15;
T15[0]
= T8[0]
* T11[0];
T12[((nvfuser_index_t)threadIdx.x)]
= T15[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,15 +1,15 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T12) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
- = T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T1[(((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T13;
T13[0] = 0;
T13[0]
- = T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T13[0]
+ T14[0];
Array<float, 1, 1> T3;
@@ -30,9 +30,9 @@
= reciprocal(T8[0]);
Array<float, 1, 1> T15;
T15[0]
= T8[0]
* T11[0];
- T12[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T12[((nvfuser_index_t)threadIdx.x)]
= T15[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0[40],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1[40],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<2>;
.reg .f32 %f<23>;
.reg .b32 %r<113>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_de85203b_1911010nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
mov.u32 %r70, %ctaid.x;
shl.b32 %r71, %r70, 7;
mov.u32 %r72, %tid.x;
add.s32 %r9, %r71, %r72;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
mul.hi.s32 %r73, %r9, -2004318071;
add.s32 %r74, %r73, %r9;
shr.u32 %r75, %r74, 31;
shr.s32 %r76, %r74, 5;
add.s32 %r77, %r76, %r75;
mul.hi.s32 %r78, %r9, 1717986919;
shr.u32 %r79, %r78, 31;
shr.s32 %r80, %r78, 3;
add.s32 %r81, %r80, %r79;
mul.hi.s32 %r82, %r81, 1431655766;
shr.u32 %r83, %r82, 31;
add.s32 %r84, %r82, %r83;
mul.lo.s32 %r85, %r84, 3;
sub.s32 %r86, %r81, %r85;
mul.lo.s32 %r87, %r53, %r86;
mul.lo.s32 %r88, %r81, 20;
sub.s32 %r89, %r9, %r88;
mul.hi.s32 %r90, %r89, 1717986919;
shr.u32 %r91, %r90, 31;
shr.s32 %r92, %r90, 2;
add.s32 %r93, %r92, %r91;
shl.b32 %r94, %r93, 1;
mul.lo.s32 %r95, %r93, 10;
sub.s32 %r96, %r89, %r95;
mul.hi.s32 %r97, %r96, 1717986919;
shr.u32 %r98, %r97, 31;
shr.s32 %r99, %r97, 1;
add.s32 %r100, %r99, %r98;
mul.lo.s32 %r101, %r100, 5;
sub.s32 %r102, %r96, %r101;
add.s32 %r103, %r94, %r100;
mad.lo.s32 %r104, %r52, %r77, %r87;
mad.lo.s32 %r105, %r55, %r102, %r104;
mad.lo.s32 %r106, %r103, %r54, %r105;
mul.wide.s32 %rd7, %r106, 4;
add.s64 %rd8, %rd4, %rd7;
mul.lo.s32 %r107, %r45, %r86;
mad.lo.s32 %r108, %r44, %r77, %r107;
mad.lo.s32 %r109, %r102, %r47, %r108;
mad.lo.s32 %r110, %r103, %r46, %r109;
mul.wide.s32 %rd9, %r110, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
sub.f32 %f4, %f3, %f3;
mov.f32 %f5, 0f3F000000;
mov.f32 %f6, 0f3BBB989D;
fma.rn.f32 %f7, %f4, %f6, %f5;
cvt.sat.f32.f32 %f8, %f7;
mov.f32 %f9, 0f4B400001;
mov.f32 %f10, 0f437C0000;
fma.rm.f32 %f11, %f8, %f10, %f9;
add.f32 %f12, %f11, 0fCB40007F;
neg.f32 %f13, %f12;
mov.f32 %f14, 0f3FB8AA3B;
fma.rn.f32 %f15, %f4, %f14, %f13;
mov.f32 %f16, 0f32A57060;
fma.rn.f32 %f17, %f4, %f16, %f15;
mov.b32 %r111, %f11;
shl.b32 %r112, %r111, 23;
mov.b32 %f18, %r112;
ex2.approx.ftz.f32 %f19, %f17;
mul.f32 %f20, %f19, %f18;
rcp.rn.f32 %f21, %f20;
mul.f32 %f22, %f20, %f21;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd6, %rd11;
st.global.f32 [%rd12], %f22;
$L__BB0_2:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0[40],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1[40],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<2>;
.reg .f32 %f<23>;
.reg .b32 %r<110>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_95_cu_e7c53a38_1601110nvfuser_95ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
mov.u32 %r9, %tid.x;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
mul.hi.s32 %r70, %r9, -2004318071;
add.s32 %r71, %r70, %r9;
shr.u32 %r72, %r71, 31;
shr.s32 %r73, %r71, 5;
add.s32 %r74, %r73, %r72;
mul.hi.s32 %r75, %r9, 1717986919;
shr.u32 %r76, %r75, 31;
shr.s32 %r77, %r75, 3;
add.s32 %r78, %r77, %r76;
mul.hi.s32 %r79, %r78, 1431655766;
shr.u32 %r80, %r79, 31;
add.s32 %r81, %r79, %r80;
mul.lo.s32 %r82, %r81, 3;
sub.s32 %r83, %r78, %r82;
mul.lo.s32 %r84, %r53, %r83;
mul.lo.s32 %r85, %r78, 20;
sub.s32 %r86, %r9, %r85;
mul.hi.s32 %r87, %r86, 1717986919;
shr.u32 %r88, %r87, 31;
shr.s32 %r89, %r87, 2;
add.s32 %r90, %r89, %r88;
shl.b32 %r91, %r90, 1;
mul.lo.s32 %r92, %r90, 10;
sub.s32 %r93, %r86, %r92;
mul.hi.s32 %r94, %r93, 1717986919;
shr.u32 %r95, %r94, 31;
shr.s32 %r96, %r94, 1;
add.s32 %r97, %r96, %r95;
mul.lo.s32 %r98, %r97, 5;
sub.s32 %r99, %r93, %r98;
add.s32 %r100, %r91, %r97;
mad.lo.s32 %r101, %r52, %r74, %r84;
mad.lo.s32 %r102, %r55, %r99, %r101;
mad.lo.s32 %r103, %r100, %r54, %r102;
mul.wide.s32 %rd7, %r103, 4;
add.s64 %rd8, %rd4, %rd7;
mul.lo.s32 %r104, %r45, %r83;
mad.lo.s32 %r105, %r44, %r74, %r104;
mad.lo.s32 %r106, %r47, %r99, %r105;
mad.lo.s32 %r107, %r100, %r46, %r106;
mul.wide.s32 %rd9, %r107, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
sub.f32 %f4, %f3, %f3;
mov.f32 %f5, 0f3F000000;
mov.f32 %f6, 0f3BBB989D;
fma.rn.f32 %f7, %f4, %f6, %f5;
cvt.sat.f32.f32 %f8, %f7;
mov.f32 %f9, 0f4B400001;
mov.f32 %f10, 0f437C0000;
fma.rm.f32 %f11, %f8, %f10, %f9;
add.f32 %f12, %f11, 0fCB40007F;
neg.f32 %f13, %f12;
mov.f32 %f14, 0f3FB8AA3B;
fma.rn.f32 %f15, %f4, %f14, %f13;
mov.f32 %f16, 0f32A57060;
fma.rn.f32 %f17, %f4, %f16, %f15;
mov.b32 %r108, %f11;
shl.b32 %r109, %r108, 23;
mov.b32 %f18, %r109;
ex2.approx.ftz.f32 %f19, %f17;
mul.f32 %f20, %f19, %f18;
rcp.rn.f32 %f21, %f20;
mul.f32 %f22, %f20, %f21;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd6, %rd11;
st.global.f32 [%rd12], %f22;
$L__BB0_2:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,72 +20,69 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<2>;
.reg .f32 %f<23>;
- .reg .b32 %r<113>;
+ .reg .b32 %r<110>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
- mov.u32 %r70, %ctaid.x;
- shl.b32 %r71, %r70, 7;
- mov.u32 %r72, %tid.x;
- add.s32 %r9, %r71, %r72;
+ mov.u32 %r9, %tid.x;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
- mul.hi.s32 %r73, %r9, -2004318071;
- add.s32 %r74, %r73, %r9;
- shr.u32 %r75, %r74, 31;
- shr.s32 %r76, %r74, 5;
- add.s32 %r77, %r76, %r75;
- mul.hi.s32 %r78, %r9, 1717986919;
- shr.u32 %r79, %r78, 31;
- shr.s32 %r80, %r78, 3;
- add.s32 %r81, %r80, %r79;
- mul.hi.s32 %r82, %r81, 1431655766;
- shr.u32 %r83, %r82, 31;
- add.s32 %r84, %r82, %r83;
- mul.lo.s32 %r85, %r84, 3;
- sub.s32 %r86, %r81, %r85;
- mul.lo.s32 %r87, %r53, %r86;
- mul.lo.s32 %r88, %r81, 20;
- sub.s32 %r89, %r9, %r88;
- mul.hi.s32 %r90, %r89, 1717986919;
- shr.u32 %r91, %r90, 31;
- shr.s32 %r92, %r90, 2;
- add.s32 %r93, %r92, %r91;
- shl.b32 %r94, %r93, 1;
- mul.lo.s32 %r95, %r93, 10;
- sub.s32 %r96, %r89, %r95;
- mul.hi.s32 %r97, %r96, 1717986919;
- shr.u32 %r98, %r97, 31;
- shr.s32 %r99, %r97, 1;
- add.s32 %r100, %r99, %r98;
- mul.lo.s32 %r101, %r100, 5;
- sub.s32 %r102, %r96, %r101;
- add.s32 %r103, %r94, %r100;
- mad.lo.s32 %r104, %r52, %r77, %r87;
- mad.lo.s32 %r105, %r55, %r102, %r104;
- mad.lo.s32 %r106, %r103, %r54, %r105;
- mul.wide.s32 %rd7, %r106, 4;
+ mul.hi.s32 %r70, %r9, -2004318071;
+ add.s32 %r71, %r70, %r9;
+ shr.u32 %r72, %r71, 31;
+ shr.s32 %r73, %r71, 5;
+ add.s32 %r74, %r73, %r72;
+ mul.hi.s32 %r75, %r9, 1717986919;
+ shr.u32 %r76, %r75, 31;
+ shr.s32 %r77, %r75, 3;
+ add.s32 %r78, %r77, %r76;
+ mul.hi.s32 %r79, %r78, 1431655766;
+ shr.u32 %r80, %r79, 31;
+ add.s32 %r81, %r79, %r80;
+ mul.lo.s32 %r82, %r81, 3;
+ sub.s32 %r83, %r78, %r82;
+ mul.lo.s32 %r84, %r53, %r83;
+ mul.lo.s32 %r85, %r78, 20;
+ sub.s32 %r86, %r9, %r85;
+ mul.hi.s32 %r87, %r86, 1717986919;
+ shr.u32 %r88, %r87, 31;
+ shr.s32 %r89, %r87, 2;
+ add.s32 %r90, %r89, %r88;
+ shl.b32 %r91, %r90, 1;
+ mul.lo.s32 %r92, %r90, 10;
+ sub.s32 %r93, %r86, %r92;
+ mul.hi.s32 %r94, %r93, 1717986919;
+ shr.u32 %r95, %r94, 31;
+ shr.s32 %r96, %r94, 1;
+ add.s32 %r97, %r96, %r95;
+ mul.lo.s32 %r98, %r97, 5;
+ sub.s32 %r99, %r93, %r98;
+ add.s32 %r100, %r91, %r97;
+ mad.lo.s32 %r101, %r52, %r74, %r84;
+ mad.lo.s32 %r102, %r55, %r99, %r101;
+ mad.lo.s32 %r103, %r100, %r54, %r102;
+ mul.wide.s32 %rd7, %r103, 4;
add.s64 %rd8, %rd4, %rd7;
- mul.lo.s32 %r107, %r45, %r86;
- mad.lo.s32 %r108, %r44, %r77, %r107;
- mad.lo.s32 %r109, %r102, %r47, %r108;
- mad.lo.s32 %r110, %r103, %r46, %r109;
- mul.wide.s32 %rd9, %r110, 4;
+ mul.lo.s32 %r104, %r45, %r83;
+ mad.lo.s32 %r105, %r44, %r74, %r104;
+ mad.lo.s32 %r106, %r47, %r99, %r105;
+ mad.lo.s32 %r107, %r100, %r46, %r106;
+ mul.wide.s32 %rd9, %r107, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
sub.f32 %f4, %f3, %f3;
@@ -100,13 +97,13 @@
neg.f32 %f13, %f12;
mov.f32 %f14, 0f3FB8AA3B;
fma.rn.f32 %f15, %f4, %f14, %f13;
mov.f32 %f16, 0f32A57060;
fma.rn.f32 %f17, %f4, %f16, %f15;
- mov.b32 %r111, %f11;
- shl.b32 %r112, %r111, 23;
- mov.b32 %f18, %r112;
+ mov.b32 %r108, %f11;
+ shl.b32 %r109, %r108, 23;
+ mov.b32 %f18, %r109;
ex2.approx.ftz.f32 %f19, %f17;
mul.f32 %f20, %f19, %f18;
rcp.rn.f32 %f21, %f20;
mul.f32 %f22, %f20, %f21;
mul.wide.s32 %rd11, %r9, 4;
Kernel 133
CUDA
PTX
53997da5d
Diff
03a1b695e
-9
+9 index type: int
registers: 15
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 3, 3> T8, Tensor<float, 3, 3> T15) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T13;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0]
= T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22))) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
T13[i0]
= T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22))) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T17;
T17[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T17[0] = fmax(
T17[0],
T13[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T17[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
T18[0]
= T18[0]
+ T5[0];
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
} else {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
T18[0]
= T18[0]
+ T5[0];
}
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
}
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T18[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T14;
T14[0]
= reciprocal(T7[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T8[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T14[0];
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 3, 3> T8, Tensor<float, 3, 3> T15) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T13;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0]
= T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T13[i0]
= T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T17;
T17[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T17[0] = fmax(
T17[0],
T13[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T17[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
T18[0]
= T18[0]
+ T5[0];
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
} else {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T18[0]
= T18[0]
+ T5[0];
}
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
}
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T18[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T14;
T14[0]
= reciprocal(T7[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T8[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T14[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -6,23 +6,23 @@
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0]
- = T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22))) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T13[i0]
- = T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22))) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T17;
@@ -39,11 +39,11 @@
blockReduce<true, false, false, true>(T2[0], T17[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
@@ -57,11 +57,11 @@
+ T5[0];
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
} else {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
@@ -69,21 +69,21 @@
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T18[0]
= T18[0]
+ T5[0];
}
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
}
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T18[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_2[32]
)
{
.reg .pred %p<44>;
.reg .f32 %f<105>;
.reg .b32 %r<127>;
.reg .b64 %rd<39>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r42, %r43}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_0+16];
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
ld.param.u64 %rd9, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_2];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_1];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_0];
cvta.to.global.u64 %rd1, %rd7;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p4, %r1, 0;
@%p4 bra $L__BB0_2;
mov.u32 %r58, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r58;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd10, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r59, [%rd10], %r1;
ld.shared.u32 %r60, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_1911011nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
shl.b32 %r5, %r60, 1;
setp.lt.s32 %p5, %r1, 1;
@%p5 bra $L__BB0_3;
bra.uni $L__BB0_4;
$L__BB0_3:
mov.u32 %r61, %tid.y;
mov.u32 %r62, %ctaid.x;
mov.u32 %r63, %ntid.y;
mad.lo.s32 %r6, %r63, %r62, %r61;
setp.lt.s32 %p6, %r6, 484;
@%p6 bra $L__BB0_8;
bra.uni $L__BB0_4;
$L__BB0_8:
shl.b32 %r79, %r1, 1;
mul.hi.s32 %r80, %r6, 780903145;
shr.u32 %r81, %r80, 31;
shr.s32 %r82, %r80, 2;
add.s32 %r83, %r82, %r81;
mul.lo.s32 %r84, %r43, %r83;
mul.lo.s32 %r85, %r83, 22;
sub.s32 %r86, %r6, %r85;
add.s32 %r87, %r5, %r79;
mad.lo.s32 %r88, %r44, %r86, %r84;
mad.lo.s32 %r89, %r87, %r45, %r88;
mul.wide.s32 %rd15, %r89, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f100, [%rd16];
add.s32 %r90, %r89, %r45;
mul.wide.s32 %rd17, %r90, 4;
add.s64 %rd18, %rd1, %rd17;
ld.global.f32 %f101, [%rd18];
bra.uni $L__BB0_9;
$L__BB0_4:
mov.u32 %r64, %ntid.y;
mov.u32 %r65, %ctaid.x;
mov.u32 %r66, %tid.y;
mad.lo.s32 %r67, %r64, %r65, %r66;
setp.lt.s32 %p8, %r67, 484;
mul.hi.s32 %r68, %r67, 780903145;
shr.u32 %r69, %r68, 31;
shr.s32 %r70, %r68, 2;
add.s32 %r71, %r70, %r69;
mul.lo.s32 %r72, %r43, %r71;
mul.lo.s32 %r73, %r71, 22;
sub.s32 %r74, %r67, %r73;
shl.b32 %r75, %r1, 1;
add.s32 %r7, %r5, %r75;
mad.lo.s32 %r8, %r44, %r74, %r72;
and.pred %p1, %p5, %p8;
mov.f32 %f101, 0fFF800000;
not.pred %p9, %p1;
mov.f32 %f100, %f101;
@%p9 bra $L__BB0_6;
mad.lo.s32 %r76, %r7, %r45, %r8;
mul.wide.s32 %rd11, %r76, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f100, [%rd12];
$L__BB0_6:
@%p9 bra $L__BB0_9;
add.s32 %r77, %r7, 1;
mad.lo.s32 %r78, %r77, %r45, %r8;
mul.wide.s32 %rd13, %r78, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f101, [%rd14];
$L__BB0_9:
setp.gt.f32 %p11, %f100, %f101;
setp.nan.f32 %p12, %f100, %f100;
or.pred %p13, %p12, %p11;
selp.f32 %f23, %f100, %f101, %p13;
mov.u32 %r91, %tid.z;
mov.u32 %r9, %ntid.y;
mov.u32 %r10, %tid.y;
mad.lo.s32 %r11, %r9, %r91, %r10;
mov.u32 %r12, %ntid.x;
mad.lo.s32 %r13, %r11, %r12, %r1;
mul.wide.u32 %rd19, %r13, 4;
mov.u64 %rd20, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_851fa056_191105arrayE;
add.s64 %rd2, %rd20, %rd19;
st.shared.f32 [%rd2], %f23;
bar.sync 0;
clz.b32 %r92, %r12;
mov.u32 %r93, 31;
sub.s32 %r94, %r93, %r92;
mov.u32 %r95, 1;
shl.b32 %r14, %r95, %r94;
setp.lt.u32 %p14, %r1, %r14;
add.s32 %r96, %r14, %r1;
setp.lt.u32 %p15, %r96, %r12;
and.pred %p2, %p14, %p15;
add.s32 %r97, %r13, %r14;
mul.wide.s32 %rd21, %r97, 4;
add.s64 %rd3, %rd20, %rd21;
not.pred %p16, %p2;
@%p16 bra $L__BB0_11;
ld.shared.f32 %f24, [%rd3];
ld.shared.f32 %f25, [%rd2];
setp.nan.f32 %p17, %f25, %f25;
setp.gt.f32 %p18, %f25, %f24;
or.pred %p19, %p17, %p18;
selp.f32 %f26, %f25, %f24, %p19;
st.shared.f32 [%rd2], %f26;
$L__BB0_11:
bar.sync 0;
shr.u32 %r98, %r14, 31;
add.s32 %r99, %r14, %r98;
shr.s32 %r126, %r99, 1;
setp.lt.s32 %p20, %r14, 4;
@%p20 bra $L__BB0_16;
mov.u32 %r125, %r126;
$L__BB0_13:
setp.ge.u32 %p21, %r1, %r125;
@%p21 bra $L__BB0_15;
add.s32 %r100, %r125, %r13;
mul.wide.s32 %rd22, %r100, 4;
add.s64 %rd24, %rd20, %rd22;
ld.shared.f32 %f27, [%rd2];
setp.nan.f32 %p22, %f27, %f27;
ld.shared.f32 %f28, [%rd24];
setp.gt.f32 %p23, %f27, %f28;
or.pred %p24, %p22, %p23;
selp.f32 %f29, %f27, %f28, %p24;
st.shared.f32 [%rd2], %f29;
$L__BB0_15:
bar.sync 0;
shr.u32 %r17, %r125, 1;
setp.gt.u32 %p25, %r125, 3;
mov.u32 %r125, %r17;
@%p25 bra $L__BB0_13;
$L__BB0_16:
add.s32 %r101, %r13, 1;
mul.wide.u32 %rd25, %r101, 4;
add.s64 %rd4, %rd20, %rd25;
mov.f32 %f102, 0fFF800000;
@%p4 bra $L__BB0_19;
ld.shared.f32 %f102, [%rd2];
setp.lt.u32 %p27, %r12, 2;
@%p27 bra $L__BB0_19;
ld.shared.f32 %f31, [%rd4];
setp.gt.f32 %p28, %f102, %f31;
setp.nan.f32 %p29, %f102, %f102;
or.pred %p30, %p29, %p28;
selp.f32 %f102, %f102, %f31, %p30;
$L__BB0_19:
bar.sync 0;
mul.wide.s32 %rd27, %r11, 4;
add.s64 %rd5, %rd20, %rd27;
setp.eq.s32 %p31, %r1, 0;
@%p31 bra $L__BB0_20;
bra.uni $L__BB0_21;
$L__BB0_20:
st.shared.f32 [%rd5], %f102;
$L__BB0_21:
bar.sync 0;
ld.shared.f32 %f11, [%rd5];
bar.sync 0;
@%p5 bra $L__BB0_22;
bra.uni $L__BB0_23;
$L__BB0_22:
mov.u32 %r102, %ctaid.x;
mad.lo.s32 %r18, %r9, %r102, %r10;
setp.lt.s32 %p33, %r18, 484;
@%p33 bra $L__BB0_25;
bra.uni $L__BB0_23;
$L__BB0_25:
sub.f32 %f60, %f100, %f11;
mov.f32 %f61, 0f3F000000;
mov.f32 %f62, 0f3BBB989D;
fma.rn.f32 %f63, %f60, %f62, %f61;
cvt.sat.f32.f32 %f64, %f63;
mov.f32 %f65, 0f4B400001;
mov.f32 %f66, 0f437C0000;
fma.rm.f32 %f67, %f64, %f66, %f65;
add.f32 %f68, %f67, 0fCB40007F;
neg.f32 %f69, %f68;
mov.f32 %f70, 0f3FB8AA3B;
fma.rn.f32 %f71, %f60, %f70, %f69;
mov.f32 %f72, 0f32A57060;
fma.rn.f32 %f73, %f60, %f72, %f71;
mov.b32 %r117, %f67;
shl.b32 %r118, %r117, 23;
mov.b32 %f74, %r118;
ex2.approx.ftz.f32 %f75, %f73;
mul.f32 %f76, %f75, %f74;
add.f32 %f77, %f76, 0f00000000;
mov.b32 %r115, %f76;
sub.f32 %f78, %f101, %f11;
fma.rn.f32 %f79, %f78, %f62, %f61;
cvt.sat.f32.f32 %f80, %f79;
fma.rm.f32 %f81, %f80, %f66, %f65;
add.f32 %f82, %f81, 0fCB40007F;
neg.f32 %f83, %f82;
fma.rn.f32 %f84, %f78, %f70, %f83;
fma.rn.f32 %f85, %f78, %f72, %f84;
mov.b32 %r119, %f81;
shl.b32 %r120, %r119, 23;
mov.b32 %f86, %r120;
ex2.approx.ftz.f32 %f87, %f85;
mul.f32 %f88, %f87, %f86;
add.f32 %f103, %f77, %f88;
mov.b32 %r116, %f88;
add.s32 %r121, %r18, %r1;
shl.b32 %r122, %r121, 1;
mul.wide.s32 %rd32, %r122, 4;
add.s64 %rd31, %rd9, %rd32;
// begin inline asm
st.global.cs.v2.s32 [%rd31], {%r115,%r116};
// end inline asm
bra.uni $L__BB0_26;
$L__BB0_23:
mov.u32 %r103, %ctaid.x;
mad.lo.s32 %r104, %r9, %r103, %r10;
setp.lt.s32 %p35, %r104, 484;
sub.f32 %f33, %f100, %f11;
mov.f32 %f34, 0f3F000000;
mov.f32 %f35, 0f3BBB989D;
fma.rn.f32 %f36, %f33, %f35, %f34;
cvt.sat.f32.f32 %f37, %f36;
mov.f32 %f38, 0f4B400001;
mov.f32 %f39, 0f437C0000;
fma.rm.f32 %f40, %f37, %f39, %f38;
add.f32 %f41, %f40, 0fCB40007F;
neg.f32 %f42, %f41;
mov.f32 %f43, 0f3FB8AA3B;
fma.rn.f32 %f44, %f33, %f43, %f42;
mov.f32 %f45, 0f32A57060;
fma.rn.f32 %f46, %f33, %f45, %f44;
mov.b32 %r105, %f40;
shl.b32 %r106, %r105, 23;
mov.b32 %f47, %r106;
ex2.approx.ftz.f32 %f48, %f46;
mul.f32 %f12, %f48, %f47;
add.f32 %f49, %f12, 0f00000000;
mov.f32 %f103, 0f00000000;
and.pred %p3, %p5, %p35;
sub.f32 %f50, %f101, %f11;
fma.rn.f32 %f51, %f50, %f35, %f34;
cvt.sat.f32.f32 %f52, %f51;
fma.rm.f32 %f53, %f52, %f39, %f38;
add.f32 %f54, %f53, 0fCB40007F;
neg.f32 %f55, %f54;
fma.rn.f32 %f56, %f50, %f43, %f55;
fma.rn.f32 %f57, %f50, %f45, %f56;
mov.b32 %r107, %f53;
shl.b32 %r108, %r107, 23;
mov.b32 %f58, %r108;
ex2.approx.ftz.f32 %f59, %f57;
mul.f32 %f13, %f59, %f58;
add.f32 %f14, %f49, %f13;
not.pred %p36, %p3;
@%p36 bra $L__BB0_26;
mov.b32 %r110, %f13;
add.s32 %r111, %r1, %r10;
mad.lo.s32 %r113, %r9, %r103, %r111;
shl.b32 %r114, %r113, 1;
mul.wide.s32 %rd30, %r114, 4;
add.s64 %rd29, %rd9, %rd30;
mov.b32 %r109, %f12;
// begin inline asm
st.global.cs.v2.s32 [%rd29], {%r109,%r110};
// end inline asm
selp.f32 %f103, %f14, 0f00000000, %p3;
$L__BB0_26:
st.shared.f32 [%rd2], %f103;
bar.sync 0;
@%p16 bra $L__BB0_28;
ld.shared.f32 %f89, [%rd3];
ld.shared.f32 %f90, [%rd2];
add.f32 %f91, %f89, %f90;
st.shared.f32 [%rd2], %f91;
$L__BB0_28:
bar.sync 0;
@%p20 bra $L__BB0_32;
$L__BB0_29:
setp.ge.u32 %p39, %r1, %r126;
@%p39 bra $L__BB0_31;
add.s32 %r123, %r126, %r13;
mul.wide.s32 %rd33, %r123, 4;
add.s64 %rd35, %rd20, %rd33;
ld.shared.f32 %f92, [%rd2];
ld.shared.f32 %f93, [%rd35];
add.f32 %f94, %f93, %f92;
st.shared.f32 [%rd2], %f94;
$L__BB0_31:
bar.sync 0;
shr.u32 %r20, %r126, 1;
setp.gt.u32 %p40, %r126, 3;
mov.u32 %r126, %r20;
@%p40 bra $L__BB0_29;
$L__BB0_32:
mov.f32 %f104, 0f00000000;
@%p4 bra $L__BB0_35;
ld.shared.f32 %f96, [%rd2];
add.f32 %f104, %f96, 0f00000000;
setp.lt.u32 %p42, %r12, 2;
@%p42 bra $L__BB0_35;
ld.shared.f32 %f97, [%rd4];
add.f32 %f104, %f104, %f97;
$L__BB0_35:
bar.sync 0;
mov.u32 %r124, %ctaid.x;
mad.lo.s32 %r21, %r9, %r124, %r10;
setp.gt.s32 %p43, %r21, 483;
@%p43 bra $L__BB0_37;
rcp.rn.f32 %f98, %f104;
cvta.to.global.u64 %rd36, %rd8;
mul.wide.s32 %rd37, %r21, 4;
add.s64 %rd38, %rd36, %rd37;
st.global.f32 [%rd38], %f98;
$L__BB0_37:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_2[32]
)
{
.reg .pred %p<33>;
.reg .f32 %f<70>;
.reg .b32 %r<86>;
.reg .b64 %rd<32>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r34, %r35}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_0+16];
ld.param.v2.u32 {%r36, %r37}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_2];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_1];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1__param_0];
mov.u32 %r1, %tid.x;
setp.ne.s32 %p2, %r1, 0;
@%p2 bra $L__BB0_2;
mov.u32 %r50, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r50;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd8, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r51, [%rd8], %r1;
mov.u32 %r52, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r52, %r3;
setp.gt.s32 %p3, %r4, 483;
mov.f32 %f65, 0fFF800000;
mov.f32 %f66, %f65;
@%p3 bra $L__BB0_4;
cvta.to.global.u64 %rd9, %rd5;
ld.shared.u32 %r53, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_1601111nvfuser_138ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
shl.b32 %r54, %r53, 1;
mul.hi.s32 %r55, %r4, 780903145;
shr.u32 %r56, %r55, 31;
shr.s32 %r57, %r55, 2;
add.s32 %r58, %r57, %r56;
mul.lo.s32 %r59, %r58, 22;
sub.s32 %r60, %r4, %r59;
mul.lo.s32 %r61, %r36, %r60;
mad.lo.s32 %r62, %r35, %r58, %r61;
mad.lo.s32 %r63, %r54, %r37, %r62;
mul.wide.s32 %rd10, %r63, 4;
add.s64 %rd11, %rd9, %rd10;
ld.global.f32 %f65, [%rd11];
add.s32 %r64, %r63, %r37;
mul.wide.s32 %rd12, %r64, 4;
add.s64 %rd13, %rd9, %rd12;
ld.global.f32 %f66, [%rd13];
$L__BB0_4:
setp.gt.f32 %p4, %f65, %f66;
setp.nan.f32 %p5, %f65, %f65;
or.pred %p6, %p5, %p4;
selp.f32 %f16, %f65, %f66, %p6;
mov.u32 %r65, %tid.z;
mad.lo.s32 %r5, %r2, %r65, %r3;
mov.u32 %r6, %ntid.x;
mad.lo.s32 %r7, %r5, %r6, %r1;
mul.wide.u32 %rd14, %r7, 4;
mov.u64 %rd15, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_138_cu_6f4c8ff6_160115arrayE;
add.s64 %rd1, %rd15, %rd14;
st.shared.f32 [%rd1], %f16;
bar.sync 0;
clz.b32 %r66, %r6;
mov.u32 %r67, 31;
sub.s32 %r68, %r67, %r66;
mov.u32 %r69, 1;
shl.b32 %r8, %r69, %r68;
setp.lt.u32 %p7, %r1, %r8;
add.s32 %r70, %r8, %r1;
setp.lt.u32 %p8, %r70, %r6;
and.pred %p1, %p7, %p8;
add.s32 %r71, %r7, %r8;
mul.wide.s32 %rd16, %r71, 4;
add.s64 %rd2, %rd15, %rd16;
not.pred %p9, %p1;
@%p9 bra $L__BB0_6;
ld.shared.f32 %f17, [%rd2];
ld.shared.f32 %f18, [%rd1];
setp.nan.f32 %p10, %f18, %f18;
setp.gt.f32 %p11, %f18, %f17;
or.pred %p12, %p10, %p11;
selp.f32 %f19, %f18, %f17, %p12;
st.shared.f32 [%rd1], %f19;
$L__BB0_6:
bar.sync 0;
shr.u32 %r72, %r8, 31;
add.s32 %r73, %r8, %r72;
shr.s32 %r85, %r73, 1;
setp.lt.s32 %p13, %r8, 4;
@%p13 bra $L__BB0_11;
mov.u32 %r84, %r85;
$L__BB0_8:
setp.ge.u32 %p14, %r1, %r84;
@%p14 bra $L__BB0_10;
add.s32 %r74, %r84, %r7;
mul.wide.s32 %rd17, %r74, 4;
add.s64 %rd19, %rd15, %rd17;
ld.shared.f32 %f20, [%rd1];
setp.nan.f32 %p15, %f20, %f20;
ld.shared.f32 %f21, [%rd19];
setp.gt.f32 %p16, %f20, %f21;
or.pred %p17, %p15, %p16;
selp.f32 %f22, %f20, %f21, %p17;
st.shared.f32 [%rd1], %f22;
$L__BB0_10:
bar.sync 0;
shr.u32 %r11, %r84, 1;
setp.gt.u32 %p18, %r84, 3;
mov.u32 %r84, %r11;
@%p18 bra $L__BB0_8;
$L__BB0_11:
add.s32 %r75, %r7, 1;
mul.wide.u32 %rd20, %r75, 4;
add.s64 %rd3, %rd15, %rd20;
mov.f32 %f67, 0fFF800000;
@%p2 bra $L__BB0_14;
ld.shared.f32 %f67, [%rd1];
setp.lt.u32 %p20, %r6, 2;
@%p20 bra $L__BB0_14;
ld.shared.f32 %f24, [%rd3];
setp.gt.f32 %p21, %f67, %f24;
setp.nan.f32 %p22, %f67, %f67;
or.pred %p23, %p22, %p21;
selp.f32 %f67, %f67, %f24, %p23;
$L__BB0_14:
bar.sync 0;
mul.wide.s32 %rd22, %r5, 4;
add.s64 %rd4, %rd15, %rd22;
setp.eq.s32 %p24, %r1, 0;
@%p24 bra $L__BB0_15;
bra.uni $L__BB0_16;
$L__BB0_15:
st.shared.f32 [%rd4], %f67;
$L__BB0_16:
bar.sync 0;
ld.shared.f32 %f8, [%rd4];
bar.sync 0;
mov.f32 %f68, 0f00000000;
@%p3 bra $L__BB0_18;
sub.f32 %f26, %f65, %f8;
mov.f32 %f27, 0f3F000000;
mov.f32 %f28, 0f3BBB989D;
fma.rn.f32 %f29, %f26, %f28, %f27;
cvt.sat.f32.f32 %f30, %f29;
mov.f32 %f31, 0f4B400001;
mov.f32 %f32, 0f437C0000;
fma.rm.f32 %f33, %f30, %f32, %f31;
add.f32 %f34, %f33, 0fCB40007F;
neg.f32 %f35, %f34;
mov.f32 %f36, 0f3FB8AA3B;
fma.rn.f32 %f37, %f26, %f36, %f35;
mov.f32 %f38, 0f32A57060;
fma.rn.f32 %f39, %f26, %f38, %f37;
mov.b32 %r78, %f33;
shl.b32 %r79, %r78, 23;
mov.b32 %f40, %r79;
ex2.approx.ftz.f32 %f41, %f39;
mul.f32 %f42, %f41, %f40;
add.f32 %f43, %f42, 0f00000000;
mov.b32 %r76, %f42;
sub.f32 %f44, %f66, %f8;
fma.rn.f32 %f45, %f44, %f28, %f27;
cvt.sat.f32.f32 %f46, %f45;
fma.rm.f32 %f47, %f46, %f32, %f31;
add.f32 %f48, %f47, 0fCB40007F;
neg.f32 %f49, %f48;
fma.rn.f32 %f50, %f44, %f36, %f49;
fma.rn.f32 %f51, %f44, %f38, %f50;
mov.b32 %r80, %f47;
shl.b32 %r81, %r80, 23;
mov.b32 %f52, %r81;
ex2.approx.ftz.f32 %f53, %f51;
mul.f32 %f54, %f53, %f52;
add.f32 %f68, %f43, %f54;
mov.b32 %r77, %f54;
shl.b32 %r82, %r4, 1;
mul.wide.s32 %rd25, %r82, 4;
add.s64 %rd24, %rd7, %rd25;
// begin inline asm
st.global.cs.v2.s32 [%rd24], {%r76,%r77};
// end inline asm
$L__BB0_18:
st.shared.f32 [%rd1], %f68;
bar.sync 0;
@%p9 bra $L__BB0_20;
ld.shared.f32 %f55, [%rd2];
ld.shared.f32 %f56, [%rd1];
add.f32 %f57, %f55, %f56;
st.shared.f32 [%rd1], %f57;
$L__BB0_20:
bar.sync 0;
@%p13 bra $L__BB0_24;
$L__BB0_21:
setp.ge.u32 %p28, %r1, %r85;
@%p28 bra $L__BB0_23;
add.s32 %r83, %r85, %r7;
mul.wide.s32 %rd26, %r83, 4;
add.s64 %rd28, %rd15, %rd26;
ld.shared.f32 %f58, [%rd1];
ld.shared.f32 %f59, [%rd28];
add.f32 %f60, %f59, %f58;
st.shared.f32 [%rd1], %f60;
$L__BB0_23:
bar.sync 0;
shr.u32 %r13, %r85, 1;
setp.gt.u32 %p29, %r85, 3;
mov.u32 %r85, %r13;
@%p29 bra $L__BB0_21;
$L__BB0_24:
mov.f32 %f69, 0f00000000;
@%p2 bra $L__BB0_27;
ld.shared.f32 %f62, [%rd1];
add.f32 %f69, %f62, 0f00000000;
setp.lt.u32 %p31, %r6, 2;
@%p31 bra $L__BB0_27;
ld.shared.f32 %f63, [%rd3];
add.f32 %f69, %f69, %f63;
$L__BB0_27:
bar.sync 0;
@%p3 bra $L__BB0_29;
rcp.rn.f32 %f64, %f69;
cvta.to.global.u64 %rd29, %rd6;
mul.wide.s32 %rd30, %r4, 4;
add.s64 %rd31, %rd29, %rd30;
st.global.f32 [%rd31], %f64;
$L__BB0_29:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,379 +20,265 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_1[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_2[32]
)
{
- .reg .pred %p<44>;
- .reg .f32 %f<105>;
- .reg .b32 %r<127>;
- .reg .b64 %rd<39>;
+ .reg .pred %p<33>;
+ .reg .f32 %f<70>;
+ .reg .b32 %r<86>;
+ .reg .b64 %rd<32>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
- ld.param.v2.u32 {%r42, %r43}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+16];
- ld.param.v2.u32 {%r44, %r45}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
- ld.param.u64 %rd9, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_2];
- ld.param.u64 %rd8, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_1];
- ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0];
- cvta.to.global.u64 %rd1, %rd7;
+ ld.param.v2.u32 {%r34, %r35}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+16];
+ ld.param.v2.u32 {%r36, %r37}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
+ ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_2];
+ ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_1];
+ ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0];
mov.u32 %r1, %tid.x;
- setp.ne.s32 %p4, %r1, 0;
- @%p4 bra $L__BB0_2;
-
- mov.u32 %r58, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r58;
+ setp.ne.s32 %p2, %r1, 0;
+ @%p2 bra $L__BB0_2;
+
+ mov.u32 %r50, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r50;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd10, _ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
- atom.shared.min.s32 %r59, [%rd10], %r1;
- ld.shared.u32 %r60, [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
- shl.b32 %r5, %r60, 1;
- setp.lt.s32 %p5, %r1, 1;
- @%p5 bra $L__BB0_3;
- bra.uni $L__BB0_4;
-
-$L__BB0_3:
- mov.u32 %r61, %tid.y;
- mov.u32 %r62, %ctaid.x;
- mov.u32 %r63, %ntid.y;
- mad.lo.s32 %r6, %r63, %r62, %r61;
- setp.lt.s32 %p6, %r6, 484;
- @%p6 bra $L__BB0_8;
- bra.uni $L__BB0_4;
+ mov.u64 %rd8, _ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
+ atom.shared.min.s32 %r51, [%rd8], %r1;
+ mov.u32 %r52, %ctaid.x;
+ mov.u32 %r2, %ntid.y;
+ mov.u32 %r3, %tid.y;
+ mad.lo.s32 %r4, %r2, %r52, %r3;
+ setp.gt.s32 %p3, %r4, 483;
+ mov.f32 %f65, 0fFF800000;
+ mov.f32 %f66, %f65;
+ @%p3 bra $L__BB0_4;
+
+ cvta.to.global.u64 %rd9, %rd5;
+ ld.shared.u32 %r53, [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
+ shl.b32 %r54, %r53, 1;
+ mul.hi.s32 %r55, %r4, 780903145;
+ shr.u32 %r56, %r55, 31;
+ shr.s32 %r57, %r55, 2;
+ add.s32 %r58, %r57, %r56;
+ mul.lo.s32 %r59, %r58, 22;
+ sub.s32 %r60, %r4, %r59;
+ mul.lo.s32 %r61, %r36, %r60;
+ mad.lo.s32 %r62, %r35, %r58, %r61;
+ mad.lo.s32 %r63, %r54, %r37, %r62;
+ mul.wide.s32 %rd10, %r63, 4;
+ add.s64 %rd11, %rd9, %rd10;
+ ld.global.f32 %f65, [%rd11];
+ add.s32 %r64, %r63, %r37;
+ mul.wide.s32 %rd12, %r64, 4;
+ add.s64 %rd13, %rd9, %rd12;
+ ld.global.f32 %f66, [%rd13];
+
+$L__BB0_4:
+ setp.gt.f32 %p4, %f65, %f66;
+ setp.nan.f32 %p5, %f65, %f65;
+ or.pred %p6, %p5, %p4;
+ selp.f32 %f16, %f65, %f66, %p6;
+ mov.u32 %r65, %tid.z;
+ mad.lo.s32 %r5, %r2, %r65, %r3;
+ mov.u32 %r6, %ntid.x;
+ mad.lo.s32 %r7, %r5, %r6, %r1;
+ mul.wide.u32 %rd14, %r7, 4;
+ mov.u64 %rd15, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd15, %rd14;
+ st.shared.f32 [%rd1], %f16;
+ bar.sync 0;
+ clz.b32 %r66, %r6;
+ mov.u32 %r67, 31;
+ sub.s32 %r68, %r67, %r66;
+ mov.u32 %r69, 1;
+ shl.b32 %r8, %r69, %r68;
+ setp.lt.u32 %p7, %r1, %r8;
+ add.s32 %r70, %r8, %r1;
+ setp.lt.u32 %p8, %r70, %r6;
+ and.pred %p1, %p7, %p8;
+ add.s32 %r71, %r7, %r8;
+ mul.wide.s32 %rd16, %r71, 4;
+ add.s64 %rd2, %rd15, %rd16;
+ not.pred %p9, %p1;
+ @%p9 bra $L__BB0_6;
+
+ ld.shared.f32 %f17, [%rd2];
+ ld.shared.f32 %f18, [%rd1];
+ setp.nan.f32 %p10, %f18, %f18;
+ setp.gt.f32 %p11, %f18, %f17;
+ or.pred %p12, %p10, %p11;
+ selp.f32 %f19, %f18, %f17, %p12;
+ st.shared.f32 [%rd1], %f19;
+
+$L__BB0_6:
+ bar.sync 0;
+ shr.u32 %r72, %r8, 31;
+ add.s32 %r73, %r8, %r72;
+ shr.s32 %r85, %r73, 1;
+ setp.lt.s32 %p13, %r8, 4;
+ @%p13 bra $L__BB0_11;
+
+ mov.u32 %r84, %r85;
$L__BB0_8:
- shl.b32 %r79, %r1, 1;
- mul.hi.s32 %r80, %r6, 780903145;
- shr.u32 %r81, %r80, 31;
- shr.s32 %r82, %r80, 2;
- add.s32 %r83, %r82, %r81;
- mul.lo.s32 %r84, %r43, %r83;
- mul.lo.s32 %r85, %r83, 22;
- sub.s32 %r86, %r6, %r85;
- add.s32 %r87, %r5, %r79;
- mad.lo.s32 %r88, %r44, %r86, %r84;
- mad.lo.s32 %r89, %r87, %r45, %r88;
- mul.wide.s32 %rd15, %r89, 4;
- add.s64 %rd16, %rd1, %rd15;
- ld.global.f32 %f100, [%rd16];
- add.s32 %r90, %r89, %r45;
- mul.wide.s32 %rd17, %r90, 4;
- add.s64 %rd18, %rd1, %rd17;
- ld.global.f32 %f101, [%rd18];
- bra.uni $L__BB0_9;
-
-$L__BB0_4:
- mov.u32 %r64, %ntid.y;
- mov.u32 %r65, %ctaid.x;
- mov.u32 %r66, %tid.y;
- mad.lo.s32 %r67, %r64, %r65, %r66;
- setp.lt.s32 %p8, %r67, 484;
- mul.hi.s32 %r68, %r67, 780903145;
- shr.u32 %r69, %r68, 31;
- shr.s32 %r70, %r68, 2;
- add.s32 %r71, %r70, %r69;
- mul.lo.s32 %r72, %r43, %r71;
- mul.lo.s32 %r73, %r71, 22;
- sub.s32 %r74, %r67, %r73;
- shl.b32 %r75, %r1, 1;
- add.s32 %r7, %r5, %r75;
- mad.lo.s32 %r8, %r44, %r74, %r72;
- and.pred %p1, %p5, %p8;
- mov.f32 %f101, 0fFF800000;
- not.pred %p9, %p1;
- mov.f32 %f100, %f101;
- @%p9 bra $L__BB0_6;
-
- mad.lo.s32 %r76, %r7, %r45, %r8;
- mul.wide.s32 %rd11, %r76, 4;
- add.s64 %rd12, %rd1, %rd11;
- ld.global.f32 %f100, [%rd12];
-
-$L__BB0_6:
- @%p9 bra $L__BB0_9;
-
- add.s32 %r77, %r7, 1;
- mad.lo.s32 %r78, %r77, %r45, %r8;
- mul.wide.s32 %rd13, %r78, 4;
- add.s64 %rd14, %rd1, %rd13;
- ld.global.f32 %f101, [%rd14];
-
-$L__BB0_9:
- setp.gt.f32 %p11, %f100, %f101;
- setp.nan.f32 %p12, %f100, %f100;
- or.pred %p13, %p12, %p11;
- selp.f32 %f23, %f100, %f101, %p13;
- mov.u32 %r91, %tid.z;
- mov.u32 %r9, %ntid.y;
- mov.u32 %r10, %tid.y;
- mad.lo.s32 %r11, %r9, %r91, %r10;
- mov.u32 %r12, %ntid.x;
- mad.lo.s32 %r13, %r11, %r12, %r1;
- mul.wide.u32 %rd19, %r13, 4;
- mov.u64 %rd20, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd20, %rd19;
- st.shared.f32 [%rd2], %f23;
- bar.sync 0;
- clz.b32 %r92, %r12;
- mov.u32 %r93, 31;
- sub.s32 %r94, %r93, %r92;
- mov.u32 %r95, 1;
- shl.b32 %r14, %r95, %r94;
- setp.lt.u32 %p14, %r1, %r14;
- add.s32 %r96, %r14, %r1;
- setp.lt.u32 %p15, %r96, %r12;
- and.pred %p2, %p14, %p15;
- add.s32 %r97, %r13, %r14;
- mul.wide.s32 %rd21, %r97, 4;
- add.s64 %rd3, %rd20, %rd21;
- not.pred %p16, %p2;
- @%p16 bra $L__BB0_11;
+ setp.ge.u32 %p14, %r1, %r84;
+ @%p14 bra $L__BB0_10;
+
+ add.s32 %r74, %r84, %r7;
+ mul.wide.s32 %rd17, %r74, 4;
+ add.s64 %rd19, %rd15, %rd17;
+ ld.shared.f32 %f20, [%rd1];
+ setp.nan.f32 %p15, %f20, %f20;
+ ld.shared.f32 %f21, [%rd19];
+ setp.gt.f32 %p16, %f20, %f21;
+ or.pred %p17, %p15, %p16;
+ selp.f32 %f22, %f20, %f21, %p17;
+ st.shared.f32 [%rd1], %f22;
+
+$L__BB0_10:
+ bar.sync 0;
+ shr.u32 %r11, %r84, 1;
+ setp.gt.u32 %p18, %r84, 3;
+ mov.u32 %r84, %r11;
+ @%p18 bra $L__BB0_8;
+
+$L__BB0_11:
+ add.s32 %r75, %r7, 1;
+ mul.wide.u32 %rd20, %r75, 4;
+ add.s64 %rd3, %rd15, %rd20;
+ mov.f32 %f67, 0fFF800000;
+ @%p2 bra $L__BB0_14;
+
+ ld.shared.f32 %f67, [%rd1];
+ setp.lt.u32 %p20, %r6, 2;
+ @%p20 bra $L__BB0_14;
ld.shared.f32 %f24, [%rd3];
- ld.shared.f32 %f25, [%rd2];
- setp.nan.f32 %p17, %f25, %f25;
- setp.gt.f32 %p18, %f25, %f24;
- or.pred %p19, %p17, %p18;
- selp.f32 %f26, %f25, %f24, %p19;
- st.shared.f32 [%rd2], %f26;
-
-$L__BB0_11:
- bar.sync 0;
- shr.u32 %r98, %r14, 31;
- add.s32 %r99, %r14, %r98;
- shr.s32 %r126, %r99, 1;
- setp.lt.s32 %p20, %r14, 4;
- @%p20 bra $L__BB0_16;
-
- mov.u32 %r125, %r126;
-
-$L__BB0_13:
- setp.ge.u32 %p21, %r1, %r125;
- @%p21 bra $L__BB0_15;
-
- add.s32 %r100, %r125, %r13;
- mul.wide.s32 %rd22, %r100, 4;
- add.s64 %rd24, %rd20, %rd22;
- ld.shared.f32 %f27, [%rd2];
- setp.nan.f32 %p22, %f27, %f27;
- ld.shared.f32 %f28, [%rd24];
- setp.gt.f32 %p23, %f27, %f28;
- or.pred %p24, %p22, %p23;
- selp.f32 %f29, %f27, %f28, %p24;
- st.shared.f32 [%rd2], %f29;
+ setp.gt.f32 %p21, %f67, %f24;
+ setp.nan.f32 %p22, %f67, %f67;
+ or.pred %p23, %p22, %p21;
+ selp.f32 %f67, %f67, %f24, %p23;
+
+$L__BB0_14:
+ bar.sync 0;
+ mul.wide.s32 %rd22, %r5, 4;
+ add.s64 %rd4, %rd15, %rd22;
+ setp.eq.s32 %p24, %r1, 0;
+ @%p24 bra $L__BB0_15;
+ bra.uni $L__BB0_16;
$L__BB0_15:
- bar.sync 0;
- shr.u32 %r17, %r125, 1;
- setp.gt.u32 %p25, %r125, 3;
- mov.u32 %r125, %r17;
- @%p25 bra $L__BB0_13;
+ st.shared.f32 [%rd4], %f67;
$L__BB0_16:
- add.s32 %r101, %r13, 1;
- mul.wide.u32 %rd25, %r101, 4;
- add.s64 %rd4, %rd20, %rd25;
- mov.f32 %f102, 0fFF800000;
- @%p4 bra $L__BB0_19;
-
- ld.shared.f32 %f102, [%rd2];
- setp.lt.u32 %p27, %r12, 2;
- @%p27 bra $L__BB0_19;
-
- ld.shared.f32 %f31, [%rd4];
- setp.gt.f32 %p28, %f102, %f31;
- setp.nan.f32 %p29, %f102, %f102;
- or.pred %p30, %p29, %p28;
- selp.f32 %f102, %f102, %f31, %p30;
-
-$L__BB0_19:
- bar.sync 0;
- mul.wide.s32 %rd27, %r11, 4;
- add.s64 %rd5, %rd20, %rd27;
- setp.eq.s32 %p31, %r1, 0;
- @%p31 bra $L__BB0_20;
- bra.uni $L__BB0_21;
+ bar.sync 0;
+ ld.shared.f32 %f8, [%rd4];
+ bar.sync 0;
+ mov.f32 %f68, 0f00000000;
+ @%p3 bra $L__BB0_18;
+
+ sub.f32 %f26, %f65, %f8;
+ mov.f32 %f27, 0f3F000000;
+ mov.f32 %f28, 0f3BBB989D;
+ fma.rn.f32 %f29, %f26, %f28, %f27;
+ cvt.sat.f32.f32 %f30, %f29;
+ mov.f32 %f31, 0f4B400001;
+ mov.f32 %f32, 0f437C0000;
+ fma.rm.f32 %f33, %f30, %f32, %f31;
+ add.f32 %f34, %f33, 0fCB40007F;
+ neg.f32 %f35, %f34;
+ mov.f32 %f36, 0f3FB8AA3B;
+ fma.rn.f32 %f37, %f26, %f36, %f35;
+ mov.f32 %f38, 0f32A57060;
+ fma.rn.f32 %f39, %f26, %f38, %f37;
+ mov.b32 %r78, %f33;
+ shl.b32 %r79, %r78, 23;
+ mov.b32 %f40, %r79;
+ ex2.approx.ftz.f32 %f41, %f39;
+ mul.f32 %f42, %f41, %f40;
+ add.f32 %f43, %f42, 0f00000000;
+ mov.b32 %r76, %f42;
+ sub.f32 %f44, %f66, %f8;
+ fma.rn.f32 %f45, %f44, %f28, %f27;
+ cvt.sat.f32.f32 %f46, %f45;
+ fma.rm.f32 %f47, %f46, %f32, %f31;
+ add.f32 %f48, %f47, 0fCB40007F;
+ neg.f32 %f49, %f48;
+ fma.rn.f32 %f50, %f44, %f36, %f49;
+ fma.rn.f32 %f51, %f44, %f38, %f50;
+ mov.b32 %r80, %f47;
+ shl.b32 %r81, %r80, 23;
+ mov.b32 %f52, %r81;
+ ex2.approx.ftz.f32 %f53, %f51;
+ mul.f32 %f54, %f53, %f52;
+ add.f32 %f68, %f43, %f54;
+ mov.b32 %r77, %f54;
+ shl.b32 %r82, %r4, 1;
+ mul.wide.s32 %rd25, %r82, 4;
+ add.s64 %rd24, %rd7, %rd25;
+
+ st.global.cs.v2.s32 [%rd24], {%r76,%r77};
+
+
+$L__BB0_18:
+ st.shared.f32 [%rd1], %f68;
+ bar.sync 0;
+ @%p9 bra $L__BB0_20;
+
+ ld.shared.f32 %f55, [%rd2];
+ ld.shared.f32 %f56, [%rd1];
+ add.f32 %f57, %f55, %f56;
+ st.shared.f32 [%rd1], %f57;
$L__BB0_20:
- st.shared.f32 [%rd5], %f102;
+ bar.sync 0;
+ @%p13 bra $L__BB0_24;
$L__BB0_21:
- bar.sync 0;
- ld.shared.f32 %f11, [%rd5];
- bar.sync 0;
- @%p5 bra $L__BB0_22;
- bra.uni $L__BB0_23;
-
-$L__BB0_22:
- mov.u32 %r102, %ctaid.x;
- mad.lo.s32 %r18, %r9, %r102, %r10;
- setp.lt.s32 %p33, %r18, 484;
- @%p33 bra $L__BB0_25;
- bra.uni $L__BB0_23;
-
-$L__BB0_25:
- sub.f32 %f60, %f100, %f11;
- mov.f32 %f61, 0f3F000000;
- mov.f32 %f62, 0f3BBB989D;
- fma.rn.f32 %f63, %f60, %f62, %f61;
- cvt.sat.f32.f32 %f64, %f63;
- mov.f32 %f65, 0f4B400001;
- mov.f32 %f66, 0f437C0000;
- fma.rm.f32 %f67, %f64, %f66, %f65;
- add.f32 %f68, %f67, 0fCB40007F;
- neg.f32 %f69, %f68;
- mov.f32 %f70, 0f3FB8AA3B;
- fma.rn.f32 %f71, %f60, %f70, %f69;
- mov.f32 %f72, 0f32A57060;
- fma.rn.f32 %f73, %f60, %f72, %f71;
- mov.b32 %r117, %f67;
- shl.b32 %r118, %r117, 23;
- mov.b32 %f74, %r118;
- ex2.approx.ftz.f32 %f75, %f73;
- mul.f32 %f76, %f75, %f74;
- add.f32 %f77, %f76, 0f00000000;
- mov.b32 %r115, %f76;
- sub.f32 %f78, %f101, %f11;
- fma.rn.f32 %f79, %f78, %f62, %f61;
- cvt.sat.f32.f32 %f80, %f79;
- fma.rm.f32 %f81, %f80, %f66, %f65;
- add.f32 %f82, %f81, 0fCB40007F;
- neg.f32 %f83, %f82;
- fma.rn.f32 %f84, %f78, %f70, %f83;
- fma.rn.f32 %f85, %f78, %f72, %f84;
- mov.b32 %r119, %f81;
- shl.b32 %r120, %r119, 23;
- mov.b32 %f86, %r120;
- ex2.approx.ftz.f32 %f87, %f85;
- mul.f32 %f88, %f87, %f86;
- add.f32 %f103, %f77, %f88;
- mov.b32 %r116, %f88;
- add.s32 %r121, %r18, %r1;
- shl.b32 %r122, %r121, 1;
- mul.wide.s32 %rd32, %r122, 4;
- add.s64 %rd31, %rd9, %rd32;
-
- st.global.cs.v2.s32 [%rd31], {%r115,%r116};
-
- bra.uni $L__BB0_26;
+ setp.ge.u32 %p28, %r1, %r85;
+ @%p28 bra $L__BB0_23;
+
+ add.s32 %r83, %r85, %r7;
+ mul.wide.s32 %rd26, %r83, 4;
+ add.s64 %rd28, %rd15, %rd26;
+ ld.shared.f32 %f58, [%rd1];
+ ld.shared.f32 %f59, [%rd28];
+ add.f32 %f60, %f59, %f58;
+ st.shared.f32 [%rd1], %f60;
$L__BB0_23:
- mov.u32 %r103, %ctaid.x;
- mad.lo.s32 %r104, %r9, %r103, %r10;
- setp.lt.s32 %p35, %r104, 484;
- sub.f32 %f33, %f100, %f11;
- mov.f32 %f34, 0f3F000000;
- mov.f32 %f35, 0f3BBB989D;
- fma.rn.f32 %f36, %f33, %f35, %f34;
- cvt.sat.f32.f32 %f37, %f36;
- mov.f32 %f38, 0f4B400001;
- mov.f32 %f39, 0f437C0000;
- fma.rm.f32 %f40, %f37, %f39, %f38;
- add.f32 %f41, %f40, 0fCB40007F;
- neg.f32 %f42, %f41;
- mov.f32 %f43, 0f3FB8AA3B;
- fma.rn.f32 %f44, %f33, %f43, %f42;
- mov.f32 %f45, 0f32A57060;
- fma.rn.f32 %f46, %f33, %f45, %f44;
- mov.b32 %r105, %f40;
- shl.b32 %r106, %r105, 23;
- mov.b32 %f47, %r106;
- ex2.approx.ftz.f32 %f48, %f46;
- mul.f32 %f12, %f48, %f47;
- add.f32 %f49, %f12, 0f00000000;
- mov.f32 %f103, 0f00000000;
- and.pred %p3, %p5, %p35;
- sub.f32 %f50, %f101, %f11;
- fma.rn.f32 %f51, %f50, %f35, %f34;
- cvt.sat.f32.f32 %f52, %f51;
- fma.rm.f32 %f53, %f52, %f39, %f38;
- add.f32 %f54, %f53, 0fCB40007F;
- neg.f32 %f55, %f54;
- fma.rn.f32 %f56, %f50, %f43, %f55;
- fma.rn.f32 %f57, %f50, %f45, %f56;
- mov.b32 %r107, %f53;
- shl.b32 %r108, %r107, 23;
- mov.b32 %f58, %r108;
- ex2.approx.ftz.f32 %f59, %f57;
- mul.f32 %f13, %f59, %f58;
- add.f32 %f14, %f49, %f13;
- not.pred %p36, %p3;
- @%p36 bra $L__BB0_26;
-
- mov.b32 %r110, %f13;
- add.s32 %r111, %r1, %r10;
- mad.lo.s32 %r113, %r9, %r103, %r111;
- shl.b32 %r114, %r113, 1;
- mul.wide.s32 %rd30, %r114, 4;
- add.s64 %rd29, %rd9, %rd30;
- mov.b32 %r109, %f12;
-
- st.global.cs.v2.s32 [%rd29], {%r109,%r110};
-
- selp.f32 %f103, %f14, 0f00000000, %p3;
-
-$L__BB0_26:
- st.shared.f32 [%rd2], %f103;
- bar.sync 0;
- @%p16 bra $L__BB0_28;
-
- ld.shared.f32 %f89, [%rd3];
- ld.shared.f32 %f90, [%rd2];
- add.f32 %f91, %f89, %f90;
- st.shared.f32 [%rd2], %f91;
-
-$L__BB0_28:
- bar.sync 0;
- @%p20 bra $L__BB0_32;
+ bar.sync 0;
+ shr.u32 %r13, %r85, 1;
+ setp.gt.u32 %p29, %r85, 3;
+ mov.u32 %r85, %r13;
+ @%p29 bra $L__BB0_21;
+
+$L__BB0_24:
+ mov.f32 %f69, 0f00000000;
+ @%p2 bra $L__BB0_27;
+
+ ld.shared.f32 %f62, [%rd1];
+ add.f32 %f69, %f62, 0f00000000;
+ setp.lt.u32 %p31, %r6, 2;
+ @%p31 bra $L__BB0_27;
+
+ ld.shared.f32 %f63, [%rd3];
+ add.f32 %f69, %f69, %f63;
+
+$L__BB0_27:
+ bar.sync 0;
+ @%p3 bra $L__BB0_29;
+
+ rcp.rn.f32 %f64, %f69;
+ cvta.to.global.u64 %rd29, %rd6;
+ mul.wide.s32 %rd30, %r4, 4;
+ add.s64 %rd31, %rd29, %rd30;
+ st.global.f32 [%rd31], %f64;
$L__BB0_29:
- setp.ge.u32 %p39, %r1, %r126;
- @%p39 bra $L__BB0_31;
-
- add.s32 %r123, %r126, %r13;
- mul.wide.s32 %rd33, %r123, 4;
- add.s64 %rd35, %rd20, %rd33;
- ld.shared.f32 %f92, [%rd2];
- ld.shared.f32 %f93, [%rd35];
- add.f32 %f94, %f93, %f92;
- st.shared.f32 [%rd2], %f94;
-
-$L__BB0_31:
- bar.sync 0;
- shr.u32 %r20, %r126, 1;
- setp.gt.u32 %p40, %r126, 3;
- mov.u32 %r126, %r20;
- @%p40 bra $L__BB0_29;
-
-$L__BB0_32:
- mov.f32 %f104, 0f00000000;
- @%p4 bra $L__BB0_35;
-
- ld.shared.f32 %f96, [%rd2];
- add.f32 %f104, %f96, 0f00000000;
- setp.lt.u32 %p42, %r12, 2;
- @%p42 bra $L__BB0_35;
-
- ld.shared.f32 %f97, [%rd4];
- add.f32 %f104, %f104, %f97;
-
-$L__BB0_35:
- bar.sync 0;
- mov.u32 %r124, %ctaid.x;
- mad.lo.s32 %r21, %r9, %r124, %r10;
- setp.gt.s32 %p43, %r21, 483;
- @%p43 bra $L__BB0_37;
-
- rcp.rn.f32 %f98, %f104;
- cvta.to.global.u64 %rd36, %rd8;
- mul.wide.s32 %rd37, %r21, 4;
- add.s64 %rd38, %rd36, %rd37;
- st.global.f32 [%rd38], %f98;
-
-$L__BB0_37:
ret;
}
Kernel 140
CUDA
PTX
53997da5d
Diff
03a1b695e
-9
+9 index type: int
registers: 15
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 3, 3> T8, Tensor<float, 3, 3> T15) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T13;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0]
= T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22))) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
T13[i0]
= T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22))) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T17;
T17[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T17[0] = fmax(
T17[0],
T13[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T17[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
T18[0]
= T18[0]
+ T5[0];
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
} else {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
T18[0]
= T18[0]
+ T5[0];
}
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
}
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T18[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T14;
T14[0]
= reciprocal(T7[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T8[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T14[0];
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 3, 3> T8, Tensor<float, 3, 3> T15) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T13;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0]
= T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T13[i0]
= T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T17;
T17[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T17[0] = fmax(
T17[0],
T13[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T17[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
T18[0]
= T18[0]
+ T5[0];
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
} else {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T18[0]
= T18[0]
+ T5[0];
}
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
}
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T18[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T14;
T14[0]
= reciprocal(T7[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T8[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T14[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -6,23 +6,23 @@
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T13[i0]
- = T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22))) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T13[i0]
- = T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22))) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 22)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 22))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T17;
@@ -39,11 +39,11 @@
blockReduce<true, false, false, true>(T2[0], T17[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
@@ -57,11 +57,11 @@
+ T5[0];
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
} else {
Array<float, 2, 2> T16;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
@@ -69,21 +69,21 @@
= T13[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
T18[0]
= T18[0]
+ T5[0];
}
T16[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484))) {
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 484)) {
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T15[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T16[0]);
}
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T18[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_2[32]
)
{
.reg .pred %p<44>;
.reg .f32 %f<105>;
.reg .b32 %r<127>;
.reg .b64 %rd<39>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r42, %r43}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_0+16];
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
ld.param.u64 %rd9, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_2];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_1];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_0];
cvta.to.global.u64 %rd1, %rd7;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p4, %r1, 0;
@%p4 bra $L__BB0_2;
mov.u32 %r58, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r58;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd10, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r59, [%rd10], %r1;
ld.shared.u32 %r60, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_1911011nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
shl.b32 %r5, %r60, 1;
setp.lt.s32 %p5, %r1, 1;
@%p5 bra $L__BB0_3;
bra.uni $L__BB0_4;
$L__BB0_3:
mov.u32 %r61, %tid.y;
mov.u32 %r62, %ctaid.x;
mov.u32 %r63, %ntid.y;
mad.lo.s32 %r6, %r63, %r62, %r61;
setp.lt.s32 %p6, %r6, 484;
@%p6 bra $L__BB0_8;
bra.uni $L__BB0_4;
$L__BB0_8:
shl.b32 %r79, %r1, 1;
mul.hi.s32 %r80, %r6, 780903145;
shr.u32 %r81, %r80, 31;
shr.s32 %r82, %r80, 2;
add.s32 %r83, %r82, %r81;
mul.lo.s32 %r84, %r43, %r83;
mul.lo.s32 %r85, %r83, 22;
sub.s32 %r86, %r6, %r85;
add.s32 %r87, %r5, %r79;
mad.lo.s32 %r88, %r44, %r86, %r84;
mad.lo.s32 %r89, %r87, %r45, %r88;
mul.wide.s32 %rd15, %r89, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f100, [%rd16];
add.s32 %r90, %r89, %r45;
mul.wide.s32 %rd17, %r90, 4;
add.s64 %rd18, %rd1, %rd17;
ld.global.f32 %f101, [%rd18];
bra.uni $L__BB0_9;
$L__BB0_4:
mov.u32 %r64, %ntid.y;
mov.u32 %r65, %ctaid.x;
mov.u32 %r66, %tid.y;
mad.lo.s32 %r67, %r64, %r65, %r66;
setp.lt.s32 %p8, %r67, 484;
mul.hi.s32 %r68, %r67, 780903145;
shr.u32 %r69, %r68, 31;
shr.s32 %r70, %r68, 2;
add.s32 %r71, %r70, %r69;
mul.lo.s32 %r72, %r43, %r71;
mul.lo.s32 %r73, %r71, 22;
sub.s32 %r74, %r67, %r73;
shl.b32 %r75, %r1, 1;
add.s32 %r7, %r5, %r75;
mad.lo.s32 %r8, %r44, %r74, %r72;
and.pred %p1, %p5, %p8;
mov.f32 %f101, 0fFF800000;
not.pred %p9, %p1;
mov.f32 %f100, %f101;
@%p9 bra $L__BB0_6;
mad.lo.s32 %r76, %r7, %r45, %r8;
mul.wide.s32 %rd11, %r76, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f100, [%rd12];
$L__BB0_6:
@%p9 bra $L__BB0_9;
add.s32 %r77, %r7, 1;
mad.lo.s32 %r78, %r77, %r45, %r8;
mul.wide.s32 %rd13, %r78, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f101, [%rd14];
$L__BB0_9:
setp.gt.f32 %p11, %f100, %f101;
setp.nan.f32 %p12, %f100, %f100;
or.pred %p13, %p12, %p11;
selp.f32 %f23, %f100, %f101, %p13;
mov.u32 %r91, %tid.z;
mov.u32 %r9, %ntid.y;
mov.u32 %r10, %tid.y;
mad.lo.s32 %r11, %r9, %r91, %r10;
mov.u32 %r12, %ntid.x;
mad.lo.s32 %r13, %r11, %r12, %r1;
mul.wide.u32 %rd19, %r13, 4;
mov.u64 %rd20, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_cf86d722_191105arrayE;
add.s64 %rd2, %rd20, %rd19;
st.shared.f32 [%rd2], %f23;
bar.sync 0;
clz.b32 %r92, %r12;
mov.u32 %r93, 31;
sub.s32 %r94, %r93, %r92;
mov.u32 %r95, 1;
shl.b32 %r14, %r95, %r94;
setp.lt.u32 %p14, %r1, %r14;
add.s32 %r96, %r14, %r1;
setp.lt.u32 %p15, %r96, %r12;
and.pred %p2, %p14, %p15;
add.s32 %r97, %r13, %r14;
mul.wide.s32 %rd21, %r97, 4;
add.s64 %rd3, %rd20, %rd21;
not.pred %p16, %p2;
@%p16 bra $L__BB0_11;
ld.shared.f32 %f24, [%rd3];
ld.shared.f32 %f25, [%rd2];
setp.nan.f32 %p17, %f25, %f25;
setp.gt.f32 %p18, %f25, %f24;
or.pred %p19, %p17, %p18;
selp.f32 %f26, %f25, %f24, %p19;
st.shared.f32 [%rd2], %f26;
$L__BB0_11:
bar.sync 0;
shr.u32 %r98, %r14, 31;
add.s32 %r99, %r14, %r98;
shr.s32 %r126, %r99, 1;
setp.lt.s32 %p20, %r14, 4;
@%p20 bra $L__BB0_16;
mov.u32 %r125, %r126;
$L__BB0_13:
setp.ge.u32 %p21, %r1, %r125;
@%p21 bra $L__BB0_15;
add.s32 %r100, %r125, %r13;
mul.wide.s32 %rd22, %r100, 4;
add.s64 %rd24, %rd20, %rd22;
ld.shared.f32 %f27, [%rd2];
setp.nan.f32 %p22, %f27, %f27;
ld.shared.f32 %f28, [%rd24];
setp.gt.f32 %p23, %f27, %f28;
or.pred %p24, %p22, %p23;
selp.f32 %f29, %f27, %f28, %p24;
st.shared.f32 [%rd2], %f29;
$L__BB0_15:
bar.sync 0;
shr.u32 %r17, %r125, 1;
setp.gt.u32 %p25, %r125, 3;
mov.u32 %r125, %r17;
@%p25 bra $L__BB0_13;
$L__BB0_16:
add.s32 %r101, %r13, 1;
mul.wide.u32 %rd25, %r101, 4;
add.s64 %rd4, %rd20, %rd25;
mov.f32 %f102, 0fFF800000;
@%p4 bra $L__BB0_19;
ld.shared.f32 %f102, [%rd2];
setp.lt.u32 %p27, %r12, 2;
@%p27 bra $L__BB0_19;
ld.shared.f32 %f31, [%rd4];
setp.gt.f32 %p28, %f102, %f31;
setp.nan.f32 %p29, %f102, %f102;
or.pred %p30, %p29, %p28;
selp.f32 %f102, %f102, %f31, %p30;
$L__BB0_19:
bar.sync 0;
mul.wide.s32 %rd27, %r11, 4;
add.s64 %rd5, %rd20, %rd27;
setp.eq.s32 %p31, %r1, 0;
@%p31 bra $L__BB0_20;
bra.uni $L__BB0_21;
$L__BB0_20:
st.shared.f32 [%rd5], %f102;
$L__BB0_21:
bar.sync 0;
ld.shared.f32 %f11, [%rd5];
bar.sync 0;
@%p5 bra $L__BB0_22;
bra.uni $L__BB0_23;
$L__BB0_22:
mov.u32 %r102, %ctaid.x;
mad.lo.s32 %r18, %r9, %r102, %r10;
setp.lt.s32 %p33, %r18, 484;
@%p33 bra $L__BB0_25;
bra.uni $L__BB0_23;
$L__BB0_25:
sub.f32 %f60, %f100, %f11;
mov.f32 %f61, 0f3F000000;
mov.f32 %f62, 0f3BBB989D;
fma.rn.f32 %f63, %f60, %f62, %f61;
cvt.sat.f32.f32 %f64, %f63;
mov.f32 %f65, 0f4B400001;
mov.f32 %f66, 0f437C0000;
fma.rm.f32 %f67, %f64, %f66, %f65;
add.f32 %f68, %f67, 0fCB40007F;
neg.f32 %f69, %f68;
mov.f32 %f70, 0f3FB8AA3B;
fma.rn.f32 %f71, %f60, %f70, %f69;
mov.f32 %f72, 0f32A57060;
fma.rn.f32 %f73, %f60, %f72, %f71;
mov.b32 %r117, %f67;
shl.b32 %r118, %r117, 23;
mov.b32 %f74, %r118;
ex2.approx.ftz.f32 %f75, %f73;
mul.f32 %f76, %f75, %f74;
add.f32 %f77, %f76, 0f00000000;
mov.b32 %r115, %f76;
sub.f32 %f78, %f101, %f11;
fma.rn.f32 %f79, %f78, %f62, %f61;
cvt.sat.f32.f32 %f80, %f79;
fma.rm.f32 %f81, %f80, %f66, %f65;
add.f32 %f82, %f81, 0fCB40007F;
neg.f32 %f83, %f82;
fma.rn.f32 %f84, %f78, %f70, %f83;
fma.rn.f32 %f85, %f78, %f72, %f84;
mov.b32 %r119, %f81;
shl.b32 %r120, %r119, 23;
mov.b32 %f86, %r120;
ex2.approx.ftz.f32 %f87, %f85;
mul.f32 %f88, %f87, %f86;
add.f32 %f103, %f77, %f88;
mov.b32 %r116, %f88;
add.s32 %r121, %r18, %r1;
shl.b32 %r122, %r121, 1;
mul.wide.s32 %rd32, %r122, 4;
add.s64 %rd31, %rd9, %rd32;
// begin inline asm
st.global.cs.v2.s32 [%rd31], {%r115,%r116};
// end inline asm
bra.uni $L__BB0_26;
$L__BB0_23:
mov.u32 %r103, %ctaid.x;
mad.lo.s32 %r104, %r9, %r103, %r10;
setp.lt.s32 %p35, %r104, 484;
sub.f32 %f33, %f100, %f11;
mov.f32 %f34, 0f3F000000;
mov.f32 %f35, 0f3BBB989D;
fma.rn.f32 %f36, %f33, %f35, %f34;
cvt.sat.f32.f32 %f37, %f36;
mov.f32 %f38, 0f4B400001;
mov.f32 %f39, 0f437C0000;
fma.rm.f32 %f40, %f37, %f39, %f38;
add.f32 %f41, %f40, 0fCB40007F;
neg.f32 %f42, %f41;
mov.f32 %f43, 0f3FB8AA3B;
fma.rn.f32 %f44, %f33, %f43, %f42;
mov.f32 %f45, 0f32A57060;
fma.rn.f32 %f46, %f33, %f45, %f44;
mov.b32 %r105, %f40;
shl.b32 %r106, %r105, 23;
mov.b32 %f47, %r106;
ex2.approx.ftz.f32 %f48, %f46;
mul.f32 %f12, %f48, %f47;
add.f32 %f49, %f12, 0f00000000;
mov.f32 %f103, 0f00000000;
and.pred %p3, %p5, %p35;
sub.f32 %f50, %f101, %f11;
fma.rn.f32 %f51, %f50, %f35, %f34;
cvt.sat.f32.f32 %f52, %f51;
fma.rm.f32 %f53, %f52, %f39, %f38;
add.f32 %f54, %f53, 0fCB40007F;
neg.f32 %f55, %f54;
fma.rn.f32 %f56, %f50, %f43, %f55;
fma.rn.f32 %f57, %f50, %f45, %f56;
mov.b32 %r107, %f53;
shl.b32 %r108, %r107, 23;
mov.b32 %f58, %r108;
ex2.approx.ftz.f32 %f59, %f57;
mul.f32 %f13, %f59, %f58;
add.f32 %f14, %f49, %f13;
not.pred %p36, %p3;
@%p36 bra $L__BB0_26;
mov.b32 %r110, %f13;
add.s32 %r111, %r1, %r10;
mad.lo.s32 %r113, %r9, %r103, %r111;
shl.b32 %r114, %r113, 1;
mul.wide.s32 %rd30, %r114, 4;
add.s64 %rd29, %rd9, %rd30;
mov.b32 %r109, %f12;
// begin inline asm
st.global.cs.v2.s32 [%rd29], {%r109,%r110};
// end inline asm
selp.f32 %f103, %f14, 0f00000000, %p3;
$L__BB0_26:
st.shared.f32 [%rd2], %f103;
bar.sync 0;
@%p16 bra $L__BB0_28;
ld.shared.f32 %f89, [%rd3];
ld.shared.f32 %f90, [%rd2];
add.f32 %f91, %f89, %f90;
st.shared.f32 [%rd2], %f91;
$L__BB0_28:
bar.sync 0;
@%p20 bra $L__BB0_32;
$L__BB0_29:
setp.ge.u32 %p39, %r1, %r126;
@%p39 bra $L__BB0_31;
add.s32 %r123, %r126, %r13;
mul.wide.s32 %rd33, %r123, 4;
add.s64 %rd35, %rd20, %rd33;
ld.shared.f32 %f92, [%rd2];
ld.shared.f32 %f93, [%rd35];
add.f32 %f94, %f93, %f92;
st.shared.f32 [%rd2], %f94;
$L__BB0_31:
bar.sync 0;
shr.u32 %r20, %r126, 1;
setp.gt.u32 %p40, %r126, 3;
mov.u32 %r126, %r20;
@%p40 bra $L__BB0_29;
$L__BB0_32:
mov.f32 %f104, 0f00000000;
@%p4 bra $L__BB0_35;
ld.shared.f32 %f96, [%rd2];
add.f32 %f104, %f96, 0f00000000;
setp.lt.u32 %p42, %r12, 2;
@%p42 bra $L__BB0_35;
ld.shared.f32 %f97, [%rd4];
add.f32 %f104, %f104, %f97;
$L__BB0_35:
bar.sync 0;
mov.u32 %r124, %ctaid.x;
mad.lo.s32 %r21, %r9, %r124, %r10;
setp.gt.s32 %p43, %r21, 483;
@%p43 bra $L__BB0_37;
rcp.rn.f32 %f98, %f104;
cvta.to.global.u64 %rd36, %rd8;
mul.wide.s32 %rd37, %r21, 4;
add.s64 %rd38, %rd36, %rd37;
st.global.f32 [%rd38], %f98;
$L__BB0_37:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_2[32]
)
{
.reg .pred %p<33>;
.reg .f32 %f<70>;
.reg .b32 %r<86>;
.reg .b64 %rd<32>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r34, %r35}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_0+16];
ld.param.v2.u32 {%r36, %r37}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_2];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_1];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1__param_0];
mov.u32 %r1, %tid.x;
setp.ne.s32 %p2, %r1, 0;
@%p2 bra $L__BB0_2;
mov.u32 %r50, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r50;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd8, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r51, [%rd8], %r1;
mov.u32 %r52, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r52, %r3;
setp.gt.s32 %p3, %r4, 483;
mov.f32 %f65, 0fFF800000;
mov.f32 %f66, %f65;
@%p3 bra $L__BB0_4;
cvta.to.global.u64 %rd9, %rd5;
ld.shared.u32 %r53, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_1601111nvfuser_145ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
shl.b32 %r54, %r53, 1;
mul.hi.s32 %r55, %r4, 780903145;
shr.u32 %r56, %r55, 31;
shr.s32 %r57, %r55, 2;
add.s32 %r58, %r57, %r56;
mul.lo.s32 %r59, %r58, 22;
sub.s32 %r60, %r4, %r59;
mul.lo.s32 %r61, %r36, %r60;
mad.lo.s32 %r62, %r35, %r58, %r61;
mad.lo.s32 %r63, %r54, %r37, %r62;
mul.wide.s32 %rd10, %r63, 4;
add.s64 %rd11, %rd9, %rd10;
ld.global.f32 %f65, [%rd11];
add.s32 %r64, %r63, %r37;
mul.wide.s32 %rd12, %r64, 4;
add.s64 %rd13, %rd9, %rd12;
ld.global.f32 %f66, [%rd13];
$L__BB0_4:
setp.gt.f32 %p4, %f65, %f66;
setp.nan.f32 %p5, %f65, %f65;
or.pred %p6, %p5, %p4;
selp.f32 %f16, %f65, %f66, %p6;
mov.u32 %r65, %tid.z;
mad.lo.s32 %r5, %r2, %r65, %r3;
mov.u32 %r6, %ntid.x;
mad.lo.s32 %r7, %r5, %r6, %r1;
mul.wide.u32 %rd14, %r7, 4;
mov.u64 %rd15, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_145_cu_25d5f882_160115arrayE;
add.s64 %rd1, %rd15, %rd14;
st.shared.f32 [%rd1], %f16;
bar.sync 0;
clz.b32 %r66, %r6;
mov.u32 %r67, 31;
sub.s32 %r68, %r67, %r66;
mov.u32 %r69, 1;
shl.b32 %r8, %r69, %r68;
setp.lt.u32 %p7, %r1, %r8;
add.s32 %r70, %r8, %r1;
setp.lt.u32 %p8, %r70, %r6;
and.pred %p1, %p7, %p8;
add.s32 %r71, %r7, %r8;
mul.wide.s32 %rd16, %r71, 4;
add.s64 %rd2, %rd15, %rd16;
not.pred %p9, %p1;
@%p9 bra $L__BB0_6;
ld.shared.f32 %f17, [%rd2];
ld.shared.f32 %f18, [%rd1];
setp.nan.f32 %p10, %f18, %f18;
setp.gt.f32 %p11, %f18, %f17;
or.pred %p12, %p10, %p11;
selp.f32 %f19, %f18, %f17, %p12;
st.shared.f32 [%rd1], %f19;
$L__BB0_6:
bar.sync 0;
shr.u32 %r72, %r8, 31;
add.s32 %r73, %r8, %r72;
shr.s32 %r85, %r73, 1;
setp.lt.s32 %p13, %r8, 4;
@%p13 bra $L__BB0_11;
mov.u32 %r84, %r85;
$L__BB0_8:
setp.ge.u32 %p14, %r1, %r84;
@%p14 bra $L__BB0_10;
add.s32 %r74, %r84, %r7;
mul.wide.s32 %rd17, %r74, 4;
add.s64 %rd19, %rd15, %rd17;
ld.shared.f32 %f20, [%rd1];
setp.nan.f32 %p15, %f20, %f20;
ld.shared.f32 %f21, [%rd19];
setp.gt.f32 %p16, %f20, %f21;
or.pred %p17, %p15, %p16;
selp.f32 %f22, %f20, %f21, %p17;
st.shared.f32 [%rd1], %f22;
$L__BB0_10:
bar.sync 0;
shr.u32 %r11, %r84, 1;
setp.gt.u32 %p18, %r84, 3;
mov.u32 %r84, %r11;
@%p18 bra $L__BB0_8;
$L__BB0_11:
add.s32 %r75, %r7, 1;
mul.wide.u32 %rd20, %r75, 4;
add.s64 %rd3, %rd15, %rd20;
mov.f32 %f67, 0fFF800000;
@%p2 bra $L__BB0_14;
ld.shared.f32 %f67, [%rd1];
setp.lt.u32 %p20, %r6, 2;
@%p20 bra $L__BB0_14;
ld.shared.f32 %f24, [%rd3];
setp.gt.f32 %p21, %f67, %f24;
setp.nan.f32 %p22, %f67, %f67;
or.pred %p23, %p22, %p21;
selp.f32 %f67, %f67, %f24, %p23;
$L__BB0_14:
bar.sync 0;
mul.wide.s32 %rd22, %r5, 4;
add.s64 %rd4, %rd15, %rd22;
setp.eq.s32 %p24, %r1, 0;
@%p24 bra $L__BB0_15;
bra.uni $L__BB0_16;
$L__BB0_15:
st.shared.f32 [%rd4], %f67;
$L__BB0_16:
bar.sync 0;
ld.shared.f32 %f8, [%rd4];
bar.sync 0;
mov.f32 %f68, 0f00000000;
@%p3 bra $L__BB0_18;
sub.f32 %f26, %f65, %f8;
mov.f32 %f27, 0f3F000000;
mov.f32 %f28, 0f3BBB989D;
fma.rn.f32 %f29, %f26, %f28, %f27;
cvt.sat.f32.f32 %f30, %f29;
mov.f32 %f31, 0f4B400001;
mov.f32 %f32, 0f437C0000;
fma.rm.f32 %f33, %f30, %f32, %f31;
add.f32 %f34, %f33, 0fCB40007F;
neg.f32 %f35, %f34;
mov.f32 %f36, 0f3FB8AA3B;
fma.rn.f32 %f37, %f26, %f36, %f35;
mov.f32 %f38, 0f32A57060;
fma.rn.f32 %f39, %f26, %f38, %f37;
mov.b32 %r78, %f33;
shl.b32 %r79, %r78, 23;
mov.b32 %f40, %r79;
ex2.approx.ftz.f32 %f41, %f39;
mul.f32 %f42, %f41, %f40;
add.f32 %f43, %f42, 0f00000000;
mov.b32 %r76, %f42;
sub.f32 %f44, %f66, %f8;
fma.rn.f32 %f45, %f44, %f28, %f27;
cvt.sat.f32.f32 %f46, %f45;
fma.rm.f32 %f47, %f46, %f32, %f31;
add.f32 %f48, %f47, 0fCB40007F;
neg.f32 %f49, %f48;
fma.rn.f32 %f50, %f44, %f36, %f49;
fma.rn.f32 %f51, %f44, %f38, %f50;
mov.b32 %r80, %f47;
shl.b32 %r81, %r80, 23;
mov.b32 %f52, %r81;
ex2.approx.ftz.f32 %f53, %f51;
mul.f32 %f54, %f53, %f52;
add.f32 %f68, %f43, %f54;
mov.b32 %r77, %f54;
shl.b32 %r82, %r4, 1;
mul.wide.s32 %rd25, %r82, 4;
add.s64 %rd24, %rd7, %rd25;
// begin inline asm
st.global.cs.v2.s32 [%rd24], {%r76,%r77};
// end inline asm
$L__BB0_18:
st.shared.f32 [%rd1], %f68;
bar.sync 0;
@%p9 bra $L__BB0_20;
ld.shared.f32 %f55, [%rd2];
ld.shared.f32 %f56, [%rd1];
add.f32 %f57, %f55, %f56;
st.shared.f32 [%rd1], %f57;
$L__BB0_20:
bar.sync 0;
@%p13 bra $L__BB0_24;
$L__BB0_21:
setp.ge.u32 %p28, %r1, %r85;
@%p28 bra $L__BB0_23;
add.s32 %r83, %r85, %r7;
mul.wide.s32 %rd26, %r83, 4;
add.s64 %rd28, %rd15, %rd26;
ld.shared.f32 %f58, [%rd1];
ld.shared.f32 %f59, [%rd28];
add.f32 %f60, %f59, %f58;
st.shared.f32 [%rd1], %f60;
$L__BB0_23:
bar.sync 0;
shr.u32 %r13, %r85, 1;
setp.gt.u32 %p29, %r85, 3;
mov.u32 %r85, %r13;
@%p29 bra $L__BB0_21;
$L__BB0_24:
mov.f32 %f69, 0f00000000;
@%p2 bra $L__BB0_27;
ld.shared.f32 %f62, [%rd1];
add.f32 %f69, %f62, 0f00000000;
setp.lt.u32 %p31, %r6, 2;
@%p31 bra $L__BB0_27;
ld.shared.f32 %f63, [%rd3];
add.f32 %f69, %f69, %f63;
$L__BB0_27:
bar.sync 0;
@%p3 bra $L__BB0_29;
rcp.rn.f32 %f64, %f69;
cvta.to.global.u64 %rd29, %rd6;
mul.wide.s32 %rd30, %r4, 4;
add.s64 %rd31, %rd29, %rd30;
st.global.f32 [%rd31], %f64;
$L__BB0_29:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,379 +20,265 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_1[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_2[32]
)
{
- .reg .pred %p<44>;
- .reg .f32 %f<105>;
- .reg .b32 %r<127>;
- .reg .b64 %rd<39>;
+ .reg .pred %p<33>;
+ .reg .f32 %f<70>;
+ .reg .b32 %r<86>;
+ .reg .b64 %rd<32>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
- ld.param.v2.u32 {%r42, %r43}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+16];
- ld.param.v2.u32 {%r44, %r45}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
- ld.param.u64 %rd9, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_2];
- ld.param.u64 %rd8, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_1];
- ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0];
- cvta.to.global.u64 %rd1, %rd7;
+ ld.param.v2.u32 {%r34, %r35}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+16];
+ ld.param.v2.u32 {%r36, %r37}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
+ ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_2];
+ ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_1];
+ ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0];
mov.u32 %r1, %tid.x;
- setp.ne.s32 %p4, %r1, 0;
- @%p4 bra $L__BB0_2;
-
- mov.u32 %r58, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r58;
+ setp.ne.s32 %p2, %r1, 0;
+ @%p2 bra $L__BB0_2;
+
+ mov.u32 %r50, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r50;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd10, _ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
- atom.shared.min.s32 %r59, [%rd10], %r1;
- ld.shared.u32 %r60, [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
- shl.b32 %r5, %r60, 1;
- setp.lt.s32 %p5, %r1, 1;
- @%p5 bra $L__BB0_3;
- bra.uni $L__BB0_4;
-
-$L__BB0_3:
- mov.u32 %r61, %tid.y;
- mov.u32 %r62, %ctaid.x;
- mov.u32 %r63, %ntid.y;
- mad.lo.s32 %r6, %r63, %r62, %r61;
- setp.lt.s32 %p6, %r6, 484;
- @%p6 bra $L__BB0_8;
- bra.uni $L__BB0_4;
+ mov.u64 %rd8, _ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
+ atom.shared.min.s32 %r51, [%rd8], %r1;
+ mov.u32 %r52, %ctaid.x;
+ mov.u32 %r2, %ntid.y;
+ mov.u32 %r3, %tid.y;
+ mad.lo.s32 %r4, %r2, %r52, %r3;
+ setp.gt.s32 %p3, %r4, 483;
+ mov.f32 %f65, 0fFF800000;
+ mov.f32 %f66, %f65;
+ @%p3 bra $L__BB0_4;
+
+ cvta.to.global.u64 %rd9, %rd5;
+ ld.shared.u32 %r53, [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
+ shl.b32 %r54, %r53, 1;
+ mul.hi.s32 %r55, %r4, 780903145;
+ shr.u32 %r56, %r55, 31;
+ shr.s32 %r57, %r55, 2;
+ add.s32 %r58, %r57, %r56;
+ mul.lo.s32 %r59, %r58, 22;
+ sub.s32 %r60, %r4, %r59;
+ mul.lo.s32 %r61, %r36, %r60;
+ mad.lo.s32 %r62, %r35, %r58, %r61;
+ mad.lo.s32 %r63, %r54, %r37, %r62;
+ mul.wide.s32 %rd10, %r63, 4;
+ add.s64 %rd11, %rd9, %rd10;
+ ld.global.f32 %f65, [%rd11];
+ add.s32 %r64, %r63, %r37;
+ mul.wide.s32 %rd12, %r64, 4;
+ add.s64 %rd13, %rd9, %rd12;
+ ld.global.f32 %f66, [%rd13];
+
+$L__BB0_4:
+ setp.gt.f32 %p4, %f65, %f66;
+ setp.nan.f32 %p5, %f65, %f65;
+ or.pred %p6, %p5, %p4;
+ selp.f32 %f16, %f65, %f66, %p6;
+ mov.u32 %r65, %tid.z;
+ mad.lo.s32 %r5, %r2, %r65, %r3;
+ mov.u32 %r6, %ntid.x;
+ mad.lo.s32 %r7, %r5, %r6, %r1;
+ mul.wide.u32 %rd14, %r7, 4;
+ mov.u64 %rd15, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd15, %rd14;
+ st.shared.f32 [%rd1], %f16;
+ bar.sync 0;
+ clz.b32 %r66, %r6;
+ mov.u32 %r67, 31;
+ sub.s32 %r68, %r67, %r66;
+ mov.u32 %r69, 1;
+ shl.b32 %r8, %r69, %r68;
+ setp.lt.u32 %p7, %r1, %r8;
+ add.s32 %r70, %r8, %r1;
+ setp.lt.u32 %p8, %r70, %r6;
+ and.pred %p1, %p7, %p8;
+ add.s32 %r71, %r7, %r8;
+ mul.wide.s32 %rd16, %r71, 4;
+ add.s64 %rd2, %rd15, %rd16;
+ not.pred %p9, %p1;
+ @%p9 bra $L__BB0_6;
+
+ ld.shared.f32 %f17, [%rd2];
+ ld.shared.f32 %f18, [%rd1];
+ setp.nan.f32 %p10, %f18, %f18;
+ setp.gt.f32 %p11, %f18, %f17;
+ or.pred %p12, %p10, %p11;
+ selp.f32 %f19, %f18, %f17, %p12;
+ st.shared.f32 [%rd1], %f19;
+
+$L__BB0_6:
+ bar.sync 0;
+ shr.u32 %r72, %r8, 31;
+ add.s32 %r73, %r8, %r72;
+ shr.s32 %r85, %r73, 1;
+ setp.lt.s32 %p13, %r8, 4;
+ @%p13 bra $L__BB0_11;
+
+ mov.u32 %r84, %r85;
$L__BB0_8:
- shl.b32 %r79, %r1, 1;
- mul.hi.s32 %r80, %r6, 780903145;
- shr.u32 %r81, %r80, 31;
- shr.s32 %r82, %r80, 2;
- add.s32 %r83, %r82, %r81;
- mul.lo.s32 %r84, %r43, %r83;
- mul.lo.s32 %r85, %r83, 22;
- sub.s32 %r86, %r6, %r85;
- add.s32 %r87, %r5, %r79;
- mad.lo.s32 %r88, %r44, %r86, %r84;
- mad.lo.s32 %r89, %r87, %r45, %r88;
- mul.wide.s32 %rd15, %r89, 4;
- add.s64 %rd16, %rd1, %rd15;
- ld.global.f32 %f100, [%rd16];
- add.s32 %r90, %r89, %r45;
- mul.wide.s32 %rd17, %r90, 4;
- add.s64 %rd18, %rd1, %rd17;
- ld.global.f32 %f101, [%rd18];
- bra.uni $L__BB0_9;
-
-$L__BB0_4:
- mov.u32 %r64, %ntid.y;
- mov.u32 %r65, %ctaid.x;
- mov.u32 %r66, %tid.y;
- mad.lo.s32 %r67, %r64, %r65, %r66;
- setp.lt.s32 %p8, %r67, 484;
- mul.hi.s32 %r68, %r67, 780903145;
- shr.u32 %r69, %r68, 31;
- shr.s32 %r70, %r68, 2;
- add.s32 %r71, %r70, %r69;
- mul.lo.s32 %r72, %r43, %r71;
- mul.lo.s32 %r73, %r71, 22;
- sub.s32 %r74, %r67, %r73;
- shl.b32 %r75, %r1, 1;
- add.s32 %r7, %r5, %r75;
- mad.lo.s32 %r8, %r44, %r74, %r72;
- and.pred %p1, %p5, %p8;
- mov.f32 %f101, 0fFF800000;
- not.pred %p9, %p1;
- mov.f32 %f100, %f101;
- @%p9 bra $L__BB0_6;
-
- mad.lo.s32 %r76, %r7, %r45, %r8;
- mul.wide.s32 %rd11, %r76, 4;
- add.s64 %rd12, %rd1, %rd11;
- ld.global.f32 %f100, [%rd12];
-
-$L__BB0_6:
- @%p9 bra $L__BB0_9;
-
- add.s32 %r77, %r7, 1;
- mad.lo.s32 %r78, %r77, %r45, %r8;
- mul.wide.s32 %rd13, %r78, 4;
- add.s64 %rd14, %rd1, %rd13;
- ld.global.f32 %f101, [%rd14];
-
-$L__BB0_9:
- setp.gt.f32 %p11, %f100, %f101;
- setp.nan.f32 %p12, %f100, %f100;
- or.pred %p13, %p12, %p11;
- selp.f32 %f23, %f100, %f101, %p13;
- mov.u32 %r91, %tid.z;
- mov.u32 %r9, %ntid.y;
- mov.u32 %r10, %tid.y;
- mad.lo.s32 %r11, %r9, %r91, %r10;
- mov.u32 %r12, %ntid.x;
- mad.lo.s32 %r13, %r11, %r12, %r1;
- mul.wide.u32 %rd19, %r13, 4;
- mov.u64 %rd20, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd20, %rd19;
- st.shared.f32 [%rd2], %f23;
- bar.sync 0;
- clz.b32 %r92, %r12;
- mov.u32 %r93, 31;
- sub.s32 %r94, %r93, %r92;
- mov.u32 %r95, 1;
- shl.b32 %r14, %r95, %r94;
- setp.lt.u32 %p14, %r1, %r14;
- add.s32 %r96, %r14, %r1;
- setp.lt.u32 %p15, %r96, %r12;
- and.pred %p2, %p14, %p15;
- add.s32 %r97, %r13, %r14;
- mul.wide.s32 %rd21, %r97, 4;
- add.s64 %rd3, %rd20, %rd21;
- not.pred %p16, %p2;
- @%p16 bra $L__BB0_11;
+ setp.ge.u32 %p14, %r1, %r84;
+ @%p14 bra $L__BB0_10;
+
+ add.s32 %r74, %r84, %r7;
+ mul.wide.s32 %rd17, %r74, 4;
+ add.s64 %rd19, %rd15, %rd17;
+ ld.shared.f32 %f20, [%rd1];
+ setp.nan.f32 %p15, %f20, %f20;
+ ld.shared.f32 %f21, [%rd19];
+ setp.gt.f32 %p16, %f20, %f21;
+ or.pred %p17, %p15, %p16;
+ selp.f32 %f22, %f20, %f21, %p17;
+ st.shared.f32 [%rd1], %f22;
+
+$L__BB0_10:
+ bar.sync 0;
+ shr.u32 %r11, %r84, 1;
+ setp.gt.u32 %p18, %r84, 3;
+ mov.u32 %r84, %r11;
+ @%p18 bra $L__BB0_8;
+
+$L__BB0_11:
+ add.s32 %r75, %r7, 1;
+ mul.wide.u32 %rd20, %r75, 4;
+ add.s64 %rd3, %rd15, %rd20;
+ mov.f32 %f67, 0fFF800000;
+ @%p2 bra $L__BB0_14;
+
+ ld.shared.f32 %f67, [%rd1];
+ setp.lt.u32 %p20, %r6, 2;
+ @%p20 bra $L__BB0_14;
ld.shared.f32 %f24, [%rd3];
- ld.shared.f32 %f25, [%rd2];
- setp.nan.f32 %p17, %f25, %f25;
- setp.gt.f32 %p18, %f25, %f24;
- or.pred %p19, %p17, %p18;
- selp.f32 %f26, %f25, %f24, %p19;
- st.shared.f32 [%rd2], %f26;
-
-$L__BB0_11:
- bar.sync 0;
- shr.u32 %r98, %r14, 31;
- add.s32 %r99, %r14, %r98;
- shr.s32 %r126, %r99, 1;
- setp.lt.s32 %p20, %r14, 4;
- @%p20 bra $L__BB0_16;
-
- mov.u32 %r125, %r126;
-
-$L__BB0_13:
- setp.ge.u32 %p21, %r1, %r125;
- @%p21 bra $L__BB0_15;
-
- add.s32 %r100, %r125, %r13;
- mul.wide.s32 %rd22, %r100, 4;
- add.s64 %rd24, %rd20, %rd22;
- ld.shared.f32 %f27, [%rd2];
- setp.nan.f32 %p22, %f27, %f27;
- ld.shared.f32 %f28, [%rd24];
- setp.gt.f32 %p23, %f27, %f28;
- or.pred %p24, %p22, %p23;
- selp.f32 %f29, %f27, %f28, %p24;
- st.shared.f32 [%rd2], %f29;
+ setp.gt.f32 %p21, %f67, %f24;
+ setp.nan.f32 %p22, %f67, %f67;
+ or.pred %p23, %p22, %p21;
+ selp.f32 %f67, %f67, %f24, %p23;
+
+$L__BB0_14:
+ bar.sync 0;
+ mul.wide.s32 %rd22, %r5, 4;
+ add.s64 %rd4, %rd15, %rd22;
+ setp.eq.s32 %p24, %r1, 0;
+ @%p24 bra $L__BB0_15;
+ bra.uni $L__BB0_16;
$L__BB0_15:
- bar.sync 0;
- shr.u32 %r17, %r125, 1;
- setp.gt.u32 %p25, %r125, 3;
- mov.u32 %r125, %r17;
- @%p25 bra $L__BB0_13;
+ st.shared.f32 [%rd4], %f67;
$L__BB0_16:
- add.s32 %r101, %r13, 1;
- mul.wide.u32 %rd25, %r101, 4;
- add.s64 %rd4, %rd20, %rd25;
- mov.f32 %f102, 0fFF800000;
- @%p4 bra $L__BB0_19;
-
- ld.shared.f32 %f102, [%rd2];
- setp.lt.u32 %p27, %r12, 2;
- @%p27 bra $L__BB0_19;
-
- ld.shared.f32 %f31, [%rd4];
- setp.gt.f32 %p28, %f102, %f31;
- setp.nan.f32 %p29, %f102, %f102;
- or.pred %p30, %p29, %p28;
- selp.f32 %f102, %f102, %f31, %p30;
-
-$L__BB0_19:
- bar.sync 0;
- mul.wide.s32 %rd27, %r11, 4;
- add.s64 %rd5, %rd20, %rd27;
- setp.eq.s32 %p31, %r1, 0;
- @%p31 bra $L__BB0_20;
- bra.uni $L__BB0_21;
+ bar.sync 0;
+ ld.shared.f32 %f8, [%rd4];
+ bar.sync 0;
+ mov.f32 %f68, 0f00000000;
+ @%p3 bra $L__BB0_18;
+
+ sub.f32 %f26, %f65, %f8;
+ mov.f32 %f27, 0f3F000000;
+ mov.f32 %f28, 0f3BBB989D;
+ fma.rn.f32 %f29, %f26, %f28, %f27;
+ cvt.sat.f32.f32 %f30, %f29;
+ mov.f32 %f31, 0f4B400001;
+ mov.f32 %f32, 0f437C0000;
+ fma.rm.f32 %f33, %f30, %f32, %f31;
+ add.f32 %f34, %f33, 0fCB40007F;
+ neg.f32 %f35, %f34;
+ mov.f32 %f36, 0f3FB8AA3B;
+ fma.rn.f32 %f37, %f26, %f36, %f35;
+ mov.f32 %f38, 0f32A57060;
+ fma.rn.f32 %f39, %f26, %f38, %f37;
+ mov.b32 %r78, %f33;
+ shl.b32 %r79, %r78, 23;
+ mov.b32 %f40, %r79;
+ ex2.approx.ftz.f32 %f41, %f39;
+ mul.f32 %f42, %f41, %f40;
+ add.f32 %f43, %f42, 0f00000000;
+ mov.b32 %r76, %f42;
+ sub.f32 %f44, %f66, %f8;
+ fma.rn.f32 %f45, %f44, %f28, %f27;
+ cvt.sat.f32.f32 %f46, %f45;
+ fma.rm.f32 %f47, %f46, %f32, %f31;
+ add.f32 %f48, %f47, 0fCB40007F;
+ neg.f32 %f49, %f48;
+ fma.rn.f32 %f50, %f44, %f36, %f49;
+ fma.rn.f32 %f51, %f44, %f38, %f50;
+ mov.b32 %r80, %f47;
+ shl.b32 %r81, %r80, 23;
+ mov.b32 %f52, %r81;
+ ex2.approx.ftz.f32 %f53, %f51;
+ mul.f32 %f54, %f53, %f52;
+ add.f32 %f68, %f43, %f54;
+ mov.b32 %r77, %f54;
+ shl.b32 %r82, %r4, 1;
+ mul.wide.s32 %rd25, %r82, 4;
+ add.s64 %rd24, %rd7, %rd25;
+
+ st.global.cs.v2.s32 [%rd24], {%r76,%r77};
+
+
+$L__BB0_18:
+ st.shared.f32 [%rd1], %f68;
+ bar.sync 0;
+ @%p9 bra $L__BB0_20;
+
+ ld.shared.f32 %f55, [%rd2];
+ ld.shared.f32 %f56, [%rd1];
+ add.f32 %f57, %f55, %f56;
+ st.shared.f32 [%rd1], %f57;
$L__BB0_20:
- st.shared.f32 [%rd5], %f102;
+ bar.sync 0;
+ @%p13 bra $L__BB0_24;
$L__BB0_21:
- bar.sync 0;
- ld.shared.f32 %f11, [%rd5];
- bar.sync 0;
- @%p5 bra $L__BB0_22;
- bra.uni $L__BB0_23;
-
-$L__BB0_22:
- mov.u32 %r102, %ctaid.x;
- mad.lo.s32 %r18, %r9, %r102, %r10;
- setp.lt.s32 %p33, %r18, 484;
- @%p33 bra $L__BB0_25;
- bra.uni $L__BB0_23;
-
-$L__BB0_25:
- sub.f32 %f60, %f100, %f11;
- mov.f32 %f61, 0f3F000000;
- mov.f32 %f62, 0f3BBB989D;
- fma.rn.f32 %f63, %f60, %f62, %f61;
- cvt.sat.f32.f32 %f64, %f63;
- mov.f32 %f65, 0f4B400001;
- mov.f32 %f66, 0f437C0000;
- fma.rm.f32 %f67, %f64, %f66, %f65;
- add.f32 %f68, %f67, 0fCB40007F;
- neg.f32 %f69, %f68;
- mov.f32 %f70, 0f3FB8AA3B;
- fma.rn.f32 %f71, %f60, %f70, %f69;
- mov.f32 %f72, 0f32A57060;
- fma.rn.f32 %f73, %f60, %f72, %f71;
- mov.b32 %r117, %f67;
- shl.b32 %r118, %r117, 23;
- mov.b32 %f74, %r118;
- ex2.approx.ftz.f32 %f75, %f73;
- mul.f32 %f76, %f75, %f74;
- add.f32 %f77, %f76, 0f00000000;
- mov.b32 %r115, %f76;
- sub.f32 %f78, %f101, %f11;
- fma.rn.f32 %f79, %f78, %f62, %f61;
- cvt.sat.f32.f32 %f80, %f79;
- fma.rm.f32 %f81, %f80, %f66, %f65;
- add.f32 %f82, %f81, 0fCB40007F;
- neg.f32 %f83, %f82;
- fma.rn.f32 %f84, %f78, %f70, %f83;
- fma.rn.f32 %f85, %f78, %f72, %f84;
- mov.b32 %r119, %f81;
- shl.b32 %r120, %r119, 23;
- mov.b32 %f86, %r120;
- ex2.approx.ftz.f32 %f87, %f85;
- mul.f32 %f88, %f87, %f86;
- add.f32 %f103, %f77, %f88;
- mov.b32 %r116, %f88;
- add.s32 %r121, %r18, %r1;
- shl.b32 %r122, %r121, 1;
- mul.wide.s32 %rd32, %r122, 4;
- add.s64 %rd31, %rd9, %rd32;
-
- st.global.cs.v2.s32 [%rd31], {%r115,%r116};
-
- bra.uni $L__BB0_26;
+ setp.ge.u32 %p28, %r1, %r85;
+ @%p28 bra $L__BB0_23;
+
+ add.s32 %r83, %r85, %r7;
+ mul.wide.s32 %rd26, %r83, 4;
+ add.s64 %rd28, %rd15, %rd26;
+ ld.shared.f32 %f58, [%rd1];
+ ld.shared.f32 %f59, [%rd28];
+ add.f32 %f60, %f59, %f58;
+ st.shared.f32 [%rd1], %f60;
$L__BB0_23:
- mov.u32 %r103, %ctaid.x;
- mad.lo.s32 %r104, %r9, %r103, %r10;
- setp.lt.s32 %p35, %r104, 484;
- sub.f32 %f33, %f100, %f11;
- mov.f32 %f34, 0f3F000000;
- mov.f32 %f35, 0f3BBB989D;
- fma.rn.f32 %f36, %f33, %f35, %f34;
- cvt.sat.f32.f32 %f37, %f36;
- mov.f32 %f38, 0f4B400001;
- mov.f32 %f39, 0f437C0000;
- fma.rm.f32 %f40, %f37, %f39, %f38;
- add.f32 %f41, %f40, 0fCB40007F;
- neg.f32 %f42, %f41;
- mov.f32 %f43, 0f3FB8AA3B;
- fma.rn.f32 %f44, %f33, %f43, %f42;
- mov.f32 %f45, 0f32A57060;
- fma.rn.f32 %f46, %f33, %f45, %f44;
- mov.b32 %r105, %f40;
- shl.b32 %r106, %r105, 23;
- mov.b32 %f47, %r106;
- ex2.approx.ftz.f32 %f48, %f46;
- mul.f32 %f12, %f48, %f47;
- add.f32 %f49, %f12, 0f00000000;
- mov.f32 %f103, 0f00000000;
- and.pred %p3, %p5, %p35;
- sub.f32 %f50, %f101, %f11;
- fma.rn.f32 %f51, %f50, %f35, %f34;
- cvt.sat.f32.f32 %f52, %f51;
- fma.rm.f32 %f53, %f52, %f39, %f38;
- add.f32 %f54, %f53, 0fCB40007F;
- neg.f32 %f55, %f54;
- fma.rn.f32 %f56, %f50, %f43, %f55;
- fma.rn.f32 %f57, %f50, %f45, %f56;
- mov.b32 %r107, %f53;
- shl.b32 %r108, %r107, 23;
- mov.b32 %f58, %r108;
- ex2.approx.ftz.f32 %f59, %f57;
- mul.f32 %f13, %f59, %f58;
- add.f32 %f14, %f49, %f13;
- not.pred %p36, %p3;
- @%p36 bra $L__BB0_26;
-
- mov.b32 %r110, %f13;
- add.s32 %r111, %r1, %r10;
- mad.lo.s32 %r113, %r9, %r103, %r111;
- shl.b32 %r114, %r113, 1;
- mul.wide.s32 %rd30, %r114, 4;
- add.s64 %rd29, %rd9, %rd30;
- mov.b32 %r109, %f12;
-
- st.global.cs.v2.s32 [%rd29], {%r109,%r110};
-
- selp.f32 %f103, %f14, 0f00000000, %p3;
-
-$L__BB0_26:
- st.shared.f32 [%rd2], %f103;
- bar.sync 0;
- @%p16 bra $L__BB0_28;
-
- ld.shared.f32 %f89, [%rd3];
- ld.shared.f32 %f90, [%rd2];
- add.f32 %f91, %f89, %f90;
- st.shared.f32 [%rd2], %f91;
-
-$L__BB0_28:
- bar.sync 0;
- @%p20 bra $L__BB0_32;
+ bar.sync 0;
+ shr.u32 %r13, %r85, 1;
+ setp.gt.u32 %p29, %r85, 3;
+ mov.u32 %r85, %r13;
+ @%p29 bra $L__BB0_21;
+
+$L__BB0_24:
+ mov.f32 %f69, 0f00000000;
+ @%p2 bra $L__BB0_27;
+
+ ld.shared.f32 %f62, [%rd1];
+ add.f32 %f69, %f62, 0f00000000;
+ setp.lt.u32 %p31, %r6, 2;
+ @%p31 bra $L__BB0_27;
+
+ ld.shared.f32 %f63, [%rd3];
+ add.f32 %f69, %f69, %f63;
+
+$L__BB0_27:
+ bar.sync 0;
+ @%p3 bra $L__BB0_29;
+
+ rcp.rn.f32 %f64, %f69;
+ cvta.to.global.u64 %rd29, %rd6;
+ mul.wide.s32 %rd30, %r4, 4;
+ add.s64 %rd31, %rd29, %rd30;
+ st.global.f32 [%rd31], %f64;
$L__BB0_29:
- setp.ge.u32 %p39, %r1, %r126;
- @%p39 bra $L__BB0_31;
-
- add.s32 %r123, %r126, %r13;
- mul.wide.s32 %rd33, %r123, 4;
- add.s64 %rd35, %rd20, %rd33;
- ld.shared.f32 %f92, [%rd2];
- ld.shared.f32 %f93, [%rd35];
- add.f32 %f94, %f93, %f92;
- st.shared.f32 [%rd2], %f94;
-
-$L__BB0_31:
- bar.sync 0;
- shr.u32 %r20, %r126, 1;
- setp.gt.u32 %p40, %r126, 3;
- mov.u32 %r126, %r20;
- @%p40 bra $L__BB0_29;
-
-$L__BB0_32:
- mov.f32 %f104, 0f00000000;
- @%p4 bra $L__BB0_35;
-
- ld.shared.f32 %f96, [%rd2];
- add.f32 %f104, %f96, 0f00000000;
- setp.lt.u32 %p42, %r12, 2;
- @%p42 bra $L__BB0_35;
-
- ld.shared.f32 %f97, [%rd4];
- add.f32 %f104, %f104, %f97;
-
-$L__BB0_35:
- bar.sync 0;
- mov.u32 %r124, %ctaid.x;
- mad.lo.s32 %r21, %r9, %r124, %r10;
- setp.gt.s32 %p43, %r21, 483;
- @%p43 bra $L__BB0_37;
-
- rcp.rn.f32 %f98, %f104;
- cvta.to.global.u64 %rd36, %rd8;
- mul.wide.s32 %rd37, %r21, 4;
- add.s64 %rd38, %rd36, %rd37;
- st.global.f32 [%rd38], %f98;
-
-$L__BB0_37:
ret;
}
Kernel 145
CUDA
PTX
53997da5d
Diff
03a1b695e
-2
+2 index type: int
registers: 39→ 23
gmem: 3
static smem: 4→ 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T8, Tensor<float, 5, 5> T5, Tensor<float, 5, 5> T11) {
NVFUSER_DEFINE_MAGIC_ZERO;
if (((((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL]))) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL]))) && (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) < ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL])))) {
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0]
= T1[((((((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x))) + (T1.alloc_stride[1LL] * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T14;
T14.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T14[0], &T5[((((4 * ((nvfuser_index_t)threadIdx.x)) + (((T5.logical_size[4LL] * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))) + ((((((nvfuser_index_t)blockDim.y) * T5.logical_size[4LL]) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))]);
Array<float, 4, 4> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
Array<float, 1, 1> T9;
T9[0]
= T14[i2]
* T13[i2];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
T15[i2]
= T10[0]
+ T12[i2];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T11[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((12 * T1.logical_size[4LL]) * ((nvfuser_index_t)threadIdx.y))) + (((12 * ((nvfuser_index_t)blockDim.y)) * T1.logical_size[4LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))], &T15[0]);
} else {
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
T12[i0]
= T1[((((((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x))) + (T1.alloc_stride[1LL] * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL]))) && (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) < ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL])))) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T14;
T14.set(float(0));
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T14[0], &T5[((((4 * ((nvfuser_index_t)threadIdx.x)) + (((T5.logical_size[4LL] * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))) + ((((((nvfuser_index_t)blockDim.y) * T5.logical_size[4LL]) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))]);
}
Array<float, 4, 4> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
Array<float, 1, 1> T9;
T9[0]
= T14[i2]
* T13[i2];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
T15[i2]
= T10[0]
+ T12[i2];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T11[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((12 * T1.logical_size[4LL]) * ((nvfuser_index_t)threadIdx.y))) + (((12 * ((nvfuser_index_t)blockDim.y)) * T1.logical_size[4LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))], &T15[0]);
}
}
}
// NVFuser auto-generated pointwise kernel.
//
// Inputs:
//   T1  - global tensor read through its allocation strides (alloc_stride),
//         i.e. it may be non-contiguous, so it is gathered with scalar loads.
//   T8  - global tensor read with indices derived from T5/T1 logical sizes.
//   T5  - global tensor read with one vectorized (vec_size=4) streaming load.
// Output:
//   T11 - global tensor written with one vectorized (vec_size=4) store.
//
// Each thread produces 4 consecutive output elements. Per element the
// computation visible below is: T9 = T14 * T13; T10 = T9; T15 = T10 + T12,
// i.e. T15 = (T5 elem) * (T8 elem) + (T1 elem), stored to T11.
//
// The kernel is split into two paths:
//  - fast path: the guard proves the last vectorized element (base offset +3)
//    is in bounds for every index expression, so all loads/stores run
//    unpredicated and the T5 load / T11 store are 4-wide vector ops;
//  - fallback ("else") path: the same computation with per-element (or
//    per-vector) bounds predicates; out-of-bounds lanes keep zeros.
__global__ void nvfuser_N(Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T8, Tensor<float, 5, 5> T5, Tensor<float, 5, 5> T11) {
NVFUSER_DEFINE_MAGIC_ZERO;
// Fast-path guard: checks (1) the row index threadIdx.y + blockDim.y*blockIdx.x
// against T1.logical_size[0], (2) blockIdx.y against the ceil-divided extent of
// the flattened inner dimension, (3) the +3 (last) vectorized element against
// 12*T1.logical_size[4], and (4) the derived T8 index against
// T5.logical_size[2]*T5.logical_size[3]. All four must hold to skip predication.
if ((((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL]))) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL])))) {
// T12: 4 elements gathered from T1. Zero-init first, then scalar loads
// through T1.alloc_stride (div/mod chains decompose the flat inner index
// into the 5 logical dimensions).
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
// nvfuser_zero is an always-zero value kept opaque to the compiler so the
// unrolled index arithmetic is not constant-folded away (magic-zero idiom).
T12[i0]
= T1[((((((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x))) + (T1.alloc_stride[1LL] * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// T13: 4 elements gathered from T8 (flat index rebuilt from ceilDiv'd
// extents of T5's logical sizes; scalar loads since the access is strided).
Array<float, 4, 1> T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// T14: one 4-wide vectorized load from T5 (contiguous innermost access),
// with a streaming cache hint (data not expected to be reused).
Array<float, 4, 4> T14;
T14.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T14[0], &T5[((((4 * ((nvfuser_index_t)threadIdx.x)) + (((T5.logical_size[4LL] * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))) + ((((((nvfuser_index_t)blockDim.y) * T5.logical_size[4LL]) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))]);
// Elementwise compute: T9 = T14 * T13; T10 = T9 (pass-through copy emitted
// by codegen); T15 = T10 + T12 — a per-element multiply-add.
Array<float, 4, 4> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
Array<float, 1, 1> T9;
T9[0]
= T14[i2]
* T13[i2];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
T15[i2]
= T10[0]
+ T12[i2];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// One 4-wide vectorized store of the result to the output tensor T11.
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T11[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((12 * T1.logical_size[4LL]) * ((nvfuser_index_t)threadIdx.y))) + (((12 * ((nvfuser_index_t)blockDim.y)) * T1.logical_size[4LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))], &T15[0]);
} else {
// Fallback path: identical dataflow (T15 = T14*T13 + T12 -> T11), but every
// global access is guarded by a bounds predicate. Arrays are zero-filled
// first so lanes whose predicate fails contribute well-defined zeros.
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
// Predicate checks the last element of the 4-wide group (offset +3); the
// whole group is loaded or skipped together.
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
T12[i0]
= T1[((((((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x))) + (T1.alloc_stride[1LL] * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
// Per-element predicate for the T8 gather: validates the derived index
// against T5.logical_size[2]*T5.logical_size[3] for each i1.
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL])))) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
// T14: vectorized T5 load, predicated on the whole 4-wide group being in
// bounds (vector accesses cannot be split per element).
Array<float, 4, 4> T14;
T14.set(float(0));
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T14[0], &T5[((((4 * ((nvfuser_index_t)threadIdx.x)) + (((T5.logical_size[4LL] * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))) + ((((((nvfuser_index_t)blockDim.y) * T5.logical_size[4LL]) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))]);
}
// Same elementwise multiply-add as the fast path (registers only, so no
// predication is needed here).
Array<float, 4, 4> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
Array<float, 1, 1> T9;
T9[0]
= T14[i2]
* T13[i2];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
T15[i2]
= T10[0]
+ T12[i2];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Predicated vectorized store to T11; out-of-bounds threads write nothing.
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T11[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((12 * T1.logical_size[4LL]) * ((nvfuser_index_t)threadIdx.y))) + (((12 * ((nvfuser_index_t)blockDim.y)) * T1.logical_size[4LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))], &T15[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,8 +1,8 @@
__global__ void nvfuser_N(Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T8, Tensor<float, 5, 5> T5, Tensor<float, 5, 5> T11) {
NVFUSER_DEFINE_MAGIC_ZERO;
- if (((((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL]))) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL]))) && (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) < ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL])))) {
+ if ((((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL]))) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL])))) {
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
@@ -65,11 +65,11 @@
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
- if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL]))) && (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) < ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL])))) {
+ if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL])))) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3[48]
)
{
.reg .pred %p<47>;
.reg .f32 %f<97>;
.reg .b32 %r<372>;
.reg .b64 %rd<49>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r66, %r67}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+8];
ld.param.v2.u32 {%r70, %r71}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+24];
ld.param.v2.u32 {%r72, %r73}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+32];
ld.param.v2.u32 {%r74, %r75}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+40];
ld.param.v2.u32 {%r76, %r77}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+24];
ld.param.v2.u32 {%r78, %r79}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+16];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd7;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2];
mov.u32 %r4, %tid.x;
setp.ne.s32 %p1, %r4, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r90, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s], %r90;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd8, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r91, [%rd8], %r4;
ld.shared.u32 %r12, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_40def831_1911011nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s];
mov.u32 %r92, %ctaid.x;
mov.u32 %r93, %ntid.y;
mul.lo.s32 %r13, %r93, %r92;
mov.u32 %r14, %tid.y;
add.s32 %r15, %r13, %r14;
setp.lt.s32 %p2, %r15, %r66;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_6;
$L__BB0_3:
mov.u32 %r16, %ctaid.y;
mul.lo.s32 %r17, %r70, 12;
or.b32 %r94, %r17, 3;
shr.s32 %r95, %r94, 31;
shr.u32 %r96, %r95, 30;
add.s32 %r97, %r94, %r96;
shr.s32 %r98, %r97, 2;
mov.u32 %r18, %ntid.x;
add.s32 %r99, %r18, %r98;
add.s32 %r100, %r99, -1;
div.s32 %r101, %r100, %r18;
setp.ge.s32 %p3, %r16, %r101;
@%p3 bra $L__BB0_6;
shl.b32 %r19, %r4, 2;
mul.lo.s32 %r102, %r18, %r16;
shl.b32 %r20, %r102, 2;
add.s32 %r103, %r19, %r20;
or.b32 %r21, %r103, 3;
setp.ge.s32 %p4, %r21, %r17;
@%p4 bra $L__BB0_6;
mul.lo.s32 %r22, %r78, %r79;
add.s32 %r104, %r22, 1;
shr.u32 %r105, %r104, 31;
add.s32 %r106, %r104, %r105;
shr.s32 %r23, %r106, 1;
mul.lo.s32 %r24, %r70, 6;
div.s32 %r107, %r21, %r24;
mul.lo.s32 %r108, %r107, %r24;
sub.s32 %r109, %r21, %r108;
mul.lo.s32 %r25, %r70, 3;
div.s32 %r110, %r109, %r25;
mul.lo.s32 %r111, %r110, %r25;
sub.s32 %r112, %r109, %r111;
mul.lo.s32 %r113, %r23, %r76;
add.s32 %r114, %r113, 1;
shr.u32 %r115, %r114, 31;
add.s32 %r116, %r114, %r115;
shr.s32 %r26, %r116, 1;
mad.lo.s32 %r117, %r110, %r26, %r112;
div.s32 %r118, %r117, %r76;
mad.lo.s32 %r119, %r107, %r23, %r118;
setp.lt.s32 %p5, %r119, %r22;
setp.lt.s32 %p6, %r117, %r113;
and.pred %p7, %p5, %p6;
@%p7 bra $L__BB0_37;
bra.uni $L__BB0_6;
$L__BB0_37:
shl.b32 %r263, %r12, 1;
add.s32 %r264, %r20, %r19;
add.s32 %r265, %r264, %r263;
div.s32 %r266, %r265, %r24;
mul.lo.s32 %r267, %r266, %r24;
sub.s32 %r268, %r265, %r267;
div.s32 %r269, %r268, %r25;
mul.lo.s32 %r270, %r269, %r25;
sub.s32 %r271, %r268, %r270;
div.s32 %r272, %r271, %r70;
mul.lo.s32 %r273, %r272, %r70;
sub.s32 %r274, %r271, %r273;
mul.lo.s32 %r275, %r15, %r71;
mad.lo.s32 %r276, %r266, %r72, %r275;
mad.lo.s32 %r277, %r269, %r73, %r276;
mad.lo.s32 %r278, %r272, %r74, %r277;
mad.lo.s32 %r279, %r274, %r75, %r278;
mul.wide.s32 %rd31, %r279, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.f32 %f57, [%rd32];
add.s32 %r280, %r265, 1;
div.s32 %r281, %r280, %r24;
mul.lo.s32 %r282, %r281, %r24;
sub.s32 %r283, %r280, %r282;
div.s32 %r284, %r283, %r25;
mul.lo.s32 %r285, %r284, %r25;
sub.s32 %r286, %r283, %r285;
div.s32 %r287, %r286, %r70;
mul.lo.s32 %r288, %r287, %r70;
sub.s32 %r289, %r286, %r288;
mad.lo.s32 %r290, %r281, %r72, %r275;
mad.lo.s32 %r291, %r284, %r73, %r290;
mad.lo.s32 %r292, %r287, %r74, %r291;
mad.lo.s32 %r293, %r289, %r75, %r292;
mul.wide.s32 %rd33, %r293, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.f32 %f58, [%rd34];
add.s32 %r294, %r265, 2;
div.s32 %r295, %r294, %r24;
mul.lo.s32 %r296, %r295, %r24;
sub.s32 %r297, %r294, %r296;
div.s32 %r298, %r297, %r25;
mul.lo.s32 %r299, %r298, %r25;
sub.s32 %r300, %r297, %r299;
div.s32 %r301, %r300, %r70;
mul.lo.s32 %r302, %r301, %r70;
sub.s32 %r303, %r300, %r302;
mad.lo.s32 %r304, %r295, %r72, %r275;
mad.lo.s32 %r305, %r298, %r73, %r304;
mad.lo.s32 %r306, %r301, %r74, %r305;
mad.lo.s32 %r307, %r303, %r75, %r306;
mul.wide.s32 %rd35, %r307, 4;
add.s64 %rd36, %rd1, %rd35;
ld.global.f32 %f59, [%rd36];
add.s32 %r308, %r265, 3;
div.s32 %r309, %r308, %r24;
mul.lo.s32 %r310, %r309, %r24;
sub.s32 %r311, %r308, %r310;
div.s32 %r312, %r311, %r25;
mul.lo.s32 %r313, %r312, %r25;
sub.s32 %r314, %r311, %r313;
div.s32 %r315, %r314, %r70;
mul.lo.s32 %r316, %r315, %r70;
sub.s32 %r317, %r314, %r316;
mad.lo.s32 %r318, %r309, %r72, %r275;
mad.lo.s32 %r319, %r312, %r73, %r318;
mad.lo.s32 %r320, %r315, %r74, %r319;
mad.lo.s32 %r321, %r317, %r75, %r320;
mul.wide.s32 %rd37, %r321, 4;
add.s64 %rd38, %rd1, %rd37;
ld.global.f32 %f60, [%rd38];
shl.b32 %r322, %r12, 3;
add.s32 %r323, %r264, %r322;
div.s32 %r324, %r323, %r24;
mul.lo.s32 %r325, %r324, %r24;
sub.s32 %r326, %r323, %r325;
div.s32 %r327, %r326, %r25;
mul.lo.s32 %r328, %r327, %r25;
sub.s32 %r329, %r326, %r328;
mad.lo.s32 %r330, %r327, %r26, %r329;
div.s32 %r331, %r330, %r76;
mul.lo.s32 %r332, %r22, %r15;
mad.lo.s32 %r333, %r324, %r23, %r332;
add.s32 %r334, %r333, %r331;
mul.wide.s32 %rd39, %r334, 4;
add.s64 %rd40, %rd2, %rd39;
ld.global.f32 %f61, [%rd40];
add.s32 %r335, %r323, 1;
div.s32 %r336, %r335, %r24;
mul.lo.s32 %r337, %r336, %r24;
sub.s32 %r338, %r335, %r337;
div.s32 %r339, %r338, %r25;
mul.lo.s32 %r340, %r339, %r25;
sub.s32 %r341, %r338, %r340;
mad.lo.s32 %r342, %r339, %r26, %r341;
div.s32 %r343, %r342, %r76;
mad.lo.s32 %r344, %r336, %r23, %r332;
add.s32 %r345, %r344, %r343;
mul.wide.s32 %rd41, %r345, 4;
add.s64 %rd42, %rd2, %rd41;
ld.global.f32 %f62, [%rd42];
add.s32 %r346, %r323, 2;
div.s32 %r347, %r346, %r24;
mul.lo.s32 %r348, %r347, %r24;
sub.s32 %r349, %r346, %r348;
div.s32 %r350, %r349, %r25;
mul.lo.s32 %r351, %r350, %r25;
sub.s32 %r352, %r349, %r351;
mad.lo.s32 %r353, %r350, %r26, %r352;
div.s32 %r354, %r353, %r76;
mad.lo.s32 %r355, %r347, %r23, %r332;
add.s32 %r356, %r355, %r354;
mul.wide.s32 %rd43, %r356, 4;
add.s64 %rd44, %rd2, %rd43;
ld.global.f32 %f63, [%rd44];
add.s32 %r357, %r323, 3;
div.s32 %r358, %r357, %r24;
mul.lo.s32 %r359, %r358, %r24;
sub.s32 %r360, %r357, %r359;
div.s32 %r361, %r360, %r25;
mul.lo.s32 %r362, %r361, %r25;
sub.s32 %r363, %r360, %r362;
mad.lo.s32 %r364, %r361, %r26, %r363;
div.s32 %r365, %r364, %r76;
mad.lo.s32 %r366, %r358, %r23, %r332;
add.s32 %r367, %r366, %r365;
mul.wide.s32 %rd45, %r367, 4;
add.s64 %rd46, %rd2, %rd45;
ld.global.f32 %f64, [%rd46];
mul.lo.s32 %r368, %r78, %r76;
mul.lo.s32 %r369, %r15, %r79;
mad.lo.s32 %r370, %r368, %r369, %r264;
mul.wide.s32 %rd47, %r370, 4;
add.s64 %rd29, %rd3, %rd47;
// begin inline asm
ld.global.cs.v4.u32 {%r255,%r256,%r257,%r258}, [%rd29];
// end inline asm
mov.b32 %f65, %r255;
fma.rn.f32 %f66, %f61, %f65, %f57;
mov.b32 %r259, %f66;
mov.b32 %f67, %r256;
fma.rn.f32 %f68, %f62, %f67, %f58;
mov.b32 %r260, %f68;
mov.b32 %f69, %r257;
fma.rn.f32 %f70, %f63, %f69, %f59;
mov.b32 %r261, %f70;
mov.b32 %f71, %r258;
fma.rn.f32 %f72, %f64, %f71, %f60;
mov.b32 %r262, %f72;
mad.lo.s32 %r371, %r17, %r15, %r264;
mul.wide.s32 %rd48, %r371, 4;
add.s64 %rd30, %rd6, %rd48;
// begin inline asm
st.global.cs.v4.s32 [%rd30], {%r259,%r260,%r261,%r262};
// end inline asm
bra.uni $L__BB0_38;
$L__BB0_6:
setp.ge.s32 %p8, %r15, %r66;
mul.lo.s32 %r27, %r70, 12;
or.b32 %r120, %r27, 3;
shr.s32 %r121, %r120, 31;
shr.u32 %r122, %r121, 30;
add.s32 %r123, %r120, %r122;
shr.s32 %r124, %r123, 2;
mov.u32 %r29, %ntid.x;
add.s32 %r125, %r29, %r124;
add.s32 %r28, %r125, -1;
shl.b32 %r126, %r4, 2;
shl.b32 %r127, %r29, 2;
mov.u32 %r30, %ctaid.y;
mad.lo.s32 %r128, %r127, %r30, %r126;
or.b32 %r31, %r128, 3;
shl.b32 %r129, %r12, 1;
add.s32 %r130, %r31, %r129;
add.s32 %r32, %r130, -3;
mul.lo.s32 %r33, %r70, 6;
mul.lo.s32 %r34, %r70, 3;
mul.lo.s32 %r35, %r15, %r71;
mov.f32 %f86, 0f00000000;
mov.f32 %f85, %f86;
@%p8 bra $L__BB0_9;
div.s32 %r131, %r28, %r29;
setp.ge.s32 %p9, %r30, %r131;
setp.ge.s32 %p10, %r31, %r27;
or.pred %p11, %p9, %p10;
@%p11 bra $L__BB0_9;
div.s32 %r132, %r32, %r33;
mul.lo.s32 %r133, %r132, %r33;
sub.s32 %r134, %r32, %r133;
div.s32 %r135, %r134, %r34;
mul.lo.s32 %r136, %r135, %r34;
sub.s32 %r137, %r134, %r136;
div.s32 %r138, %r137, %r70;
mul.lo.s32 %r139, %r138, %r70;
sub.s32 %r140, %r137, %r139;
mad.lo.s32 %r141, %r132, %r72, %r35;
mad.lo.s32 %r142, %r135, %r73, %r141;
mad.lo.s32 %r143, %r138, %r74, %r142;
mad.lo.s32 %r144, %r140, %r75, %r143;
mul.wide.s32 %rd9, %r144, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f85, [%rd10];
$L__BB0_9:
@%p8 bra $L__BB0_12;
div.s32 %r145, %r28, %r29;
setp.ge.s32 %p13, %r30, %r145;
setp.ge.s32 %p14, %r31, %r27;
or.pred %p15, %p13, %p14;
@%p15 bra $L__BB0_12;
add.s32 %r146, %r32, 1;
div.s32 %r147, %r146, %r33;
mul.lo.s32 %r148, %r147, %r33;
sub.s32 %r149, %r146, %r148;
div.s32 %r150, %r149, %r34;
mul.lo.s32 %r151, %r150, %r34;
sub.s32 %r152, %r149, %r151;
div.s32 %r153, %r152, %r70;
mul.lo.s32 %r154, %r153, %r70;
sub.s32 %r155, %r152, %r154;
mad.lo.s32 %r156, %r147, %r72, %r35;
mad.lo.s32 %r157, %r150, %r73, %r156;
mad.lo.s32 %r158, %r153, %r74, %r157;
mad.lo.s32 %r159, %r155, %r75, %r158;
mul.wide.s32 %rd11, %r159, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f86, [%rd12];
$L__BB0_12:
mov.f32 %f88, 0f00000000;
mov.f32 %f87, %f88;
@%p8 bra $L__BB0_15;
div.s32 %r160, %r28, %r29;
setp.ge.s32 %p17, %r30, %r160;
setp.ge.s32 %p18, %r31, %r27;
or.pred %p19, %p17, %p18;
@%p19 bra $L__BB0_15;
add.s32 %r161, %r32, 2;
div.s32 %r162, %r161, %r33;
mul.lo.s32 %r163, %r162, %r33;
sub.s32 %r164, %r161, %r163;
div.s32 %r165, %r164, %r34;
mul.lo.s32 %r166, %r165, %r34;
sub.s32 %r167, %r164, %r166;
div.s32 %r168, %r167, %r70;
mul.lo.s32 %r169, %r168, %r70;
sub.s32 %r170, %r167, %r169;
mad.lo.s32 %r171, %r162, %r72, %r35;
mad.lo.s32 %r172, %r165, %r73, %r171;
mad.lo.s32 %r173, %r168, %r74, %r172;
mad.lo.s32 %r174, %r170, %r75, %r173;
mul.wide.s32 %rd13, %r174, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f87, [%rd14];
$L__BB0_15:
@%p8 bra $L__BB0_18;
div.s32 %r175, %r28, %r29;
setp.ge.s32 %p21, %r30, %r175;
setp.ge.s32 %p22, %r31, %r27;
or.pred %p23, %p21, %p22;
@%p23 bra $L__BB0_18;
add.s32 %r176, %r32, 3;
div.s32 %r177, %r176, %r33;
mul.lo.s32 %r178, %r177, %r33;
sub.s32 %r179, %r176, %r178;
div.s32 %r180, %r179, %r34;
mul.lo.s32 %r181, %r180, %r34;
sub.s32 %r182, %r179, %r181;
div.s32 %r183, %r182, %r70;
mul.lo.s32 %r184, %r183, %r70;
sub.s32 %r185, %r182, %r184;
mad.lo.s32 %r186, %r177, %r72, %r35;
mad.lo.s32 %r187, %r180, %r73, %r186;
mad.lo.s32 %r188, %r183, %r74, %r187;
mad.lo.s32 %r189, %r185, %r75, %r188;
mul.wide.s32 %rd15, %r189, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f88, [%rd16];
$L__BB0_18:
mul.lo.s32 %r36, %r78, %r79;
add.s32 %r190, %r36, 1;
shr.u32 %r191, %r190, 31;
add.s32 %r192, %r190, %r191;
shr.s32 %r37, %r192, 1;
mad.lo.s32 %r193, %r30, %r29, %r4;
shl.b32 %r194, %r193, 2;
shl.b32 %r195, %r12, 3;
add.s32 %r38, %r195, %r194;
mul.lo.s32 %r39, %r37, %r76;
add.s32 %r196, %r39, 1;
shr.u32 %r197, %r196, 31;
add.s32 %r198, %r196, %r197;
shr.s32 %r40, %r198, 1;
mul.lo.s32 %r41, %r36, %r15;
mov.f32 %f90, 0f00000000;
mov.f32 %f89, %f90;
@%p8 bra $L__BB0_21;
div.s32 %r199, %r38, %r33;
mul.lo.s32 %r200, %r199, %r33;
sub.s32 %r201, %r38, %r200;
div.s32 %r202, %r201, %r34;
mul.lo.s32 %r203, %r202, %r34;
sub.s32 %r204, %r201, %r203;
mad.lo.s32 %r205, %r202, %r40, %r204;
div.s32 %r206, %r205, %r76;
mad.lo.s32 %r42, %r199, %r37, %r206;
setp.ge.s32 %p25, %r42, %r36;
setp.ge.s32 %p26, %r205, %r39;
or.pred %p27, %p25, %p26;
@%p27 bra $L__BB0_21;
add.s32 %r207, %r42, %r41;
mul.wide.s32 %rd17, %r207, 4;
add.s64 %rd18, %rd2, %rd17;
ld.global.f32 %f89, [%rd18];
$L__BB0_21:
@%p8 bra $L__BB0_24;
add.s32 %r208, %r38, 1;
div.s32 %r209, %r208, %r33;
mul.lo.s32 %r210, %r209, %r33;
sub.s32 %r211, %r208, %r210;
div.s32 %r212, %r211, %r34;
mul.lo.s32 %r213, %r212, %r34;
sub.s32 %r214, %r211, %r213;
mad.lo.s32 %r215, %r212, %r40, %r214;
div.s32 %r216, %r215, %r76;
mad.lo.s32 %r43, %r209, %r37, %r216;
setp.ge.s32 %p29, %r43, %r36;
setp.ge.s32 %p30, %r215, %r39;
or.pred %p31, %p29, %p30;
@%p31 bra $L__BB0_24;
add.s32 %r217, %r43, %r41;
mul.wide.s32 %rd19, %r217, 4;
add.s64 %rd20, %rd2, %rd19;
ld.global.f32 %f90, [%rd20];
$L__BB0_24:
mov.f32 %f92, 0f00000000;
mov.f32 %f91, %f92;
@%p8 bra $L__BB0_27;
add.s32 %r218, %r38, 2;
div.s32 %r219, %r218, %r33;
mul.lo.s32 %r220, %r219, %r33;
sub.s32 %r221, %r218, %r220;
div.s32 %r222, %r221, %r34;
mul.lo.s32 %r223, %r222, %r34;
sub.s32 %r224, %r221, %r223;
mad.lo.s32 %r225, %r222, %r40, %r224;
div.s32 %r226, %r225, %r76;
mad.lo.s32 %r44, %r219, %r37, %r226;
setp.ge.s32 %p33, %r44, %r36;
setp.ge.s32 %p34, %r225, %r39;
or.pred %p35, %p33, %p34;
@%p35 bra $L__BB0_27;
add.s32 %r227, %r44, %r41;
mul.wide.s32 %rd21, %r227, 4;
add.s64 %rd22, %rd2, %rd21;
ld.global.f32 %f91, [%rd22];
$L__BB0_27:
@%p8 bra $L__BB0_30;
add.s32 %r228, %r38, 3;
div.s32 %r229, %r228, %r33;
mul.lo.s32 %r230, %r229, %r33;
sub.s32 %r231, %r228, %r230;
div.s32 %r232, %r231, %r34;
mul.lo.s32 %r233, %r232, %r34;
sub.s32 %r234, %r231, %r233;
mad.lo.s32 %r235, %r232, %r40, %r234;
div.s32 %r236, %r235, %r76;
mad.lo.s32 %r45, %r229, %r37, %r236;
setp.ge.s32 %p37, %r45, %r36;
setp.ge.s32 %p38, %r235, %r39;
or.pred %p39, %p37, %p38;
@%p39 bra $L__BB0_30;
add.s32 %r237, %r45, %r41;
mul.wide.s32 %rd23, %r237, 4;
add.s64 %rd24, %rd2, %rd23;
ld.global.f32 %f92, [%rd24];
$L__BB0_30:
mov.f32 %f93, 0f00000000;
mov.f32 %f94, 0f00000000;
mov.f32 %f95, 0f00000000;
mov.f32 %f96, 0f00000000;
@%p8 bra $L__BB0_34;
div.s32 %r238, %r28, %r29;
setp.ge.s32 %p41, %r30, %r238;
@%p41 bra $L__BB0_34;
setp.ge.s32 %p42, %r31, %r27;
@%p42 bra $L__BB0_34;
mul.lo.s32 %r243, %r15, %r79;
mul.lo.s32 %r244, %r78, %r76;
mad.lo.s32 %r245, %r244, %r243, %r31;
add.s32 %r246, %r245, -3;
mul.wide.s32 %rd26, %r246, 4;
add.s64 %rd25, %rd3, %rd26;
// begin inline asm
ld.global.cs.v4.u32 {%r239,%r240,%r241,%r242}, [%rd25];
// end inline asm
mov.b32 %f96, %r239;
mov.b32 %f95, %r240;
mov.b32 %f94, %r241;
mov.b32 %f93, %r242;
$L__BB0_34:
fma.rn.f32 %f25, %f96, %f89, %f85;
fma.rn.f32 %f26, %f95, %f90, %f86;
fma.rn.f32 %f27, %f94, %f91, %f87;
fma.rn.f32 %f28, %f93, %f92, %f88;
@%p8 bra $L__BB0_38;
div.s32 %r247, %r28, %r29;
setp.ge.s32 %p44, %r30, %r247;
setp.ge.s32 %p45, %r31, %r27;
or.pred %p46, %p44, %p45;
@%p46 bra $L__BB0_38;
mad.lo.s32 %r252, %r27, %r14, %r31;
add.s32 %r253, %r252, -3;
mad.lo.s32 %r254, %r27, %r13, %r253;
mul.wide.s32 %rd28, %r254, 4;
add.s64 %rd27, %rd6, %rd28;
mov.b32 %r248, %f25;
mov.b32 %r249, %f26;
mov.b32 %r250, %f27;
mov.b32 %r251, %f28;
// begin inline asm
st.global.cs.v4.s32 [%rd27], {%r248,%r249,%r250,%r251};
// end inline asm
$L__BB0_38:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3[48]
)
{
.reg .pred %p<37>;
.reg .f32 %f<97>;
.reg .b32 %r<370>;
.reg .b64 %rd<49>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r65, %r66}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+8];
ld.param.v2.u32 {%r69, %r70}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+24];
ld.param.v2.u32 {%r71, %r72}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+32];
ld.param.v2.u32 {%r73, %r74}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+40];
ld.param.v2.u32 {%r75, %r76}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+24];
ld.param.v2.u32 {%r77, %r78}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+16];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd7;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2];
mov.u32 %r4, %tid.x;
setp.ne.s32 %p1, %r4, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r89, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s], %r89;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd8, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r90, [%rd8], %r4;
ld.shared.u32 %r12, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_150_cu_aa8dd791_1601111nvfuser_150ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s];
mov.u32 %r91, %ctaid.x;
mov.u32 %r92, %ntid.y;
mul.lo.s32 %r13, %r92, %r91;
mov.u32 %r14, %tid.y;
add.s32 %r15, %r13, %r14;
setp.lt.s32 %p2, %r15, %r65;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_6;
$L__BB0_3:
mov.u32 %r16, %ctaid.y;
mul.lo.s32 %r17, %r69, 12;
or.b32 %r93, %r17, 3;
shr.s32 %r94, %r93, 31;
shr.u32 %r95, %r94, 30;
add.s32 %r96, %r93, %r95;
shr.s32 %r97, %r96, 2;
mov.u32 %r18, %ntid.x;
add.s32 %r98, %r18, %r97;
add.s32 %r99, %r98, -1;
div.s32 %r100, %r99, %r18;
setp.ge.s32 %p3, %r16, %r100;
@%p3 bra $L__BB0_6;
shl.b32 %r19, %r4, 2;
mul.lo.s32 %r101, %r18, %r16;
shl.b32 %r20, %r101, 2;
add.s32 %r102, %r19, %r20;
or.b32 %r21, %r102, 3;
setp.ge.s32 %p4, %r21, %r17;
@%p4 bra $L__BB0_6;
mul.lo.s32 %r22, %r77, %r78;
add.s32 %r103, %r22, 1;
shr.u32 %r104, %r103, 31;
add.s32 %r105, %r103, %r104;
shr.s32 %r23, %r105, 1;
mul.lo.s32 %r24, %r69, 6;
div.s32 %r106, %r21, %r24;
mul.lo.s32 %r107, %r106, %r24;
sub.s32 %r108, %r21, %r107;
mul.lo.s32 %r25, %r69, 3;
div.s32 %r109, %r108, %r25;
mul.lo.s32 %r110, %r109, %r25;
sub.s32 %r111, %r108, %r110;
mad.lo.s32 %r112, %r23, %r75, 1;
shr.u32 %r113, %r112, 31;
add.s32 %r114, %r112, %r113;
shr.s32 %r26, %r114, 1;
mad.lo.s32 %r115, %r109, %r26, %r111;
div.s32 %r116, %r115, %r75;
mad.lo.s32 %r117, %r106, %r23, %r116;
setp.lt.s32 %p5, %r117, %r22;
@%p5 bra $L__BB0_37;
bra.uni $L__BB0_6;
$L__BB0_37:
shl.b32 %r261, %r12, 1;
add.s32 %r262, %r20, %r19;
add.s32 %r263, %r262, %r261;
div.s32 %r264, %r263, %r24;
mul.lo.s32 %r265, %r264, %r24;
sub.s32 %r266, %r263, %r265;
div.s32 %r267, %r266, %r25;
mul.lo.s32 %r268, %r267, %r25;
sub.s32 %r269, %r266, %r268;
div.s32 %r270, %r269, %r69;
mul.lo.s32 %r271, %r270, %r69;
sub.s32 %r272, %r269, %r271;
mul.lo.s32 %r273, %r15, %r70;
mad.lo.s32 %r274, %r264, %r71, %r273;
mad.lo.s32 %r275, %r267, %r72, %r274;
mad.lo.s32 %r276, %r270, %r73, %r275;
mad.lo.s32 %r277, %r272, %r74, %r276;
mul.wide.s32 %rd31, %r277, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.f32 %f57, [%rd32];
add.s32 %r278, %r263, 1;
div.s32 %r279, %r278, %r24;
mul.lo.s32 %r280, %r279, %r24;
sub.s32 %r281, %r278, %r280;
div.s32 %r282, %r281, %r25;
mul.lo.s32 %r283, %r282, %r25;
sub.s32 %r284, %r281, %r283;
div.s32 %r285, %r284, %r69;
mul.lo.s32 %r286, %r285, %r69;
sub.s32 %r287, %r284, %r286;
mad.lo.s32 %r288, %r279, %r71, %r273;
mad.lo.s32 %r289, %r282, %r72, %r288;
mad.lo.s32 %r290, %r285, %r73, %r289;
mad.lo.s32 %r291, %r287, %r74, %r290;
mul.wide.s32 %rd33, %r291, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.f32 %f58, [%rd34];
add.s32 %r292, %r263, 2;
div.s32 %r293, %r292, %r24;
mul.lo.s32 %r294, %r293, %r24;
sub.s32 %r295, %r292, %r294;
div.s32 %r296, %r295, %r25;
mul.lo.s32 %r297, %r296, %r25;
sub.s32 %r298, %r295, %r297;
div.s32 %r299, %r298, %r69;
mul.lo.s32 %r300, %r299, %r69;
sub.s32 %r301, %r298, %r300;
mad.lo.s32 %r302, %r293, %r71, %r273;
mad.lo.s32 %r303, %r296, %r72, %r302;
mad.lo.s32 %r304, %r299, %r73, %r303;
mad.lo.s32 %r305, %r301, %r74, %r304;
mul.wide.s32 %rd35, %r305, 4;
add.s64 %rd36, %rd1, %rd35;
ld.global.f32 %f59, [%rd36];
add.s32 %r306, %r263, 3;
div.s32 %r307, %r306, %r24;
mul.lo.s32 %r308, %r307, %r24;
sub.s32 %r309, %r306, %r308;
div.s32 %r310, %r309, %r25;
mul.lo.s32 %r311, %r310, %r25;
sub.s32 %r312, %r309, %r311;
div.s32 %r313, %r312, %r69;
mul.lo.s32 %r314, %r313, %r69;
sub.s32 %r315, %r312, %r314;
mad.lo.s32 %r316, %r307, %r71, %r273;
mad.lo.s32 %r317, %r310, %r72, %r316;
mad.lo.s32 %r318, %r313, %r73, %r317;
mad.lo.s32 %r319, %r315, %r74, %r318;
mul.wide.s32 %rd37, %r319, 4;
add.s64 %rd38, %rd1, %rd37;
ld.global.f32 %f60, [%rd38];
shl.b32 %r320, %r12, 3;
add.s32 %r321, %r262, %r320;
div.s32 %r322, %r321, %r24;
mul.lo.s32 %r323, %r322, %r24;
sub.s32 %r324, %r321, %r323;
div.s32 %r325, %r324, %r25;
mul.lo.s32 %r326, %r325, %r25;
sub.s32 %r327, %r324, %r326;
mad.lo.s32 %r328, %r325, %r26, %r327;
div.s32 %r329, %r328, %r75;
mul.lo.s32 %r330, %r22, %r15;
mad.lo.s32 %r331, %r322, %r23, %r330;
add.s32 %r332, %r331, %r329;
mul.wide.s32 %rd39, %r332, 4;
add.s64 %rd40, %rd2, %rd39;
ld.global.f32 %f61, [%rd40];
add.s32 %r333, %r321, 1;
div.s32 %r334, %r333, %r24;
mul.lo.s32 %r335, %r334, %r24;
sub.s32 %r336, %r333, %r335;
div.s32 %r337, %r336, %r25;
mul.lo.s32 %r338, %r337, %r25;
sub.s32 %r339, %r336, %r338;
mad.lo.s32 %r340, %r337, %r26, %r339;
div.s32 %r341, %r340, %r75;
mad.lo.s32 %r342, %r334, %r23, %r330;
add.s32 %r343, %r342, %r341;
mul.wide.s32 %rd41, %r343, 4;
add.s64 %rd42, %rd2, %rd41;
ld.global.f32 %f62, [%rd42];
add.s32 %r344, %r321, 2;
div.s32 %r345, %r344, %r24;
mul.lo.s32 %r346, %r345, %r24;
sub.s32 %r347, %r344, %r346;
div.s32 %r348, %r347, %r25;
mul.lo.s32 %r349, %r348, %r25;
sub.s32 %r350, %r347, %r349;
mad.lo.s32 %r351, %r348, %r26, %r350;
div.s32 %r352, %r351, %r75;
mad.lo.s32 %r353, %r345, %r23, %r330;
add.s32 %r354, %r353, %r352;
mul.wide.s32 %rd43, %r354, 4;
add.s64 %rd44, %rd2, %rd43;
ld.global.f32 %f63, [%rd44];
add.s32 %r355, %r321, 3;
div.s32 %r356, %r355, %r24;
mul.lo.s32 %r357, %r356, %r24;
sub.s32 %r358, %r355, %r357;
div.s32 %r359, %r358, %r25;
mul.lo.s32 %r360, %r359, %r25;
sub.s32 %r361, %r358, %r360;
mad.lo.s32 %r362, %r359, %r26, %r361;
div.s32 %r363, %r362, %r75;
mad.lo.s32 %r364, %r356, %r23, %r330;
add.s32 %r365, %r364, %r363;
mul.wide.s32 %rd45, %r365, 4;
add.s64 %rd46, %rd2, %rd45;
ld.global.f32 %f64, [%rd46];
mul.lo.s32 %r366, %r77, %r75;
mul.lo.s32 %r367, %r15, %r78;
mad.lo.s32 %r368, %r366, %r367, %r262;
mul.wide.s32 %rd47, %r368, 4;
add.s64 %rd29, %rd3, %rd47;
// begin inline asm
ld.global.cs.v4.u32 {%r253,%r254,%r255,%r256}, [%rd29];
// end inline asm
mov.b32 %f65, %r253;
fma.rn.f32 %f66, %f61, %f65, %f57;
mov.b32 %r257, %f66;
mov.b32 %f67, %r254;
fma.rn.f32 %f68, %f62, %f67, %f58;
mov.b32 %r258, %f68;
mov.b32 %f69, %r255;
fma.rn.f32 %f70, %f63, %f69, %f59;
mov.b32 %r259, %f70;
mov.b32 %f71, %r256;
fma.rn.f32 %f72, %f64, %f71, %f60;
mov.b32 %r260, %f72;
mad.lo.s32 %r369, %r17, %r15, %r262;
mul.wide.s32 %rd48, %r369, 4;
add.s64 %rd30, %rd6, %rd48;
// begin inline asm
st.global.cs.v4.s32 [%rd30], {%r257,%r258,%r259,%r260};
// end inline asm
bra.uni $L__BB0_38;
$L__BB0_6:
setp.ge.s32 %p6, %r15, %r65;
mul.lo.s32 %r27, %r69, 12;
or.b32 %r118, %r27, 3;
shr.s32 %r119, %r118, 31;
shr.u32 %r120, %r119, 30;
add.s32 %r121, %r118, %r120;
shr.s32 %r122, %r121, 2;
mov.u32 %r29, %ntid.x;
add.s32 %r123, %r29, %r122;
add.s32 %r28, %r123, -1;
shl.b32 %r124, %r4, 2;
shl.b32 %r125, %r29, 2;
mov.u32 %r30, %ctaid.y;
mad.lo.s32 %r126, %r125, %r30, %r124;
or.b32 %r31, %r126, 3;
shl.b32 %r127, %r12, 1;
add.s32 %r128, %r31, %r127;
add.s32 %r32, %r128, -3;
mul.lo.s32 %r33, %r69, 6;
mul.lo.s32 %r34, %r69, 3;
mul.lo.s32 %r35, %r15, %r70;
mov.f32 %f86, 0f00000000;
mov.f32 %f85, %f86;
@%p6 bra $L__BB0_9;
div.s32 %r129, %r28, %r29;
setp.ge.s32 %p7, %r30, %r129;
setp.ge.s32 %p8, %r31, %r27;
or.pred %p9, %p7, %p8;
@%p9 bra $L__BB0_9;
div.s32 %r130, %r32, %r33;
mul.lo.s32 %r131, %r130, %r33;
sub.s32 %r132, %r32, %r131;
div.s32 %r133, %r132, %r34;
mul.lo.s32 %r134, %r133, %r34;
sub.s32 %r135, %r132, %r134;
div.s32 %r136, %r135, %r69;
mul.lo.s32 %r137, %r136, %r69;
sub.s32 %r138, %r135, %r137;
mad.lo.s32 %r139, %r130, %r71, %r35;
mad.lo.s32 %r140, %r133, %r72, %r139;
mad.lo.s32 %r141, %r136, %r73, %r140;
mad.lo.s32 %r142, %r138, %r74, %r141;
mul.wide.s32 %rd9, %r142, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f85, [%rd10];
$L__BB0_9:
@%p6 bra $L__BB0_12;
div.s32 %r143, %r28, %r29;
setp.ge.s32 %p11, %r30, %r143;
setp.ge.s32 %p12, %r31, %r27;
or.pred %p13, %p11, %p12;
@%p13 bra $L__BB0_12;
add.s32 %r144, %r32, 1;
div.s32 %r145, %r144, %r33;
mul.lo.s32 %r146, %r145, %r33;
sub.s32 %r147, %r144, %r146;
div.s32 %r148, %r147, %r34;
mul.lo.s32 %r149, %r148, %r34;
sub.s32 %r150, %r147, %r149;
div.s32 %r151, %r150, %r69;
mul.lo.s32 %r152, %r151, %r69;
sub.s32 %r153, %r150, %r152;
mad.lo.s32 %r154, %r145, %r71, %r35;
mad.lo.s32 %r155, %r148, %r72, %r154;
mad.lo.s32 %r156, %r151, %r73, %r155;
mad.lo.s32 %r157, %r153, %r74, %r156;
mul.wide.s32 %rd11, %r157, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f86, [%rd12];
$L__BB0_12:
mov.f32 %f88, 0f00000000;
mov.f32 %f87, %f88;
@%p6 bra $L__BB0_15;
div.s32 %r158, %r28, %r29;
setp.ge.s32 %p15, %r30, %r158;
setp.ge.s32 %p16, %r31, %r27;
or.pred %p17, %p15, %p16;
@%p17 bra $L__BB0_15;
add.s32 %r159, %r32, 2;
div.s32 %r160, %r159, %r33;
mul.lo.s32 %r161, %r160, %r33;
sub.s32 %r162, %r159, %r161;
div.s32 %r163, %r162, %r34;
mul.lo.s32 %r164, %r163, %r34;
sub.s32 %r165, %r162, %r164;
div.s32 %r166, %r165, %r69;
mul.lo.s32 %r167, %r166, %r69;
sub.s32 %r168, %r165, %r167;
mad.lo.s32 %r169, %r160, %r71, %r35;
mad.lo.s32 %r170, %r163, %r72, %r169;
mad.lo.s32 %r171, %r166, %r73, %r170;
mad.lo.s32 %r172, %r168, %r74, %r171;
mul.wide.s32 %rd13, %r172, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f87, [%rd14];
$L__BB0_15:
@%p6 bra $L__BB0_18;
div.s32 %r173, %r28, %r29;
setp.ge.s32 %p19, %r30, %r173;
setp.ge.s32 %p20, %r31, %r27;
or.pred %p21, %p19, %p20;
@%p21 bra $L__BB0_18;
add.s32 %r174, %r32, 3;
div.s32 %r175, %r174, %r33;
mul.lo.s32 %r176, %r175, %r33;
sub.s32 %r177, %r174, %r176;
div.s32 %r178, %r177, %r34;
mul.lo.s32 %r179, %r178, %r34;
sub.s32 %r180, %r177, %r179;
div.s32 %r181, %r180, %r69;
mul.lo.s32 %r182, %r181, %r69;
sub.s32 %r183, %r180, %r182;
mad.lo.s32 %r184, %r175, %r71, %r35;
mad.lo.s32 %r185, %r178, %r72, %r184;
mad.lo.s32 %r186, %r181, %r73, %r185;
mad.lo.s32 %r187, %r183, %r74, %r186;
mul.wide.s32 %rd15, %r187, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f88, [%rd16];
$L__BB0_18:
mul.lo.s32 %r36, %r77, %r78;
add.s32 %r188, %r36, 1;
shr.u32 %r189, %r188, 31;
add.s32 %r190, %r188, %r189;
shr.s32 %r37, %r190, 1;
mad.lo.s32 %r191, %r30, %r29, %r4;
shl.b32 %r192, %r191, 2;
shl.b32 %r193, %r12, 3;
add.s32 %r38, %r193, %r192;
mad.lo.s32 %r194, %r37, %r75, 1;
shr.u32 %r195, %r194, 31;
add.s32 %r196, %r194, %r195;
shr.s32 %r39, %r196, 1;
mul.lo.s32 %r40, %r36, %r15;
mov.f32 %f90, 0f00000000;
mov.f32 %f89, %f90;
@%p6 bra $L__BB0_21;
div.s32 %r197, %r38, %r33;
mul.lo.s32 %r198, %r197, %r33;
sub.s32 %r199, %r38, %r198;
div.s32 %r200, %r199, %r34;
mul.lo.s32 %r201, %r200, %r34;
sub.s32 %r202, %r199, %r201;
mad.lo.s32 %r203, %r200, %r39, %r202;
div.s32 %r204, %r203, %r75;
mad.lo.s32 %r41, %r197, %r37, %r204;
setp.ge.s32 %p23, %r41, %r36;
@%p23 bra $L__BB0_21;
add.s32 %r205, %r41, %r40;
mul.wide.s32 %rd17, %r205, 4;
add.s64 %rd18, %rd2, %rd17;
ld.global.f32 %f89, [%rd18];
$L__BB0_21:
@%p6 bra $L__BB0_24;
add.s32 %r206, %r38, 1;
div.s32 %r207, %r206, %r33;
mul.lo.s32 %r208, %r207, %r33;
sub.s32 %r209, %r206, %r208;
div.s32 %r210, %r209, %r34;
mul.lo.s32 %r211, %r210, %r34;
sub.s32 %r212, %r209, %r211;
mad.lo.s32 %r213, %r210, %r39, %r212;
div.s32 %r214, %r213, %r75;
mad.lo.s32 %r42, %r207, %r37, %r214;
setp.ge.s32 %p25, %r42, %r36;
@%p25 bra $L__BB0_24;
add.s32 %r215, %r42, %r40;
mul.wide.s32 %rd19, %r215, 4;
add.s64 %rd20, %rd2, %rd19;
ld.global.f32 %f90, [%rd20];
$L__BB0_24:
mov.f32 %f92, 0f00000000;
mov.f32 %f91, %f92;
@%p6 bra $L__BB0_27;
add.s32 %r216, %r38, 2;
div.s32 %r217, %r216, %r33;
mul.lo.s32 %r218, %r217, %r33;
sub.s32 %r219, %r216, %r218;
div.s32 %r220, %r219, %r34;
mul.lo.s32 %r221, %r220, %r34;
sub.s32 %r222, %r219, %r221;
mad.lo.s32 %r223, %r220, %r39, %r222;
div.s32 %r224, %r223, %r75;
mad.lo.s32 %r43, %r217, %r37, %r224;
setp.ge.s32 %p27, %r43, %r36;
@%p27 bra $L__BB0_27;
add.s32 %r225, %r43, %r40;
mul.wide.s32 %rd21, %r225, 4;
add.s64 %rd22, %rd2, %rd21;
ld.global.f32 %f91, [%rd22];
$L__BB0_27:
@%p6 bra $L__BB0_30;
add.s32 %r226, %r38, 3;
div.s32 %r227, %r226, %r33;
mul.lo.s32 %r228, %r227, %r33;
sub.s32 %r229, %r226, %r228;
div.s32 %r230, %r229, %r34;
mul.lo.s32 %r231, %r230, %r34;
sub.s32 %r232, %r229, %r231;
mad.lo.s32 %r233, %r230, %r39, %r232;
div.s32 %r234, %r233, %r75;
mad.lo.s32 %r44, %r227, %r37, %r234;
setp.ge.s32 %p29, %r44, %r36;
@%p29 bra $L__BB0_30;
add.s32 %r235, %r44, %r40;
mul.wide.s32 %rd23, %r235, 4;
add.s64 %rd24, %rd2, %rd23;
ld.global.f32 %f92, [%rd24];
$L__BB0_30:
mov.f32 %f93, 0f00000000;
mov.f32 %f94, 0f00000000;
mov.f32 %f95, 0f00000000;
mov.f32 %f96, 0f00000000;
@%p6 bra $L__BB0_34;
div.s32 %r236, %r28, %r29;
setp.ge.s32 %p31, %r30, %r236;
@%p31 bra $L__BB0_34;
setp.ge.s32 %p32, %r31, %r27;
@%p32 bra $L__BB0_34;
mul.lo.s32 %r241, %r15, %r78;
mul.lo.s32 %r242, %r77, %r75;
mad.lo.s32 %r243, %r242, %r241, %r31;
add.s32 %r244, %r243, -3;
mul.wide.s32 %rd26, %r244, 4;
add.s64 %rd25, %rd3, %rd26;
// begin inline asm
ld.global.cs.v4.u32 {%r237,%r238,%r239,%r240}, [%rd25];
// end inline asm
mov.b32 %f96, %r237;
mov.b32 %f95, %r238;
mov.b32 %f94, %r239;
mov.b32 %f93, %r240;
$L__BB0_34:
fma.rn.f32 %f25, %f96, %f89, %f85;
fma.rn.f32 %f26, %f95, %f90, %f86;
fma.rn.f32 %f27, %f94, %f91, %f87;
fma.rn.f32 %f28, %f93, %f92, %f88;
@%p6 bra $L__BB0_38;
div.s32 %r245, %r28, %r29;
setp.ge.s32 %p34, %r30, %r245;
setp.ge.s32 %p35, %r31, %r27;
or.pred %p36, %p34, %p35;
@%p36 bra $L__BB0_38;
mad.lo.s32 %r250, %r27, %r14, %r31;
add.s32 %r251, %r250, -3;
mad.lo.s32 %r252, %r27, %r13, %r251;
mul.wide.s32 %rd28, %r252, 4;
add.s64 %rd27, %rd6, %rd28;
mov.b32 %r246, %f25;
mov.b32 %r247, %f26;
mov.b32 %r248, %f27;
mov.b32 %r249, %f28;
// begin inline asm
st.global.cs.v4.s32 [%rd27], {%r246,%r247,%r248,%r249};
// end inline asm
$L__BB0_38:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,552 +20,540 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1[48],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2[48],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3[48]
)
{
- .reg .pred %p<47>;
+ .reg .pred %p<37>;
.reg .f32 %f<97>;
- .reg .b32 %r<372>;
+ .reg .b32 %r<370>;
.reg .b64 %rd<49>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
- ld.param.v2.u32 {%r66, %r67}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+8];
- ld.param.v2.u32 {%r70, %r71}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+24];
- ld.param.v2.u32 {%r72, %r73}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+32];
- ld.param.v2.u32 {%r74, %r75}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+40];
- ld.param.v2.u32 {%r76, %r77}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+24];
- ld.param.v2.u32 {%r78, %r79}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+16];
+ ld.param.v2.u32 {%r65, %r66}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+8];
+ ld.param.v2.u32 {%r69, %r70}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+24];
+ ld.param.v2.u32 {%r71, %r72}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+32];
+ ld.param.v2.u32 {%r73, %r74}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+40];
+ ld.param.v2.u32 {%r75, %r76}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+24];
+ ld.param.v2.u32 {%r77, %r78}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+16];
ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3];
ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0];
ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd7;
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2];
mov.u32 %r4, %tid.x;
setp.ne.s32 %p1, %r4, 0;
@%p1 bra $L__BB0_2;
- mov.u32 %r90, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s], %r90;
+ mov.u32 %r89, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s], %r89;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd8, _ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
- atom.shared.min.s32 %r91, [%rd8], %r4;
+ atom.shared.min.s32 %r90, [%rd8], %r4;
ld.shared.u32 %r12, [_ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s];
- mov.u32 %r92, %ctaid.x;
- mov.u32 %r93, %ntid.y;
- mul.lo.s32 %r13, %r93, %r92;
+ mov.u32 %r91, %ctaid.x;
+ mov.u32 %r92, %ntid.y;
+ mul.lo.s32 %r13, %r92, %r91;
mov.u32 %r14, %tid.y;
add.s32 %r15, %r13, %r14;
- setp.lt.s32 %p2, %r15, %r66;
+ setp.lt.s32 %p2, %r15, %r65;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_6;
$L__BB0_3:
mov.u32 %r16, %ctaid.y;
- mul.lo.s32 %r17, %r70, 12;
- or.b32 %r94, %r17, 3;
- shr.s32 %r95, %r94, 31;
- shr.u32 %r96, %r95, 30;
- add.s32 %r97, %r94, %r96;
- shr.s32 %r98, %r97, 2;
+ mul.lo.s32 %r17, %r69, 12;
+ or.b32 %r93, %r17, 3;
+ shr.s32 %r94, %r93, 31;
+ shr.u32 %r95, %r94, 30;
+ add.s32 %r96, %r93, %r95;
+ shr.s32 %r97, %r96, 2;
mov.u32 %r18, %ntid.x;
- add.s32 %r99, %r18, %r98;
- add.s32 %r100, %r99, -1;
- div.s32 %r101, %r100, %r18;
- setp.ge.s32 %p3, %r16, %r101;
+ add.s32 %r98, %r18, %r97;
+ add.s32 %r99, %r98, -1;
+ div.s32 %r100, %r99, %r18;
+ setp.ge.s32 %p3, %r16, %r100;
@%p3 bra $L__BB0_6;
shl.b32 %r19, %r4, 2;
- mul.lo.s32 %r102, %r18, %r16;
- shl.b32 %r20, %r102, 2;
- add.s32 %r103, %r19, %r20;
- or.b32 %r21, %r103, 3;
+ mul.lo.s32 %r101, %r18, %r16;
+ shl.b32 %r20, %r101, 2;
+ add.s32 %r102, %r19, %r20;
+ or.b32 %r21, %r102, 3;
setp.ge.s32 %p4, %r21, %r17;
@%p4 bra $L__BB0_6;
- mul.lo.s32 %r22, %r78, %r79;
- add.s32 %r104, %r22, 1;
- shr.u32 %r105, %r104, 31;
- add.s32 %r106, %r104, %r105;
- shr.s32 %r23, %r106, 1;
- mul.lo.s32 %r24, %r70, 6;
- div.s32 %r107, %r21, %r24;
- mul.lo.s32 %r108, %r107, %r24;
- sub.s32 %r109, %r21, %r108;
- mul.lo.s32 %r25, %r70, 3;
- div.s32 %r110, %r109, %r25;
- mul.lo.s32 %r111, %r110, %r25;
- sub.s32 %r112, %r109, %r111;
- mul.lo.s32 %r113, %r23, %r76;
- add.s32 %r114, %r113, 1;
- shr.u32 %r115, %r114, 31;
- add.s32 %r116, %r114, %r115;
- shr.s32 %r26, %r116, 1;
- mad.lo.s32 %r117, %r110, %r26, %r112;
- div.s32 %r118, %r117, %r76;
- mad.lo.s32 %r119, %r107, %r23, %r118;
- setp.lt.s32 %p5, %r119, %r22;
- setp.lt.s32 %p6, %r117, %r113;
- and.pred %p7, %p5, %p6;
- @%p7 bra $L__BB0_37;
+ mul.lo.s32 %r22, %r77, %r78;
+ add.s32 %r103, %r22, 1;
+ shr.u32 %r104, %r103, 31;
+ add.s32 %r105, %r103, %r104;
+ shr.s32 %r23, %r105, 1;
+ mul.lo.s32 %r24, %r69, 6;
+ div.s32 %r106, %r21, %r24;
+ mul.lo.s32 %r107, %r106, %r24;
+ sub.s32 %r108, %r21, %r107;
+ mul.lo.s32 %r25, %r69, 3;
+ div.s32 %r109, %r108, %r25;
+ mul.lo.s32 %r110, %r109, %r25;
+ sub.s32 %r111, %r108, %r110;
+ mad.lo.s32 %r112, %r23, %r75, 1;
+ shr.u32 %r113, %r112, 31;
+ add.s32 %r114, %r112, %r113;
+ shr.s32 %r26, %r114, 1;
+ mad.lo.s32 %r115, %r109, %r26, %r111;
+ div.s32 %r116, %r115, %r75;
+ mad.lo.s32 %r117, %r106, %r23, %r116;
+ setp.lt.s32 %p5, %r117, %r22;
+ @%p5 bra $L__BB0_37;
bra.uni $L__BB0_6;
$L__BB0_37:
- shl.b32 %r263, %r12, 1;
- add.s32 %r264, %r20, %r19;
- add.s32 %r265, %r264, %r263;
- div.s32 %r266, %r265, %r24;
- mul.lo.s32 %r267, %r266, %r24;
- sub.s32 %r268, %r265, %r267;
- div.s32 %r269, %r268, %r25;
- mul.lo.s32 %r270, %r269, %r25;
- sub.s32 %r271, %r268, %r270;
- div.s32 %r272, %r271, %r70;
- mul.lo.s32 %r273, %r272, %r70;
- sub.s32 %r274, %r271, %r273;
- mul.lo.s32 %r275, %r15, %r71;
- mad.lo.s32 %r276, %r266, %r72, %r275;
- mad.lo.s32 %r277, %r269, %r73, %r276;
- mad.lo.s32 %r278, %r272, %r74, %r277;
- mad.lo.s32 %r279, %r274, %r75, %r278;
- mul.wide.s32 %rd31, %r279, 4;
+ shl.b32 %r261, %r12, 1;
+ add.s32 %r262, %r20, %r19;
+ add.s32 %r263, %r262, %r261;
+ div.s32 %r264, %r263, %r24;
+ mul.lo.s32 %r265, %r264, %r24;
+ sub.s32 %r266, %r263, %r265;
+ div.s32 %r267, %r266, %r25;
+ mul.lo.s32 %r268, %r267, %r25;
+ sub.s32 %r269, %r266, %r268;
+ div.s32 %r270, %r269, %r69;
+ mul.lo.s32 %r271, %r270, %r69;
+ sub.s32 %r272, %r269, %r271;
+ mul.lo.s32 %r273, %r15, %r70;
+ mad.lo.s32 %r274, %r264, %r71, %r273;
+ mad.lo.s32 %r275, %r267, %r72, %r274;
+ mad.lo.s32 %r276, %r270, %r73, %r275;
+ mad.lo.s32 %r277, %r272, %r74, %r276;
+ mul.wide.s32 %rd31, %r277, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.f32 %f57, [%rd32];
- add.s32 %r280, %r265, 1;
- div.s32 %r281, %r280, %r24;
- mul.lo.s32 %r282, %r281, %r24;
- sub.s32 %r283, %r280, %r282;
- div.s32 %r284, %r283, %r25;
- mul.lo.s32 %r285, %r284, %r25;
- sub.s32 %r286, %r283, %r285;
- div.s32 %r287, %r286, %r70;
- mul.lo.s32 %r288, %r287, %r70;
- sub.s32 %r289, %r286, %r288;
- mad.lo.s32 %r290, %r281, %r72, %r275;
- mad.lo.s32 %r291, %r284, %r73, %r290;
- mad.lo.s32 %r292, %r287, %r74, %r291;
- mad.lo.s32 %r293, %r289, %r75, %r292;
- mul.wide.s32 %rd33, %r293, 4;
+ add.s32 %r278, %r263, 1;
+ div.s32 %r279, %r278, %r24;
+ mul.lo.s32 %r280, %r279, %r24;
+ sub.s32 %r281, %r278, %r280;
+ div.s32 %r282, %r281, %r25;
+ mul.lo.s32 %r283, %r282, %r25;
+ sub.s32 %r284, %r281, %r283;
+ div.s32 %r285, %r284, %r69;
+ mul.lo.s32 %r286, %r285, %r69;
+ sub.s32 %r287, %r284, %r286;
+ mad.lo.s32 %r288, %r279, %r71, %r273;
+ mad.lo.s32 %r289, %r282, %r72, %r288;
+ mad.lo.s32 %r290, %r285, %r73, %r289;
+ mad.lo.s32 %r291, %r287, %r74, %r290;
+ mul.wide.s32 %rd33, %r291, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.f32 %f58, [%rd34];
- add.s32 %r294, %r265, 2;
- div.s32 %r295, %r294, %r24;
- mul.lo.s32 %r296, %r295, %r24;
- sub.s32 %r297, %r294, %r296;
- div.s32 %r298, %r297, %r25;
- mul.lo.s32 %r299, %r298, %r25;
- sub.s32 %r300, %r297, %r299;
- div.s32 %r301, %r300, %r70;
- mul.lo.s32 %r302, %r301, %r70;
- sub.s32 %r303, %r300, %r302;
- mad.lo.s32 %r304, %r295, %r72, %r275;
- mad.lo.s32 %r305, %r298, %r73, %r304;
- mad.lo.s32 %r306, %r301, %r74, %r305;
- mad.lo.s32 %r307, %r303, %r75, %r306;
- mul.wide.s32 %rd35, %r307, 4;
+ add.s32 %r292, %r263, 2;
+ div.s32 %r293, %r292, %r24;
+ mul.lo.s32 %r294, %r293, %r24;
+ sub.s32 %r295, %r292, %r294;
+ div.s32 %r296, %r295, %r25;
+ mul.lo.s32 %r297, %r296, %r25;
+ sub.s32 %r298, %r295, %r297;
+ div.s32 %r299, %r298, %r69;
+ mul.lo.s32 %r300, %r299, %r69;
+ sub.s32 %r301, %r298, %r300;
+ mad.lo.s32 %r302, %r293, %r71, %r273;
+ mad.lo.s32 %r303, %r296, %r72, %r302;
+ mad.lo.s32 %r304, %r299, %r73, %r303;
+ mad.lo.s32 %r305, %r301, %r74, %r304;
+ mul.wide.s32 %rd35, %r305, 4;
add.s64 %rd36, %rd1, %rd35;
ld.global.f32 %f59, [%rd36];
- add.s32 %r308, %r265, 3;
- div.s32 %r309, %r308, %r24;
- mul.lo.s32 %r310, %r309, %r24;
- sub.s32 %r311, %r308, %r310;
- div.s32 %r312, %r311, %r25;
- mul.lo.s32 %r313, %r312, %r25;
- sub.s32 %r314, %r311, %r313;
- div.s32 %r315, %r314, %r70;
- mul.lo.s32 %r316, %r315, %r70;
- sub.s32 %r317, %r314, %r316;
- mad.lo.s32 %r318, %r309, %r72, %r275;
- mad.lo.s32 %r319, %r312, %r73, %r318;
- mad.lo.s32 %r320, %r315, %r74, %r319;
- mad.lo.s32 %r321, %r317, %r75, %r320;
- mul.wide.s32 %rd37, %r321, 4;
+ add.s32 %r306, %r263, 3;
+ div.s32 %r307, %r306, %r24;
+ mul.lo.s32 %r308, %r307, %r24;
+ sub.s32 %r309, %r306, %r308;
+ div.s32 %r310, %r309, %r25;
+ mul.lo.s32 %r311, %r310, %r25;
+ sub.s32 %r312, %r309, %r311;
+ div.s32 %r313, %r312, %r69;
+ mul.lo.s32 %r314, %r313, %r69;
+ sub.s32 %r315, %r312, %r314;
+ mad.lo.s32 %r316, %r307, %r71, %r273;
+ mad.lo.s32 %r317, %r310, %r72, %r316;
+ mad.lo.s32 %r318, %r313, %r73, %r317;
+ mad.lo.s32 %r319, %r315, %r74, %r318;
+ mul.wide.s32 %rd37, %r319, 4;
add.s64 %rd38, %rd1, %rd37;
ld.global.f32 %f60, [%rd38];
- shl.b32 %r322, %r12, 3;
- add.s32 %r323, %r264, %r322;
- div.s32 %r324, %r323, %r24;
- mul.lo.s32 %r325, %r324, %r24;
- sub.s32 %r326, %r323, %r325;
- div.s32 %r327, %r326, %r25;
- mul.lo.s32 %r328, %r327, %r25;
- sub.s32 %r329, %r326, %r328;
- mad.lo.s32 %r330, %r327, %r26, %r329;
- div.s32 %r331, %r330, %r76;
- mul.lo.s32 %r332, %r22, %r15;
- mad.lo.s32 %r333, %r324, %r23, %r332;
- add.s32 %r334, %r333, %r331;
- mul.wide.s32 %rd39, %r334, 4;
+ shl.b32 %r320, %r12, 3;
+ add.s32 %r321, %r262, %r320;
+ div.s32 %r322, %r321, %r24;
+ mul.lo.s32 %r323, %r322, %r24;
+ sub.s32 %r324, %r321, %r323;
+ div.s32 %r325, %r324, %r25;
+ mul.lo.s32 %r326, %r325, %r25;
+ sub.s32 %r327, %r324, %r326;
+ mad.lo.s32 %r328, %r325, %r26, %r327;
+ div.s32 %r329, %r328, %r75;
+ mul.lo.s32 %r330, %r22, %r15;
+ mad.lo.s32 %r331, %r322, %r23, %r330;
+ add.s32 %r332, %r331, %r329;
+ mul.wide.s32 %rd39, %r332, 4;
add.s64 %rd40, %rd2, %rd39;
ld.global.f32 %f61, [%rd40];
- add.s32 %r335, %r323, 1;
- div.s32 %r336, %r335, %r24;
- mul.lo.s32 %r337, %r336, %r24;
- sub.s32 %r338, %r335, %r337;
- div.s32 %r339, %r338, %r25;
- mul.lo.s32 %r340, %r339, %r25;
- sub.s32 %r341, %r338, %r340;
- mad.lo.s32 %r342, %r339, %r26, %r341;
- div.s32 %r343, %r342, %r76;
- mad.lo.s32 %r344, %r336, %r23, %r332;
- add.s32 %r345, %r344, %r343;
- mul.wide.s32 %rd41, %r345, 4;
+ add.s32 %r333, %r321, 1;
+ div.s32 %r334, %r333, %r24;
+ mul.lo.s32 %r335, %r334, %r24;
+ sub.s32 %r336, %r333, %r335;
+ div.s32 %r337, %r336, %r25;
+ mul.lo.s32 %r338, %r337, %r25;
+ sub.s32 %r339, %r336, %r338;
+ mad.lo.s32 %r340, %r337, %r26, %r339;
+ div.s32 %r341, %r340, %r75;
+ mad.lo.s32 %r342, %r334, %r23, %r330;
+ add.s32 %r343, %r342, %r341;
+ mul.wide.s32 %rd41, %r343, 4;
add.s64 %rd42, %rd2, %rd41;
ld.global.f32 %f62, [%rd42];
- add.s32 %r346, %r323, 2;
- div.s32 %r347, %r346, %r24;
- mul.lo.s32 %r348, %r347, %r24;
- sub.s32 %r349, %r346, %r348;
- div.s32 %r350, %r349, %r25;
- mul.lo.s32 %r351, %r350, %r25;
- sub.s32 %r352, %r349, %r351;
- mad.lo.s32 %r353, %r350, %r26, %r352;
- div.s32 %r354, %r353, %r76;
- mad.lo.s32 %r355, %r347, %r23, %r332;
- add.s32 %r356, %r355, %r354;
- mul.wide.s32 %rd43, %r356, 4;
+ add.s32 %r344, %r321, 2;
+ div.s32 %r345, %r344, %r24;
+ mul.lo.s32 %r346, %r345, %r24;
+ sub.s32 %r347, %r344, %r346;
+ div.s32 %r348, %r347, %r25;
+ mul.lo.s32 %r349, %r348, %r25;
+ sub.s32 %r350, %r347, %r349;
+ mad.lo.s32 %r351, %r348, %r26, %r350;
+ div.s32 %r352, %r351, %r75;
+ mad.lo.s32 %r353, %r345, %r23, %r330;
+ add.s32 %r354, %r353, %r352;
+ mul.wide.s32 %rd43, %r354, 4;
add.s64 %rd44, %rd2, %rd43;
ld.global.f32 %f63, [%rd44];
- add.s32 %r357, %r323, 3;
- div.s32 %r358, %r357, %r24;
- mul.lo.s32 %r359, %r358, %r24;
- sub.s32 %r360, %r357, %r359;
- div.s32 %r361, %r360, %r25;
- mul.lo.s32 %r362, %r361, %r25;
- sub.s32 %r363, %r360, %r362;
- mad.lo.s32 %r364, %r361, %r26, %r363;
- div.s32 %r365, %r364, %r76;
- mad.lo.s32 %r366, %r358, %r23, %r332;
- add.s32 %r367, %r366, %r365;
- mul.wide.s32 %rd45, %r367, 4;
+ add.s32 %r355, %r321, 3;
+ div.s32 %r356, %r355, %r24;
+ mul.lo.s32 %r357, %r356, %r24;
+ sub.s32 %r358, %r355, %r357;
+ div.s32 %r359, %r358, %r25;
+ mul.lo.s32 %r360, %r359, %r25;
+ sub.s32 %r361, %r358, %r360;
+ mad.lo.s32 %r362, %r359, %r26, %r361;
+ div.s32 %r363, %r362, %r75;
+ mad.lo.s32 %r364, %r356, %r23, %r330;
+ add.s32 %r365, %r364, %r363;
+ mul.wide.s32 %rd45, %r365, 4;
add.s64 %rd46, %rd2, %rd45;
ld.global.f32 %f64, [%rd46];
- mul.lo.s32 %r368, %r78, %r76;
- mul.lo.s32 %r369, %r15, %r79;
- mad.lo.s32 %r370, %r368, %r369, %r264;
- mul.wide.s32 %rd47, %r370, 4;
+ mul.lo.s32 %r366, %r77, %r75;
+ mul.lo.s32 %r367, %r15, %r78;
+ mad.lo.s32 %r368, %r366, %r367, %r262;
+ mul.wide.s32 %rd47, %r368, 4;
add.s64 %rd29, %rd3, %rd47;
- ld.global.cs.v4.u32 {%r255,%r256,%r257,%r258}, [%rd29];
-
- mov.b32 %f65, %r255;
+ ld.global.cs.v4.u32 {%r253,%r254,%r255,%r256}, [%rd29];
+
+ mov.b32 %f65, %r253;
fma.rn.f32 %f66, %f61, %f65, %f57;
- mov.b32 %r259, %f66;
- mov.b32 %f67, %r256;
+ mov.b32 %r257, %f66;
+ mov.b32 %f67, %r254;
fma.rn.f32 %f68, %f62, %f67, %f58;
- mov.b32 %r260, %f68;
- mov.b32 %f69, %r257;
+ mov.b32 %r258, %f68;
+ mov.b32 %f69, %r255;
fma.rn.f32 %f70, %f63, %f69, %f59;
- mov.b32 %r261, %f70;
- mov.b32 %f71, %r258;
+ mov.b32 %r259, %f70;
+ mov.b32 %f71, %r256;
fma.rn.f32 %f72, %f64, %f71, %f60;
- mov.b32 %r262, %f72;
- mad.lo.s32 %r371, %r17, %r15, %r264;
- mul.wide.s32 %rd48, %r371, 4;
+ mov.b32 %r260, %f72;
+ mad.lo.s32 %r369, %r17, %r15, %r262;
+ mul.wide.s32 %rd48, %r369, 4;
add.s64 %rd30, %rd6, %rd48;
- st.global.cs.v4.s32 [%rd30], {%r259,%r260,%r261,%r262};
+ st.global.cs.v4.s32 [%rd30], {%r257,%r258,%r259,%r260};
bra.uni $L__BB0_38;
$L__BB0_6:
- setp.ge.s32 %p8, %r15, %r66;
- mul.lo.s32 %r27, %r70, 12;
- or.b32 %r120, %r27, 3;
- shr.s32 %r121, %r120, 31;
- shr.u32 %r122, %r121, 30;
- add.s32 %r123, %r120, %r122;
- shr.s32 %r124, %r123, 2;
+ setp.ge.s32 %p6, %r15, %r65;
+ mul.lo.s32 %r27, %r69, 12;
+ or.b32 %r118, %r27, 3;
+ shr.s32 %r119, %r118, 31;
+ shr.u32 %r120, %r119, 30;
+ add.s32 %r121, %r118, %r120;
+ shr.s32 %r122, %r121, 2;
mov.u32 %r29, %ntid.x;
- add.s32 %r125, %r29, %r124;
- add.s32 %r28, %r125, -1;
- shl.b32 %r126, %r4, 2;
- shl.b32 %r127, %r29, 2;
+ add.s32 %r123, %r29, %r122;
+ add.s32 %r28, %r123, -1;
+ shl.b32 %r124, %r4, 2;
+ shl.b32 %r125, %r29, 2;
mov.u32 %r30, %ctaid.y;
- mad.lo.s32 %r128, %r127, %r30, %r126;
- or.b32 %r31, %r128, 3;
- shl.b32 %r129, %r12, 1;
- add.s32 %r130, %r31, %r129;
- add.s32 %r32, %r130, -3;
- mul.lo.s32 %r33, %r70, 6;
- mul.lo.s32 %r34, %r70, 3;
- mul.lo.s32 %r35, %r15, %r71;
+ mad.lo.s32 %r126, %r125, %r30, %r124;
+ or.b32 %r31, %r126, 3;
+ shl.b32 %r127, %r12, 1;
+ add.s32 %r128, %r31, %r127;
+ add.s32 %r32, %r128, -3;
+ mul.lo.s32 %r33, %r69, 6;
+ mul.lo.s32 %r34, %r69, 3;
+ mul.lo.s32 %r35, %r15, %r70;
mov.f32 %f86, 0f00000000;
mov.f32 %f85, %f86;
- @%p8 bra $L__BB0_9;
-
- div.s32 %r131, %r28, %r29;
- setp.ge.s32 %p9, %r30, %r131;
- setp.ge.s32 %p10, %r31, %r27;
- or.pred %p11, %p9, %p10;
- @%p11 bra $L__BB0_9;
-
- div.s32 %r132, %r32, %r33;
- mul.lo.s32 %r133, %r132, %r33;
- sub.s32 %r134, %r32, %r133;
- div.s32 %r135, %r134, %r34;
- mul.lo.s32 %r136, %r135, %r34;
- sub.s32 %r137, %r134, %r136;
- div.s32 %r138, %r137, %r70;
- mul.lo.s32 %r139, %r138, %r70;
- sub.s32 %r140, %r137, %r139;
- mad.lo.s32 %r141, %r132, %r72, %r35;
- mad.lo.s32 %r142, %r135, %r73, %r141;
- mad.lo.s32 %r143, %r138, %r74, %r142;
- mad.lo.s32 %r144, %r140, %r75, %r143;
- mul.wide.s32 %rd9, %r144, 4;
+ @%p6 bra $L__BB0_9;
+
+ div.s32 %r129, %r28, %r29;
+ setp.ge.s32 %p7, %r30, %r129;
+ setp.ge.s32 %p8, %r31, %r27;
+ or.pred %p9, %p7, %p8;
+ @%p9 bra $L__BB0_9;
+
+ div.s32 %r130, %r32, %r33;
+ mul.lo.s32 %r131, %r130, %r33;
+ sub.s32 %r132, %r32, %r131;
+ div.s32 %r133, %r132, %r34;
+ mul.lo.s32 %r134, %r133, %r34;
+ sub.s32 %r135, %r132, %r134;
+ div.s32 %r136, %r135, %r69;
+ mul.lo.s32 %r137, %r136, %r69;
+ sub.s32 %r138, %r135, %r137;
+ mad.lo.s32 %r139, %r130, %r71, %r35;
+ mad.lo.s32 %r140, %r133, %r72, %r139;
+ mad.lo.s32 %r141, %r136, %r73, %r140;
+ mad.lo.s32 %r142, %r138, %r74, %r141;
+ mul.wide.s32 %rd9, %r142, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f85, [%rd10];
$L__BB0_9:
- @%p8 bra $L__BB0_12;
-
- div.s32 %r145, %r28, %r29;
- setp.ge.s32 %p13, %r30, %r145;
- setp.ge.s32 %p14, %r31, %r27;
- or.pred %p15, %p13, %p14;
- @%p15 bra $L__BB0_12;
-
- add.s32 %r146, %r32, 1;
- div.s32 %r147, %r146, %r33;
- mul.lo.s32 %r148, %r147, %r33;
- sub.s32 %r149, %r146, %r148;
- div.s32 %r150, %r149, %r34;
- mul.lo.s32 %r151, %r150, %r34;
- sub.s32 %r152, %r149, %r151;
- div.s32 %r153, %r152, %r70;
- mul.lo.s32 %r154, %r153, %r70;
- sub.s32 %r155, %r152, %r154;
- mad.lo.s32 %r156, %r147, %r72, %r35;
- mad.lo.s32 %r157, %r150, %r73, %r156;
- mad.lo.s32 %r158, %r153, %r74, %r157;
- mad.lo.s32 %r159, %r155, %r75, %r158;
- mul.wide.s32 %rd11, %r159, 4;
+ @%p6 bra $L__BB0_12;
+
+ div.s32 %r143, %r28, %r29;
+ setp.ge.s32 %p11, %r30, %r143;
+ setp.ge.s32 %p12, %r31, %r27;
+ or.pred %p13, %p11, %p12;
+ @%p13 bra $L__BB0_12;
+
+ add.s32 %r144, %r32, 1;
+ div.s32 %r145, %r144, %r33;
+ mul.lo.s32 %r146, %r145, %r33;
+ sub.s32 %r147, %r144, %r146;
+ div.s32 %r148, %r147, %r34;
+ mul.lo.s32 %r149, %r148, %r34;
+ sub.s32 %r150, %r147, %r149;
+ div.s32 %r151, %r150, %r69;
+ mul.lo.s32 %r152, %r151, %r69;
+ sub.s32 %r153, %r150, %r152;
+ mad.lo.s32 %r154, %r145, %r71, %r35;
+ mad.lo.s32 %r155, %r148, %r72, %r154;
+ mad.lo.s32 %r156, %r151, %r73, %r155;
+ mad.lo.s32 %r157, %r153, %r74, %r156;
+ mul.wide.s32 %rd11, %r157, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f86, [%rd12];
$L__BB0_12:
mov.f32 %f88, 0f00000000;
mov.f32 %f87, %f88;
- @%p8 bra $L__BB0_15;
-
- div.s32 %r160, %r28, %r29;
- setp.ge.s32 %p17, %r30, %r160;
- setp.ge.s32 %p18, %r31, %r27;
- or.pred %p19, %p17, %p18;
- @%p19 bra $L__BB0_15;
-
- add.s32 %r161, %r32, 2;
- div.s32 %r162, %r161, %r33;
- mul.lo.s32 %r163, %r162, %r33;
- sub.s32 %r164, %r161, %r163;
- div.s32 %r165, %r164, %r34;
- mul.lo.s32 %r166, %r165, %r34;
- sub.s32 %r167, %r164, %r166;
- div.s32 %r168, %r167, %r70;
- mul.lo.s32 %r169, %r168, %r70;
- sub.s32 %r170, %r167, %r169;
- mad.lo.s32 %r171, %r162, %r72, %r35;
- mad.lo.s32 %r172, %r165, %r73, %r171;
- mad.lo.s32 %r173, %r168, %r74, %r172;
- mad.lo.s32 %r174, %r170, %r75, %r173;
- mul.wide.s32 %rd13, %r174, 4;
+ @%p6 bra $L__BB0_15;
+
+ div.s32 %r158, %r28, %r29;
+ setp.ge.s32 %p15, %r30, %r158;
+ setp.ge.s32 %p16, %r31, %r27;
+ or.pred %p17, %p15, %p16;
+ @%p17 bra $L__BB0_15;
+
+ add.s32 %r159, %r32, 2;
+ div.s32 %r160, %r159, %r33;
+ mul.lo.s32 %r161, %r160, %r33;
+ sub.s32 %r162, %r159, %r161;
+ div.s32 %r163, %r162, %r34;
+ mul.lo.s32 %r164, %r163, %r34;
+ sub.s32 %r165, %r162, %r164;
+ div.s32 %r166, %r165, %r69;
+ mul.lo.s32 %r167, %r166, %r69;
+ sub.s32 %r168, %r165, %r167;
+ mad.lo.s32 %r169, %r160, %r71, %r35;
+ mad.lo.s32 %r170, %r163, %r72, %r169;
+ mad.lo.s32 %r171, %r166, %r73, %r170;
+ mad.lo.s32 %r172, %r168, %r74, %r171;
+ mul.wide.s32 %rd13, %r172, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f87, [%rd14];
$L__BB0_15:
- @%p8 bra $L__BB0_18;
-
- div.s32 %r175, %r28, %r29;
- setp.ge.s32 %p21, %r30, %r175;
- setp.ge.s32 %p22, %r31, %r27;
- or.pred %p23, %p21, %p22;
- @%p23 bra $L__BB0_18;
-
- add.s32 %r176, %r32, 3;
- div.s32 %r177, %r176, %r33;
- mul.lo.s32 %r178, %r177, %r33;
- sub.s32 %r179, %r176, %r178;
- div.s32 %r180, %r179, %r34;
- mul.lo.s32 %r181, %r180, %r34;
- sub.s32 %r182, %r179, %r181;
- div.s32 %r183, %r182, %r70;
- mul.lo.s32 %r184, %r183, %r70;
- sub.s32 %r185, %r182, %r184;
- mad.lo.s32 %r186, %r177, %r72, %r35;
- mad.lo.s32 %r187, %r180, %r73, %r186;
- mad.lo.s32 %r188, %r183, %r74, %r187;
- mad.lo.s32 %r189, %r185, %r75, %r188;
- mul.wide.s32 %rd15, %r189, 4;
+ @%p6 bra $L__BB0_18;
+
+ div.s32 %r173, %r28, %r29;
+ setp.ge.s32 %p19, %r30, %r173;
+ setp.ge.s32 %p20, %r31, %r27;
+ or.pred %p21, %p19, %p20;
+ @%p21 bra $L__BB0_18;
+
+ add.s32 %r174, %r32, 3;
+ div.s32 %r175, %r174, %r33;
+ mul.lo.s32 %r176, %r175, %r33;
+ sub.s32 %r177, %r174, %r176;
+ div.s32 %r178, %r177, %r34;
+ mul.lo.s32 %r179, %r178, %r34;
+ sub.s32 %r180, %r177, %r179;
+ div.s32 %r181, %r180, %r69;
+ mul.lo.s32 %r182, %r181, %r69;
+ sub.s32 %r183, %r180, %r182;
+ mad.lo.s32 %r184, %r175, %r71, %r35;
+ mad.lo.s32 %r185, %r178, %r72, %r184;
+ mad.lo.s32 %r186, %r181, %r73, %r185;
+ mad.lo.s32 %r187, %r183, %r74, %r186;
+ mul.wide.s32 %rd15, %r187, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f88, [%rd16];
$L__BB0_18:
- mul.lo.s32 %r36, %r78, %r79;
- add.s32 %r190, %r36, 1;
- shr.u32 %r191, %r190, 31;
- add.s32 %r192, %r190, %r191;
- shr.s32 %r37, %r192, 1;
- mad.lo.s32 %r193, %r30, %r29, %r4;
- shl.b32 %r194, %r193, 2;
- shl.b32 %r195, %r12, 3;
- add.s32 %r38, %r195, %r194;
- mul.lo.s32 %r39, %r37, %r76;
- add.s32 %r196, %r39, 1;
- shr.u32 %r197, %r196, 31;
- add.s32 %r198, %r196, %r197;
- shr.s32 %r40, %r198, 1;
- mul.lo.s32 %r41, %r36, %r15;
+ mul.lo.s32 %r36, %r77, %r78;
+ add.s32 %r188, %r36, 1;
+ shr.u32 %r189, %r188, 31;
+ add.s32 %r190, %r188, %r189;
+ shr.s32 %r37, %r190, 1;
+ mad.lo.s32 %r191, %r30, %r29, %r4;
+ shl.b32 %r192, %r191, 2;
+ shl.b32 %r193, %r12, 3;
+ add.s32 %r38, %r193, %r192;
+ mad.lo.s32 %r194, %r37, %r75, 1;
+ shr.u32 %r195, %r194, 31;
+ add.s32 %r196, %r194, %r195;
+ shr.s32 %r39, %r196, 1;
+ mul.lo.s32 %r40, %r36, %r15;
mov.f32 %f90, 0f00000000;
mov.f32 %f89, %f90;
- @%p8 bra $L__BB0_21;
-
- div.s32 %r199, %r38, %r33;
- mul.lo.s32 %r200, %r199, %r33;
- sub.s32 %r201, %r38, %r200;
- div.s32 %r202, %r201, %r34;
- mul.lo.s32 %r203, %r202, %r34;
- sub.s32 %r204, %r201, %r203;
- mad.lo.s32 %r205, %r202, %r40, %r204;
- div.s32 %r206, %r205, %r76;
- mad.lo.s32 %r42, %r199, %r37, %r206;
- setp.ge.s32 %p25, %r42, %r36;
- setp.ge.s32 %p26, %r205, %r39;
- or.pred %p27, %p25, %p26;
- @%p27 bra $L__BB0_21;
-
- add.s32 %r207, %r42, %r41;
- mul.wide.s32 %rd17, %r207, 4;
+ @%p6 bra $L__BB0_21;
+
+ div.s32 %r197, %r38, %r33;
+ mul.lo.s32 %r198, %r197, %r33;
+ sub.s32 %r199, %r38, %r198;
+ div.s32 %r200, %r199, %r34;
+ mul.lo.s32 %r201, %r200, %r34;
+ sub.s32 %r202, %r199, %r201;
+ mad.lo.s32 %r203, %r200, %r39, %r202;
+ div.s32 %r204, %r203, %r75;
+ mad.lo.s32 %r41, %r197, %r37, %r204;
+ setp.ge.s32 %p23, %r41, %r36;
+ @%p23 bra $L__BB0_21;
+
+ add.s32 %r205, %r41, %r40;
+ mul.wide.s32 %rd17, %r205, 4;
add.s64 %rd18, %rd2, %rd17;
ld.global.f32 %f89, [%rd18];
$L__BB0_21:
- @%p8 bra $L__BB0_24;
-
- add.s32 %r208, %r38, 1;
- div.s32 %r209, %r208, %r33;
- mul.lo.s32 %r210, %r209, %r33;
- sub.s32 %r211, %r208, %r210;
- div.s32 %r212, %r211, %r34;
- mul.lo.s32 %r213, %r212, %r34;
- sub.s32 %r214, %r211, %r213;
- mad.lo.s32 %r215, %r212, %r40, %r214;
- div.s32 %r216, %r215, %r76;
- mad.lo.s32 %r43, %r209, %r37, %r216;
- setp.ge.s32 %p29, %r43, %r36;
- setp.ge.s32 %p30, %r215, %r39;
- or.pred %p31, %p29, %p30;
- @%p31 bra $L__BB0_24;
-
- add.s32 %r217, %r43, %r41;
- mul.wide.s32 %rd19, %r217, 4;
+ @%p6 bra $L__BB0_24;
+
+ add.s32 %r206, %r38, 1;
+ div.s32 %r207, %r206, %r33;
+ mul.lo.s32 %r208, %r207, %r33;
+ sub.s32 %r209, %r206, %r208;
+ div.s32 %r210, %r209, %r34;
+ mul.lo.s32 %r211, %r210, %r34;
+ sub.s32 %r212, %r209, %r211;
+ mad.lo.s32 %r213, %r210, %r39, %r212;
+ div.s32 %r214, %r213, %r75;
+ mad.lo.s32 %r42, %r207, %r37, %r214;
+ setp.ge.s32 %p25, %r42, %r36;
+ @%p25 bra $L__BB0_24;
+
+ add.s32 %r215, %r42, %r40;
+ mul.wide.s32 %rd19, %r215, 4;
add.s64 %rd20, %rd2, %rd19;
ld.global.f32 %f90, [%rd20];
$L__BB0_24:
mov.f32 %f92, 0f00000000;
mov.f32 %f91, %f92;
- @%p8 bra $L__BB0_27;
-
- add.s32 %r218, %r38, 2;
- div.s32 %r219, %r218, %r33;
- mul.lo.s32 %r220, %r219, %r33;
- sub.s32 %r221, %r218, %r220;
- div.s32 %r222, %r221, %r34;
- mul.lo.s32 %r223, %r222, %r34;
- sub.s32 %r224, %r221, %r223;
- mad.lo.s32 %r225, %r222, %r40, %r224;
- div.s32 %r226, %r225, %r76;
- mad.lo.s32 %r44, %r219, %r37, %r226;
- setp.ge.s32 %p33, %r44, %r36;
- setp.ge.s32 %p34, %r225, %r39;
- or.pred %p35, %p33, %p34;
- @%p35 bra $L__BB0_27;
-
- add.s32 %r227, %r44, %r41;
- mul.wide.s32 %rd21, %r227, 4;
+ @%p6 bra $L__BB0_27;
+
+ add.s32 %r216, %r38, 2;
+ div.s32 %r217, %r216, %r33;
+ mul.lo.s32 %r218, %r217, %r33;
+ sub.s32 %r219, %r216, %r218;
+ div.s32 %r220, %r219, %r34;
+ mul.lo.s32 %r221, %r220, %r34;
+ sub.s32 %r222, %r219, %r221;
+ mad.lo.s32 %r223, %r220, %r39, %r222;
+ div.s32 %r224, %r223, %r75;
+ mad.lo.s32 %r43, %r217, %r37, %r224;
+ setp.ge.s32 %p27, %r43, %r36;
+ @%p27 bra $L__BB0_27;
+
+ add.s32 %r225, %r43, %r40;
+ mul.wide.s32 %rd21, %r225, 4;
add.s64 %rd22, %rd2, %rd21;
ld.global.f32 %f91, [%rd22];
$L__BB0_27:
- @%p8 bra $L__BB0_30;
-
- add.s32 %r228, %r38, 3;
- div.s32 %r229, %r228, %r33;
- mul.lo.s32 %r230, %r229, %r33;
- sub.s32 %r231, %r228, %r230;
- div.s32 %r232, %r231, %r34;
- mul.lo.s32 %r233, %r232, %r34;
- sub.s32 %r234, %r231, %r233;
- mad.lo.s32 %r235, %r232, %r40, %r234;
- div.s32 %r236, %r235, %r76;
- mad.lo.s32 %r45, %r229, %r37, %r236;
- setp.ge.s32 %p37, %r45, %r36;
- setp.ge.s32 %p38, %r235, %r39;
- or.pred %p39, %p37, %p38;
- @%p39 bra $L__BB0_30;
-
- add.s32 %r237, %r45, %r41;
- mul.wide.s32 %rd23, %r237, 4;
+ @%p6 bra $L__BB0_30;
+
+ add.s32 %r226, %r38, 3;
+ div.s32 %r227, %r226, %r33;
+ mul.lo.s32 %r228, %r227, %r33;
+ sub.s32 %r229, %r226, %r228;
+ div.s32 %r230, %r229, %r34;
+ mul.lo.s32 %r231, %r230, %r34;
+ sub.s32 %r232, %r229, %r231;
+ mad.lo.s32 %r233, %r230, %r39, %r232;
+ div.s32 %r234, %r233, %r75;
+ mad.lo.s32 %r44, %r227, %r37, %r234;
+ setp.ge.s32 %p29, %r44, %r36;
+ @%p29 bra $L__BB0_30;
+
+ add.s32 %r235, %r44, %r40;
+ mul.wide.s32 %rd23, %r235, 4;
add.s64 %rd24, %rd2, %rd23;
ld.global.f32 %f92, [%rd24];
$L__BB0_30:
mov.f32 %f93, 0f00000000;
mov.f32 %f94, 0f00000000;
mov.f32 %f95, 0f00000000;
mov.f32 %f96, 0f00000000;
- @%p8 bra $L__BB0_34;
-
- div.s32 %r238, %r28, %r29;
- setp.ge.s32 %p41, %r30, %r238;
- @%p41 bra $L__BB0_34;
-
- setp.ge.s32 %p42, %r31, %r27;
- @%p42 bra $L__BB0_34;
-
- mul.lo.s32 %r243, %r15, %r79;
- mul.lo.s32 %r244, %r78, %r76;
- mad.lo.s32 %r245, %r244, %r243, %r31;
- add.s32 %r246, %r245, -3;
- mul.wide.s32 %rd26, %r246, 4;
+ @%p6 bra $L__BB0_34;
+
+ div.s32 %r236, %r28, %r29;
+ setp.ge.s32 %p31, %r30, %r236;
+ @%p31 bra $L__BB0_34;
+
+ setp.ge.s32 %p32, %r31, %r27;
+ @%p32 bra $L__BB0_34;
+
+ mul.lo.s32 %r241, %r15, %r78;
+ mul.lo.s32 %r242, %r77, %r75;
+ mad.lo.s32 %r243, %r242, %r241, %r31;
+ add.s32 %r244, %r243, -3;
+ mul.wide.s32 %rd26, %r244, 4;
add.s64 %rd25, %rd3, %rd26;
- ld.global.cs.v4.u32 {%r239,%r240,%r241,%r242}, [%rd25];
-
- mov.b32 %f96, %r239;
- mov.b32 %f95, %r240;
- mov.b32 %f94, %r241;
- mov.b32 %f93, %r242;
+ ld.global.cs.v4.u32 {%r237,%r238,%r239,%r240}, [%rd25];
+
+ mov.b32 %f96, %r237;
+ mov.b32 %f95, %r238;
+ mov.b32 %f94, %r239;
+ mov.b32 %f93, %r240;
$L__BB0_34:
fma.rn.f32 %f25, %f96, %f89, %f85;
fma.rn.f32 %f26, %f95, %f90, %f86;
fma.rn.f32 %f27, %f94, %f91, %f87;
fma.rn.f32 %f28, %f93, %f92, %f88;
- @%p8 bra $L__BB0_38;
-
- div.s32 %r247, %r28, %r29;
- setp.ge.s32 %p44, %r30, %r247;
- setp.ge.s32 %p45, %r31, %r27;
- or.pred %p46, %p44, %p45;
- @%p46 bra $L__BB0_38;
-
- mad.lo.s32 %r252, %r27, %r14, %r31;
- add.s32 %r253, %r252, -3;
- mad.lo.s32 %r254, %r27, %r13, %r253;
- mul.wide.s32 %rd28, %r254, 4;
+ @%p6 bra $L__BB0_38;
+
+ div.s32 %r245, %r28, %r29;
+ setp.ge.s32 %p34, %r30, %r245;
+ setp.ge.s32 %p35, %r31, %r27;
+ or.pred %p36, %p34, %p35;
+ @%p36 bra $L__BB0_38;
+
+ mad.lo.s32 %r250, %r27, %r14, %r31;
+ add.s32 %r251, %r250, -3;
+ mad.lo.s32 %r252, %r27, %r13, %r251;
+ mul.wide.s32 %rd28, %r252, 4;
add.s64 %rd27, %rd6, %rd28;
- mov.b32 %r248, %f25;
- mov.b32 %r249, %f26;
- mov.b32 %r250, %f27;
- mov.b32 %r251, %f28;
-
- st.global.cs.v4.s32 [%rd27], {%r248,%r249,%r250,%r251};
+ mov.b32 %r246, %f25;
+ mov.b32 %r247, %f26;
+ mov.b32 %r248, %f27;
+ mov.b32 %r249, %f28;
+
+ st.global.cs.v4.s32 [%rd27], {%r246,%r247,%r248,%r249};
$L__BB0_38:
ret;
Kernel 149
CUDA
PTX
53997da5d
Diff
03a1b695e
-2
+2 index type: int
registers: 39→ 23
gmem: 3
static smem: 4→ 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T8, Tensor<float, 5, 5> T5, Tensor<float, 5, 5> T11) {
NVFUSER_DEFINE_MAGIC_ZERO;
if (((((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL]))) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL]))) && (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) < ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL])))) {
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0]
= T1[((((((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x))) + (T1.alloc_stride[1LL] * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T14;
T14.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T14[0], &T5[((((4 * ((nvfuser_index_t)threadIdx.x)) + (((T5.logical_size[4LL] * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))) + ((((((nvfuser_index_t)blockDim.y) * T5.logical_size[4LL]) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))]);
Array<float, 4, 4> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
Array<float, 1, 1> T9;
T9[0]
= T14[i2]
* T13[i2];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
T15[i2]
= T10[0]
+ T12[i2];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T11[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((12 * T1.logical_size[4LL]) * ((nvfuser_index_t)threadIdx.y))) + (((12 * ((nvfuser_index_t)blockDim.y)) * T1.logical_size[4LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))], &T15[0]);
} else {
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
T12[i0]
= T1[((((((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x))) + (T1.alloc_stride[1LL] * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL]))) && (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) < ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL])))) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T14;
T14.set(float(0));
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T14[0], &T5[((((4 * ((nvfuser_index_t)threadIdx.x)) + (((T5.logical_size[4LL] * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))) + ((((((nvfuser_index_t)blockDim.y) * T5.logical_size[4LL]) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))]);
}
Array<float, 4, 4> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
Array<float, 1, 1> T9;
T9[0]
= T14[i2]
* T13[i2];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
T15[i2]
= T10[0]
+ T12[i2];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T11[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((12 * T1.logical_size[4LL]) * ((nvfuser_index_t)threadIdx.y))) + (((12 * ((nvfuser_index_t)blockDim.y)) * T1.logical_size[4LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))], &T15[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T8, Tensor<float, 5, 5> T5, Tensor<float, 5, 5> T11) {
NVFUSER_DEFINE_MAGIC_ZERO;
if ((((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL]))) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL])))) {
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0]
= T1[((((((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x))) + (T1.alloc_stride[1LL] * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T14;
T14.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T14[0], &T5[((((4 * ((nvfuser_index_t)threadIdx.x)) + (((T5.logical_size[4LL] * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))) + ((((((nvfuser_index_t)blockDim.y) * T5.logical_size[4LL]) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))]);
Array<float, 4, 4> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
Array<float, 1, 1> T9;
T9[0]
= T14[i2]
* T13[i2];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
T15[i2]
= T10[0]
+ T12[i2];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T11[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((12 * T1.logical_size[4LL]) * ((nvfuser_index_t)threadIdx.y))) + (((12 * ((nvfuser_index_t)blockDim.y)) * T1.logical_size[4LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))], &T15[0]);
} else {
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
T12[i0]
= T1[((((((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x))) + (T1.alloc_stride[1LL] * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * ((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i0 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T13;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL])))) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T14;
T14.set(float(0));
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T14[0], &T5[((((4 * ((nvfuser_index_t)threadIdx.x)) + (((T5.logical_size[4LL] * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))) + ((((((nvfuser_index_t)blockDim.y) * T5.logical_size[4LL]) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))]);
}
Array<float, 4, 4> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
Array<float, 1, 1> T9;
T9[0]
= T14[i2]
* T13[i2];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
T15[i2]
= T10[0]
+ T12[i2];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL])))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T11[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((12 * T1.logical_size[4LL]) * ((nvfuser_index_t)threadIdx.y))) + (((12 * ((nvfuser_index_t)blockDim.y)) * T1.logical_size[4LL]) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y)))], &T15[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,8 +1,8 @@
__global__ void nvfuser_N(Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T8, Tensor<float, 5, 5> T5, Tensor<float, 5, 5> T11) {
NVFUSER_DEFINE_MAGIC_ZERO;
- if (((((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL]))) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL]))) && (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) < ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL])))) {
+ if ((((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv((12 * T1.logical_size[4LL]), 4)), ((nvfuser_index_t)blockDim.x))))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) < (12 * T1.logical_size[4LL]))) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL])))) {
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T12[i0] = 0;
}
@@ -65,11 +65,11 @@
T13[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
- if (((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL]))) && (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) < ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL])))) {
+ if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < T1.logical_size[0LL]) && ((((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL]))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL])) < (T5.logical_size[2LL] * T5.logical_size[3LL])))) {
T13[i1]
= T8[(((((T5.logical_size[3LL] * T5.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * T5.logical_size[3LL]) * T5.logical_size[2LL]) * ((nvfuser_index_t)blockIdx.x))) + ((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) / (6 * T1.logical_size[4LL])))) + (((((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) % (3 * T1.logical_size[4LL])) + ((ceilDiv(((ceilDiv((T5.logical_size[2LL] * T5.logical_size[3LL]), 2)) * T5.logical_size[4LL]), 2)) * (((((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.y))) + (i1 + nvfuser_zero)) % (6 * T1.logical_size[4LL])) / (3 * T1.logical_size[4LL])))) / T5.logical_size[4LL]))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3[48]
)
{
.reg .pred %p<47>;
.reg .f32 %f<97>;
.reg .b32 %r<372>;
.reg .b64 %rd<49>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r66, %r67}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+8];
ld.param.v2.u32 {%r70, %r71}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+24];
ld.param.v2.u32 {%r72, %r73}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+32];
ld.param.v2.u32 {%r74, %r75}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+40];
ld.param.v2.u32 {%r76, %r77}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+24];
ld.param.v2.u32 {%r78, %r79}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+16];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd7;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2];
mov.u32 %r4, %tid.x;
setp.ne.s32 %p1, %r4, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r90, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s], %r90;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd8, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r91, [%rd8], %r4;
ld.shared.u32 %r12, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_5ab439ca_1911011nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s];
mov.u32 %r92, %ctaid.x;
mov.u32 %r93, %ntid.y;
mul.lo.s32 %r13, %r93, %r92;
mov.u32 %r14, %tid.y;
add.s32 %r15, %r13, %r14;
setp.lt.s32 %p2, %r15, %r66;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_6;
$L__BB0_3:
mov.u32 %r16, %ctaid.y;
mul.lo.s32 %r17, %r70, 12;
or.b32 %r94, %r17, 3;
shr.s32 %r95, %r94, 31;
shr.u32 %r96, %r95, 30;
add.s32 %r97, %r94, %r96;
shr.s32 %r98, %r97, 2;
mov.u32 %r18, %ntid.x;
add.s32 %r99, %r18, %r98;
add.s32 %r100, %r99, -1;
div.s32 %r101, %r100, %r18;
setp.ge.s32 %p3, %r16, %r101;
@%p3 bra $L__BB0_6;
shl.b32 %r19, %r4, 2;
mul.lo.s32 %r102, %r18, %r16;
shl.b32 %r20, %r102, 2;
add.s32 %r103, %r19, %r20;
or.b32 %r21, %r103, 3;
setp.ge.s32 %p4, %r21, %r17;
@%p4 bra $L__BB0_6;
mul.lo.s32 %r22, %r78, %r79;
add.s32 %r104, %r22, 1;
shr.u32 %r105, %r104, 31;
add.s32 %r106, %r104, %r105;
shr.s32 %r23, %r106, 1;
mul.lo.s32 %r24, %r70, 6;
div.s32 %r107, %r21, %r24;
mul.lo.s32 %r108, %r107, %r24;
sub.s32 %r109, %r21, %r108;
mul.lo.s32 %r25, %r70, 3;
div.s32 %r110, %r109, %r25;
mul.lo.s32 %r111, %r110, %r25;
sub.s32 %r112, %r109, %r111;
mul.lo.s32 %r113, %r23, %r76;
add.s32 %r114, %r113, 1;
shr.u32 %r115, %r114, 31;
add.s32 %r116, %r114, %r115;
shr.s32 %r26, %r116, 1;
mad.lo.s32 %r117, %r110, %r26, %r112;
div.s32 %r118, %r117, %r76;
mad.lo.s32 %r119, %r107, %r23, %r118;
setp.lt.s32 %p5, %r119, %r22;
setp.lt.s32 %p6, %r117, %r113;
and.pred %p7, %p5, %p6;
@%p7 bra $L__BB0_37;
bra.uni $L__BB0_6;
$L__BB0_37:
shl.b32 %r263, %r12, 1;
add.s32 %r264, %r20, %r19;
add.s32 %r265, %r264, %r263;
div.s32 %r266, %r265, %r24;
mul.lo.s32 %r267, %r266, %r24;
sub.s32 %r268, %r265, %r267;
div.s32 %r269, %r268, %r25;
mul.lo.s32 %r270, %r269, %r25;
sub.s32 %r271, %r268, %r270;
div.s32 %r272, %r271, %r70;
mul.lo.s32 %r273, %r272, %r70;
sub.s32 %r274, %r271, %r273;
mul.lo.s32 %r275, %r15, %r71;
mad.lo.s32 %r276, %r266, %r72, %r275;
mad.lo.s32 %r277, %r269, %r73, %r276;
mad.lo.s32 %r278, %r272, %r74, %r277;
mad.lo.s32 %r279, %r274, %r75, %r278;
mul.wide.s32 %rd31, %r279, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.f32 %f57, [%rd32];
add.s32 %r280, %r265, 1;
div.s32 %r281, %r280, %r24;
mul.lo.s32 %r282, %r281, %r24;
sub.s32 %r283, %r280, %r282;
div.s32 %r284, %r283, %r25;
mul.lo.s32 %r285, %r284, %r25;
sub.s32 %r286, %r283, %r285;
div.s32 %r287, %r286, %r70;
mul.lo.s32 %r288, %r287, %r70;
sub.s32 %r289, %r286, %r288;
mad.lo.s32 %r290, %r281, %r72, %r275;
mad.lo.s32 %r291, %r284, %r73, %r290;
mad.lo.s32 %r292, %r287, %r74, %r291;
mad.lo.s32 %r293, %r289, %r75, %r292;
mul.wide.s32 %rd33, %r293, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.f32 %f58, [%rd34];
add.s32 %r294, %r265, 2;
div.s32 %r295, %r294, %r24;
mul.lo.s32 %r296, %r295, %r24;
sub.s32 %r297, %r294, %r296;
div.s32 %r298, %r297, %r25;
mul.lo.s32 %r299, %r298, %r25;
sub.s32 %r300, %r297, %r299;
div.s32 %r301, %r300, %r70;
mul.lo.s32 %r302, %r301, %r70;
sub.s32 %r303, %r300, %r302;
mad.lo.s32 %r304, %r295, %r72, %r275;
mad.lo.s32 %r305, %r298, %r73, %r304;
mad.lo.s32 %r306, %r301, %r74, %r305;
mad.lo.s32 %r307, %r303, %r75, %r306;
mul.wide.s32 %rd35, %r307, 4;
add.s64 %rd36, %rd1, %rd35;
ld.global.f32 %f59, [%rd36];
add.s32 %r308, %r265, 3;
div.s32 %r309, %r308, %r24;
mul.lo.s32 %r310, %r309, %r24;
sub.s32 %r311, %r308, %r310;
div.s32 %r312, %r311, %r25;
mul.lo.s32 %r313, %r312, %r25;
sub.s32 %r314, %r311, %r313;
div.s32 %r315, %r314, %r70;
mul.lo.s32 %r316, %r315, %r70;
sub.s32 %r317, %r314, %r316;
mad.lo.s32 %r318, %r309, %r72, %r275;
mad.lo.s32 %r319, %r312, %r73, %r318;
mad.lo.s32 %r320, %r315, %r74, %r319;
mad.lo.s32 %r321, %r317, %r75, %r320;
mul.wide.s32 %rd37, %r321, 4;
add.s64 %rd38, %rd1, %rd37;
ld.global.f32 %f60, [%rd38];
shl.b32 %r322, %r12, 3;
add.s32 %r323, %r264, %r322;
div.s32 %r324, %r323, %r24;
mul.lo.s32 %r325, %r324, %r24;
sub.s32 %r326, %r323, %r325;
div.s32 %r327, %r326, %r25;
mul.lo.s32 %r328, %r327, %r25;
sub.s32 %r329, %r326, %r328;
mad.lo.s32 %r330, %r327, %r26, %r329;
div.s32 %r331, %r330, %r76;
mul.lo.s32 %r332, %r22, %r15;
mad.lo.s32 %r333, %r324, %r23, %r332;
add.s32 %r334, %r333, %r331;
mul.wide.s32 %rd39, %r334, 4;
add.s64 %rd40, %rd2, %rd39;
ld.global.f32 %f61, [%rd40];
add.s32 %r335, %r323, 1;
div.s32 %r336, %r335, %r24;
mul.lo.s32 %r337, %r336, %r24;
sub.s32 %r338, %r335, %r337;
div.s32 %r339, %r338, %r25;
mul.lo.s32 %r340, %r339, %r25;
sub.s32 %r341, %r338, %r340;
mad.lo.s32 %r342, %r339, %r26, %r341;
div.s32 %r343, %r342, %r76;
mad.lo.s32 %r344, %r336, %r23, %r332;
add.s32 %r345, %r344, %r343;
mul.wide.s32 %rd41, %r345, 4;
add.s64 %rd42, %rd2, %rd41;
ld.global.f32 %f62, [%rd42];
add.s32 %r346, %r323, 2;
div.s32 %r347, %r346, %r24;
mul.lo.s32 %r348, %r347, %r24;
sub.s32 %r349, %r346, %r348;
div.s32 %r350, %r349, %r25;
mul.lo.s32 %r351, %r350, %r25;
sub.s32 %r352, %r349, %r351;
mad.lo.s32 %r353, %r350, %r26, %r352;
div.s32 %r354, %r353, %r76;
mad.lo.s32 %r355, %r347, %r23, %r332;
add.s32 %r356, %r355, %r354;
mul.wide.s32 %rd43, %r356, 4;
add.s64 %rd44, %rd2, %rd43;
ld.global.f32 %f63, [%rd44];
add.s32 %r357, %r323, 3;
div.s32 %r358, %r357, %r24;
mul.lo.s32 %r359, %r358, %r24;
sub.s32 %r360, %r357, %r359;
div.s32 %r361, %r360, %r25;
mul.lo.s32 %r362, %r361, %r25;
sub.s32 %r363, %r360, %r362;
mad.lo.s32 %r364, %r361, %r26, %r363;
div.s32 %r365, %r364, %r76;
mad.lo.s32 %r366, %r358, %r23, %r332;
add.s32 %r367, %r366, %r365;
mul.wide.s32 %rd45, %r367, 4;
add.s64 %rd46, %rd2, %rd45;
ld.global.f32 %f64, [%rd46];
mul.lo.s32 %r368, %r78, %r76;
mul.lo.s32 %r369, %r15, %r79;
mad.lo.s32 %r370, %r368, %r369, %r264;
mul.wide.s32 %rd47, %r370, 4;
add.s64 %rd29, %rd3, %rd47;
// begin inline asm
ld.global.cs.v4.u32 {%r255,%r256,%r257,%r258}, [%rd29];
// end inline asm
mov.b32 %f65, %r255;
fma.rn.f32 %f66, %f61, %f65, %f57;
mov.b32 %r259, %f66;
mov.b32 %f67, %r256;
fma.rn.f32 %f68, %f62, %f67, %f58;
mov.b32 %r260, %f68;
mov.b32 %f69, %r257;
fma.rn.f32 %f70, %f63, %f69, %f59;
mov.b32 %r261, %f70;
mov.b32 %f71, %r258;
fma.rn.f32 %f72, %f64, %f71, %f60;
mov.b32 %r262, %f72;
mad.lo.s32 %r371, %r17, %r15, %r264;
mul.wide.s32 %rd48, %r371, 4;
add.s64 %rd30, %rd6, %rd48;
// begin inline asm
st.global.cs.v4.s32 [%rd30], {%r259,%r260,%r261,%r262};
// end inline asm
bra.uni $L__BB0_38;
$L__BB0_6:
setp.ge.s32 %p8, %r15, %r66;
mul.lo.s32 %r27, %r70, 12;
or.b32 %r120, %r27, 3;
shr.s32 %r121, %r120, 31;
shr.u32 %r122, %r121, 30;
add.s32 %r123, %r120, %r122;
shr.s32 %r124, %r123, 2;
mov.u32 %r29, %ntid.x;
add.s32 %r125, %r29, %r124;
add.s32 %r28, %r125, -1;
shl.b32 %r126, %r4, 2;
shl.b32 %r127, %r29, 2;
mov.u32 %r30, %ctaid.y;
mad.lo.s32 %r128, %r127, %r30, %r126;
or.b32 %r31, %r128, 3;
shl.b32 %r129, %r12, 1;
add.s32 %r130, %r31, %r129;
add.s32 %r32, %r130, -3;
mul.lo.s32 %r33, %r70, 6;
mul.lo.s32 %r34, %r70, 3;
mul.lo.s32 %r35, %r15, %r71;
mov.f32 %f86, 0f00000000;
mov.f32 %f85, %f86;
@%p8 bra $L__BB0_9;
div.s32 %r131, %r28, %r29;
setp.ge.s32 %p9, %r30, %r131;
setp.ge.s32 %p10, %r31, %r27;
or.pred %p11, %p9, %p10;
@%p11 bra $L__BB0_9;
div.s32 %r132, %r32, %r33;
mul.lo.s32 %r133, %r132, %r33;
sub.s32 %r134, %r32, %r133;
div.s32 %r135, %r134, %r34;
mul.lo.s32 %r136, %r135, %r34;
sub.s32 %r137, %r134, %r136;
div.s32 %r138, %r137, %r70;
mul.lo.s32 %r139, %r138, %r70;
sub.s32 %r140, %r137, %r139;
mad.lo.s32 %r141, %r132, %r72, %r35;
mad.lo.s32 %r142, %r135, %r73, %r141;
mad.lo.s32 %r143, %r138, %r74, %r142;
mad.lo.s32 %r144, %r140, %r75, %r143;
mul.wide.s32 %rd9, %r144, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f85, [%rd10];
$L__BB0_9:
@%p8 bra $L__BB0_12;
div.s32 %r145, %r28, %r29;
setp.ge.s32 %p13, %r30, %r145;
setp.ge.s32 %p14, %r31, %r27;
or.pred %p15, %p13, %p14;
@%p15 bra $L__BB0_12;
add.s32 %r146, %r32, 1;
div.s32 %r147, %r146, %r33;
mul.lo.s32 %r148, %r147, %r33;
sub.s32 %r149, %r146, %r148;
div.s32 %r150, %r149, %r34;
mul.lo.s32 %r151, %r150, %r34;
sub.s32 %r152, %r149, %r151;
div.s32 %r153, %r152, %r70;
mul.lo.s32 %r154, %r153, %r70;
sub.s32 %r155, %r152, %r154;
mad.lo.s32 %r156, %r147, %r72, %r35;
mad.lo.s32 %r157, %r150, %r73, %r156;
mad.lo.s32 %r158, %r153, %r74, %r157;
mad.lo.s32 %r159, %r155, %r75, %r158;
mul.wide.s32 %rd11, %r159, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f86, [%rd12];
$L__BB0_12:
mov.f32 %f88, 0f00000000;
mov.f32 %f87, %f88;
@%p8 bra $L__BB0_15;
div.s32 %r160, %r28, %r29;
setp.ge.s32 %p17, %r30, %r160;
setp.ge.s32 %p18, %r31, %r27;
or.pred %p19, %p17, %p18;
@%p19 bra $L__BB0_15;
add.s32 %r161, %r32, 2;
div.s32 %r162, %r161, %r33;
mul.lo.s32 %r163, %r162, %r33;
sub.s32 %r164, %r161, %r163;
div.s32 %r165, %r164, %r34;
mul.lo.s32 %r166, %r165, %r34;
sub.s32 %r167, %r164, %r166;
div.s32 %r168, %r167, %r70;
mul.lo.s32 %r169, %r168, %r70;
sub.s32 %r170, %r167, %r169;
mad.lo.s32 %r171, %r162, %r72, %r35;
mad.lo.s32 %r172, %r165, %r73, %r171;
mad.lo.s32 %r173, %r168, %r74, %r172;
mad.lo.s32 %r174, %r170, %r75, %r173;
mul.wide.s32 %rd13, %r174, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f87, [%rd14];
$L__BB0_15:
@%p8 bra $L__BB0_18;
div.s32 %r175, %r28, %r29;
setp.ge.s32 %p21, %r30, %r175;
setp.ge.s32 %p22, %r31, %r27;
or.pred %p23, %p21, %p22;
@%p23 bra $L__BB0_18;
add.s32 %r176, %r32, 3;
div.s32 %r177, %r176, %r33;
mul.lo.s32 %r178, %r177, %r33;
sub.s32 %r179, %r176, %r178;
div.s32 %r180, %r179, %r34;
mul.lo.s32 %r181, %r180, %r34;
sub.s32 %r182, %r179, %r181;
div.s32 %r183, %r182, %r70;
mul.lo.s32 %r184, %r183, %r70;
sub.s32 %r185, %r182, %r184;
mad.lo.s32 %r186, %r177, %r72, %r35;
mad.lo.s32 %r187, %r180, %r73, %r186;
mad.lo.s32 %r188, %r183, %r74, %r187;
mad.lo.s32 %r189, %r185, %r75, %r188;
mul.wide.s32 %rd15, %r189, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f88, [%rd16];
$L__BB0_18:
mul.lo.s32 %r36, %r78, %r79;
add.s32 %r190, %r36, 1;
shr.u32 %r191, %r190, 31;
add.s32 %r192, %r190, %r191;
shr.s32 %r37, %r192, 1;
mad.lo.s32 %r193, %r30, %r29, %r4;
shl.b32 %r194, %r193, 2;
shl.b32 %r195, %r12, 3;
add.s32 %r38, %r195, %r194;
mul.lo.s32 %r39, %r37, %r76;
add.s32 %r196, %r39, 1;
shr.u32 %r197, %r196, 31;
add.s32 %r198, %r196, %r197;
shr.s32 %r40, %r198, 1;
mul.lo.s32 %r41, %r36, %r15;
mov.f32 %f90, 0f00000000;
mov.f32 %f89, %f90;
@%p8 bra $L__BB0_21;
div.s32 %r199, %r38, %r33;
mul.lo.s32 %r200, %r199, %r33;
sub.s32 %r201, %r38, %r200;
div.s32 %r202, %r201, %r34;
mul.lo.s32 %r203, %r202, %r34;
sub.s32 %r204, %r201, %r203;
mad.lo.s32 %r205, %r202, %r40, %r204;
div.s32 %r206, %r205, %r76;
mad.lo.s32 %r42, %r199, %r37, %r206;
setp.ge.s32 %p25, %r42, %r36;
setp.ge.s32 %p26, %r205, %r39;
or.pred %p27, %p25, %p26;
@%p27 bra $L__BB0_21;
add.s32 %r207, %r42, %r41;
mul.wide.s32 %rd17, %r207, 4;
add.s64 %rd18, %rd2, %rd17;
ld.global.f32 %f89, [%rd18];
$L__BB0_21:
@%p8 bra $L__BB0_24;
add.s32 %r208, %r38, 1;
div.s32 %r209, %r208, %r33;
mul.lo.s32 %r210, %r209, %r33;
sub.s32 %r211, %r208, %r210;
div.s32 %r212, %r211, %r34;
mul.lo.s32 %r213, %r212, %r34;
sub.s32 %r214, %r211, %r213;
mad.lo.s32 %r215, %r212, %r40, %r214;
div.s32 %r216, %r215, %r76;
mad.lo.s32 %r43, %r209, %r37, %r216;
setp.ge.s32 %p29, %r43, %r36;
setp.ge.s32 %p30, %r215, %r39;
or.pred %p31, %p29, %p30;
@%p31 bra $L__BB0_24;
add.s32 %r217, %r43, %r41;
mul.wide.s32 %rd19, %r217, 4;
add.s64 %rd20, %rd2, %rd19;
ld.global.f32 %f90, [%rd20];
$L__BB0_24:
mov.f32 %f92, 0f00000000;
mov.f32 %f91, %f92;
@%p8 bra $L__BB0_27;
add.s32 %r218, %r38, 2;
div.s32 %r219, %r218, %r33;
mul.lo.s32 %r220, %r219, %r33;
sub.s32 %r221, %r218, %r220;
div.s32 %r222, %r221, %r34;
mul.lo.s32 %r223, %r222, %r34;
sub.s32 %r224, %r221, %r223;
mad.lo.s32 %r225, %r222, %r40, %r224;
div.s32 %r226, %r225, %r76;
mad.lo.s32 %r44, %r219, %r37, %r226;
setp.ge.s32 %p33, %r44, %r36;
setp.ge.s32 %p34, %r225, %r39;
or.pred %p35, %p33, %p34;
@%p35 bra $L__BB0_27;
add.s32 %r227, %r44, %r41;
mul.wide.s32 %rd21, %r227, 4;
add.s64 %rd22, %rd2, %rd21;
ld.global.f32 %f91, [%rd22];
$L__BB0_27:
@%p8 bra $L__BB0_30;
add.s32 %r228, %r38, 3;
div.s32 %r229, %r228, %r33;
mul.lo.s32 %r230, %r229, %r33;
sub.s32 %r231, %r228, %r230;
div.s32 %r232, %r231, %r34;
mul.lo.s32 %r233, %r232, %r34;
sub.s32 %r234, %r231, %r233;
mad.lo.s32 %r235, %r232, %r40, %r234;
div.s32 %r236, %r235, %r76;
mad.lo.s32 %r45, %r229, %r37, %r236;
setp.ge.s32 %p37, %r45, %r36;
setp.ge.s32 %p38, %r235, %r39;
or.pred %p39, %p37, %p38;
@%p39 bra $L__BB0_30;
add.s32 %r237, %r45, %r41;
mul.wide.s32 %rd23, %r237, 4;
add.s64 %rd24, %rd2, %rd23;
ld.global.f32 %f92, [%rd24];
$L__BB0_30:
mov.f32 %f93, 0f00000000;
mov.f32 %f94, 0f00000000;
mov.f32 %f95, 0f00000000;
mov.f32 %f96, 0f00000000;
@%p8 bra $L__BB0_34;
div.s32 %r238, %r28, %r29;
setp.ge.s32 %p41, %r30, %r238;
@%p41 bra $L__BB0_34;
setp.ge.s32 %p42, %r31, %r27;
@%p42 bra $L__BB0_34;
mul.lo.s32 %r243, %r15, %r79;
mul.lo.s32 %r244, %r78, %r76;
mad.lo.s32 %r245, %r244, %r243, %r31;
add.s32 %r246, %r245, -3;
mul.wide.s32 %rd26, %r246, 4;
add.s64 %rd25, %rd3, %rd26;
// begin inline asm
ld.global.cs.v4.u32 {%r239,%r240,%r241,%r242}, [%rd25];
// end inline asm
mov.b32 %f96, %r239;
mov.b32 %f95, %r240;
mov.b32 %f94, %r241;
mov.b32 %f93, %r242;
$L__BB0_34:
fma.rn.f32 %f25, %f96, %f89, %f85;
fma.rn.f32 %f26, %f95, %f90, %f86;
fma.rn.f32 %f27, %f94, %f91, %f87;
fma.rn.f32 %f28, %f93, %f92, %f88;
@%p8 bra $L__BB0_38;
div.s32 %r247, %r28, %r29;
setp.ge.s32 %p44, %r30, %r247;
setp.ge.s32 %p45, %r31, %r27;
or.pred %p46, %p44, %p45;
@%p46 bra $L__BB0_38;
mad.lo.s32 %r252, %r27, %r14, %r31;
add.s32 %r253, %r252, -3;
mad.lo.s32 %r254, %r27, %r13, %r253;
mul.wide.s32 %rd28, %r254, 4;
add.s64 %rd27, %rd6, %rd28;
mov.b32 %r248, %f25;
mov.b32 %r249, %f26;
mov.b32 %r250, %f27;
mov.b32 %r251, %f28;
// begin inline asm
st.global.cs.v4.s32 [%rd27], {%r248,%r249,%r250,%r251};
// end inline asm
$L__BB0_38:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3[48]
)
{
.reg .pred %p<37>;
.reg .f32 %f<97>;
.reg .b32 %r<370>;
.reg .b64 %rd<49>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r65, %r66}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+8];
ld.param.v2.u32 {%r69, %r70}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+24];
ld.param.v2.u32 {%r71, %r72}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+32];
ld.param.v2.u32 {%r73, %r74}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+40];
ld.param.v2.u32 {%r75, %r76}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+24];
ld.param.v2.u32 {%r77, %r78}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+16];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd7;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2];
mov.u32 %r4, %tid.x;
setp.ne.s32 %p1, %r4, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r89, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s], %r89;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd8, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r90, [%rd8], %r4;
ld.shared.u32 %r12, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_154_cu_b0e7166a_1601111nvfuser_154ENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s];
mov.u32 %r91, %ctaid.x;
mov.u32 %r92, %ntid.y;
mul.lo.s32 %r13, %r92, %r91;
mov.u32 %r14, %tid.y;
add.s32 %r15, %r13, %r14;
setp.lt.s32 %p2, %r15, %r65;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_6;
$L__BB0_3:
mov.u32 %r16, %ctaid.y;
mul.lo.s32 %r17, %r69, 12;
or.b32 %r93, %r17, 3;
shr.s32 %r94, %r93, 31;
shr.u32 %r95, %r94, 30;
add.s32 %r96, %r93, %r95;
shr.s32 %r97, %r96, 2;
mov.u32 %r18, %ntid.x;
add.s32 %r98, %r18, %r97;
add.s32 %r99, %r98, -1;
div.s32 %r100, %r99, %r18;
setp.ge.s32 %p3, %r16, %r100;
@%p3 bra $L__BB0_6;
shl.b32 %r19, %r4, 2;
mul.lo.s32 %r101, %r18, %r16;
shl.b32 %r20, %r101, 2;
add.s32 %r102, %r19, %r20;
or.b32 %r21, %r102, 3;
setp.ge.s32 %p4, %r21, %r17;
@%p4 bra $L__BB0_6;
mul.lo.s32 %r22, %r77, %r78;
add.s32 %r103, %r22, 1;
shr.u32 %r104, %r103, 31;
add.s32 %r105, %r103, %r104;
shr.s32 %r23, %r105, 1;
mul.lo.s32 %r24, %r69, 6;
div.s32 %r106, %r21, %r24;
mul.lo.s32 %r107, %r106, %r24;
sub.s32 %r108, %r21, %r107;
mul.lo.s32 %r25, %r69, 3;
div.s32 %r109, %r108, %r25;
mul.lo.s32 %r110, %r109, %r25;
sub.s32 %r111, %r108, %r110;
mad.lo.s32 %r112, %r23, %r75, 1;
shr.u32 %r113, %r112, 31;
add.s32 %r114, %r112, %r113;
shr.s32 %r26, %r114, 1;
mad.lo.s32 %r115, %r109, %r26, %r111;
div.s32 %r116, %r115, %r75;
mad.lo.s32 %r117, %r106, %r23, %r116;
setp.lt.s32 %p5, %r117, %r22;
@%p5 bra $L__BB0_37;
bra.uni $L__BB0_6;
$L__BB0_37:
shl.b32 %r261, %r12, 1;
add.s32 %r262, %r20, %r19;
add.s32 %r263, %r262, %r261;
div.s32 %r264, %r263, %r24;
mul.lo.s32 %r265, %r264, %r24;
sub.s32 %r266, %r263, %r265;
div.s32 %r267, %r266, %r25;
mul.lo.s32 %r268, %r267, %r25;
sub.s32 %r269, %r266, %r268;
div.s32 %r270, %r269, %r69;
mul.lo.s32 %r271, %r270, %r69;
sub.s32 %r272, %r269, %r271;
mul.lo.s32 %r273, %r15, %r70;
mad.lo.s32 %r274, %r264, %r71, %r273;
mad.lo.s32 %r275, %r267, %r72, %r274;
mad.lo.s32 %r276, %r270, %r73, %r275;
mad.lo.s32 %r277, %r272, %r74, %r276;
mul.wide.s32 %rd31, %r277, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.f32 %f57, [%rd32];
add.s32 %r278, %r263, 1;
div.s32 %r279, %r278, %r24;
mul.lo.s32 %r280, %r279, %r24;
sub.s32 %r281, %r278, %r280;
div.s32 %r282, %r281, %r25;
mul.lo.s32 %r283, %r282, %r25;
sub.s32 %r284, %r281, %r283;
div.s32 %r285, %r284, %r69;
mul.lo.s32 %r286, %r285, %r69;
sub.s32 %r287, %r284, %r286;
mad.lo.s32 %r288, %r279, %r71, %r273;
mad.lo.s32 %r289, %r282, %r72, %r288;
mad.lo.s32 %r290, %r285, %r73, %r289;
mad.lo.s32 %r291, %r287, %r74, %r290;
mul.wide.s32 %rd33, %r291, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.f32 %f58, [%rd34];
add.s32 %r292, %r263, 2;
div.s32 %r293, %r292, %r24;
mul.lo.s32 %r294, %r293, %r24;
sub.s32 %r295, %r292, %r294;
div.s32 %r296, %r295, %r25;
mul.lo.s32 %r297, %r296, %r25;
sub.s32 %r298, %r295, %r297;
div.s32 %r299, %r298, %r69;
mul.lo.s32 %r300, %r299, %r69;
sub.s32 %r301, %r298, %r300;
mad.lo.s32 %r302, %r293, %r71, %r273;
mad.lo.s32 %r303, %r296, %r72, %r302;
mad.lo.s32 %r304, %r299, %r73, %r303;
mad.lo.s32 %r305, %r301, %r74, %r304;
mul.wide.s32 %rd35, %r305, 4;
add.s64 %rd36, %rd1, %rd35;
ld.global.f32 %f59, [%rd36];
add.s32 %r306, %r263, 3;
div.s32 %r307, %r306, %r24;
mul.lo.s32 %r308, %r307, %r24;
sub.s32 %r309, %r306, %r308;
div.s32 %r310, %r309, %r25;
mul.lo.s32 %r311, %r310, %r25;
sub.s32 %r312, %r309, %r311;
div.s32 %r313, %r312, %r69;
mul.lo.s32 %r314, %r313, %r69;
sub.s32 %r315, %r312, %r314;
mad.lo.s32 %r316, %r307, %r71, %r273;
mad.lo.s32 %r317, %r310, %r72, %r316;
mad.lo.s32 %r318, %r313, %r73, %r317;
mad.lo.s32 %r319, %r315, %r74, %r318;
mul.wide.s32 %rd37, %r319, 4;
add.s64 %rd38, %rd1, %rd37;
ld.global.f32 %f60, [%rd38];
shl.b32 %r320, %r12, 3;
add.s32 %r321, %r262, %r320;
div.s32 %r322, %r321, %r24;
mul.lo.s32 %r323, %r322, %r24;
sub.s32 %r324, %r321, %r323;
div.s32 %r325, %r324, %r25;
mul.lo.s32 %r326, %r325, %r25;
sub.s32 %r327, %r324, %r326;
mad.lo.s32 %r328, %r325, %r26, %r327;
div.s32 %r329, %r328, %r75;
mul.lo.s32 %r330, %r22, %r15;
mad.lo.s32 %r331, %r322, %r23, %r330;
add.s32 %r332, %r331, %r329;
mul.wide.s32 %rd39, %r332, 4;
add.s64 %rd40, %rd2, %rd39;
ld.global.f32 %f61, [%rd40];
add.s32 %r333, %r321, 1;
div.s32 %r334, %r333, %r24;
mul.lo.s32 %r335, %r334, %r24;
sub.s32 %r336, %r333, %r335;
div.s32 %r337, %r336, %r25;
mul.lo.s32 %r338, %r337, %r25;
sub.s32 %r339, %r336, %r338;
mad.lo.s32 %r340, %r337, %r26, %r339;
div.s32 %r341, %r340, %r75;
mad.lo.s32 %r342, %r334, %r23, %r330;
add.s32 %r343, %r342, %r341;
mul.wide.s32 %rd41, %r343, 4;
add.s64 %rd42, %rd2, %rd41;
ld.global.f32 %f62, [%rd42];
add.s32 %r344, %r321, 2;
div.s32 %r345, %r344, %r24;
mul.lo.s32 %r346, %r345, %r24;
sub.s32 %r347, %r344, %r346;
div.s32 %r348, %r347, %r25;
mul.lo.s32 %r349, %r348, %r25;
sub.s32 %r350, %r347, %r349;
mad.lo.s32 %r351, %r348, %r26, %r350;
div.s32 %r352, %r351, %r75;
mad.lo.s32 %r353, %r345, %r23, %r330;
add.s32 %r354, %r353, %r352;
mul.wide.s32 %rd43, %r354, 4;
add.s64 %rd44, %rd2, %rd43;
ld.global.f32 %f63, [%rd44];
add.s32 %r355, %r321, 3;
div.s32 %r356, %r355, %r24;
mul.lo.s32 %r357, %r356, %r24;
sub.s32 %r358, %r355, %r357;
div.s32 %r359, %r358, %r25;
mul.lo.s32 %r360, %r359, %r25;
sub.s32 %r361, %r358, %r360;
mad.lo.s32 %r362, %r359, %r26, %r361;
div.s32 %r363, %r362, %r75;
mad.lo.s32 %r364, %r356, %r23, %r330;
add.s32 %r365, %r364, %r363;
mul.wide.s32 %rd45, %r365, 4;
add.s64 %rd46, %rd2, %rd45;
ld.global.f32 %f64, [%rd46];
mul.lo.s32 %r366, %r77, %r75;
mul.lo.s32 %r367, %r15, %r78;
mad.lo.s32 %r368, %r366, %r367, %r262;
mul.wide.s32 %rd47, %r368, 4;
add.s64 %rd29, %rd3, %rd47;
// begin inline asm
ld.global.cs.v4.u32 {%r253,%r254,%r255,%r256}, [%rd29];
// end inline asm
mov.b32 %f65, %r253;
fma.rn.f32 %f66, %f61, %f65, %f57;
mov.b32 %r257, %f66;
mov.b32 %f67, %r254;
fma.rn.f32 %f68, %f62, %f67, %f58;
mov.b32 %r258, %f68;
mov.b32 %f69, %r255;
fma.rn.f32 %f70, %f63, %f69, %f59;
mov.b32 %r259, %f70;
mov.b32 %f71, %r256;
fma.rn.f32 %f72, %f64, %f71, %f60;
mov.b32 %r260, %f72;
mad.lo.s32 %r369, %r17, %r15, %r262;
mul.wide.s32 %rd48, %r369, 4;
add.s64 %rd30, %rd6, %rd48;
// begin inline asm
st.global.cs.v4.s32 [%rd30], {%r257,%r258,%r259,%r260};
// end inline asm
bra.uni $L__BB0_38;
$L__BB0_6:
setp.ge.s32 %p6, %r15, %r65;
mul.lo.s32 %r27, %r69, 12;
or.b32 %r118, %r27, 3;
shr.s32 %r119, %r118, 31;
shr.u32 %r120, %r119, 30;
add.s32 %r121, %r118, %r120;
shr.s32 %r122, %r121, 2;
mov.u32 %r29, %ntid.x;
add.s32 %r123, %r29, %r122;
add.s32 %r28, %r123, -1;
shl.b32 %r124, %r4, 2;
shl.b32 %r125, %r29, 2;
mov.u32 %r30, %ctaid.y;
mad.lo.s32 %r126, %r125, %r30, %r124;
or.b32 %r31, %r126, 3;
shl.b32 %r127, %r12, 1;
add.s32 %r128, %r31, %r127;
add.s32 %r32, %r128, -3;
mul.lo.s32 %r33, %r69, 6;
mul.lo.s32 %r34, %r69, 3;
mul.lo.s32 %r35, %r15, %r70;
mov.f32 %f86, 0f00000000;
mov.f32 %f85, %f86;
@%p6 bra $L__BB0_9;
div.s32 %r129, %r28, %r29;
setp.ge.s32 %p7, %r30, %r129;
setp.ge.s32 %p8, %r31, %r27;
or.pred %p9, %p7, %p8;
@%p9 bra $L__BB0_9;
div.s32 %r130, %r32, %r33;
mul.lo.s32 %r131, %r130, %r33;
sub.s32 %r132, %r32, %r131;
div.s32 %r133, %r132, %r34;
mul.lo.s32 %r134, %r133, %r34;
sub.s32 %r135, %r132, %r134;
div.s32 %r136, %r135, %r69;
mul.lo.s32 %r137, %r136, %r69;
sub.s32 %r138, %r135, %r137;
mad.lo.s32 %r139, %r130, %r71, %r35;
mad.lo.s32 %r140, %r133, %r72, %r139;
mad.lo.s32 %r141, %r136, %r73, %r140;
mad.lo.s32 %r142, %r138, %r74, %r141;
mul.wide.s32 %rd9, %r142, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f85, [%rd10];
$L__BB0_9:
@%p6 bra $L__BB0_12;
div.s32 %r143, %r28, %r29;
setp.ge.s32 %p11, %r30, %r143;
setp.ge.s32 %p12, %r31, %r27;
or.pred %p13, %p11, %p12;
@%p13 bra $L__BB0_12;
add.s32 %r144, %r32, 1;
div.s32 %r145, %r144, %r33;
mul.lo.s32 %r146, %r145, %r33;
sub.s32 %r147, %r144, %r146;
div.s32 %r148, %r147, %r34;
mul.lo.s32 %r149, %r148, %r34;
sub.s32 %r150, %r147, %r149;
div.s32 %r151, %r150, %r69;
mul.lo.s32 %r152, %r151, %r69;
sub.s32 %r153, %r150, %r152;
mad.lo.s32 %r154, %r145, %r71, %r35;
mad.lo.s32 %r155, %r148, %r72, %r154;
mad.lo.s32 %r156, %r151, %r73, %r155;
mad.lo.s32 %r157, %r153, %r74, %r156;
mul.wide.s32 %rd11, %r157, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f86, [%rd12];
$L__BB0_12:
mov.f32 %f88, 0f00000000;
mov.f32 %f87, %f88;
@%p6 bra $L__BB0_15;
div.s32 %r158, %r28, %r29;
setp.ge.s32 %p15, %r30, %r158;
setp.ge.s32 %p16, %r31, %r27;
or.pred %p17, %p15, %p16;
@%p17 bra $L__BB0_15;
add.s32 %r159, %r32, 2;
div.s32 %r160, %r159, %r33;
mul.lo.s32 %r161, %r160, %r33;
sub.s32 %r162, %r159, %r161;
div.s32 %r163, %r162, %r34;
mul.lo.s32 %r164, %r163, %r34;
sub.s32 %r165, %r162, %r164;
div.s32 %r166, %r165, %r69;
mul.lo.s32 %r167, %r166, %r69;
sub.s32 %r168, %r165, %r167;
mad.lo.s32 %r169, %r160, %r71, %r35;
mad.lo.s32 %r170, %r163, %r72, %r169;
mad.lo.s32 %r171, %r166, %r73, %r170;
mad.lo.s32 %r172, %r168, %r74, %r171;
mul.wide.s32 %rd13, %r172, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f87, [%rd14];
$L__BB0_15:
@%p6 bra $L__BB0_18;
div.s32 %r173, %r28, %r29;
setp.ge.s32 %p19, %r30, %r173;
setp.ge.s32 %p20, %r31, %r27;
or.pred %p21, %p19, %p20;
@%p21 bra $L__BB0_18;
add.s32 %r174, %r32, 3;
div.s32 %r175, %r174, %r33;
mul.lo.s32 %r176, %r175, %r33;
sub.s32 %r177, %r174, %r176;
div.s32 %r178, %r177, %r34;
mul.lo.s32 %r179, %r178, %r34;
sub.s32 %r180, %r177, %r179;
div.s32 %r181, %r180, %r69;
mul.lo.s32 %r182, %r181, %r69;
sub.s32 %r183, %r180, %r182;
mad.lo.s32 %r184, %r175, %r71, %r35;
mad.lo.s32 %r185, %r178, %r72, %r184;
mad.lo.s32 %r186, %r181, %r73, %r185;
mad.lo.s32 %r187, %r183, %r74, %r186;
mul.wide.s32 %rd15, %r187, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f88, [%rd16];
$L__BB0_18:
mul.lo.s32 %r36, %r77, %r78;
add.s32 %r188, %r36, 1;
shr.u32 %r189, %r188, 31;
add.s32 %r190, %r188, %r189;
shr.s32 %r37, %r190, 1;
mad.lo.s32 %r191, %r30, %r29, %r4;
shl.b32 %r192, %r191, 2;
shl.b32 %r193, %r12, 3;
add.s32 %r38, %r193, %r192;
mad.lo.s32 %r194, %r37, %r75, 1;
shr.u32 %r195, %r194, 31;
add.s32 %r196, %r194, %r195;
shr.s32 %r39, %r196, 1;
mul.lo.s32 %r40, %r36, %r15;
mov.f32 %f90, 0f00000000;
mov.f32 %f89, %f90;
@%p6 bra $L__BB0_21;
div.s32 %r197, %r38, %r33;
mul.lo.s32 %r198, %r197, %r33;
sub.s32 %r199, %r38, %r198;
div.s32 %r200, %r199, %r34;
mul.lo.s32 %r201, %r200, %r34;
sub.s32 %r202, %r199, %r201;
mad.lo.s32 %r203, %r200, %r39, %r202;
div.s32 %r204, %r203, %r75;
mad.lo.s32 %r41, %r197, %r37, %r204;
setp.ge.s32 %p23, %r41, %r36;
@%p23 bra $L__BB0_21;
add.s32 %r205, %r41, %r40;
mul.wide.s32 %rd17, %r205, 4;
add.s64 %rd18, %rd2, %rd17;
ld.global.f32 %f89, [%rd18];
$L__BB0_21:
@%p6 bra $L__BB0_24;
add.s32 %r206, %r38, 1;
div.s32 %r207, %r206, %r33;
mul.lo.s32 %r208, %r207, %r33;
sub.s32 %r209, %r206, %r208;
div.s32 %r210, %r209, %r34;
mul.lo.s32 %r211, %r210, %r34;
sub.s32 %r212, %r209, %r211;
mad.lo.s32 %r213, %r210, %r39, %r212;
div.s32 %r214, %r213, %r75;
mad.lo.s32 %r42, %r207, %r37, %r214;
setp.ge.s32 %p25, %r42, %r36;
@%p25 bra $L__BB0_24;
add.s32 %r215, %r42, %r40;
mul.wide.s32 %rd19, %r215, 4;
add.s64 %rd20, %rd2, %rd19;
ld.global.f32 %f90, [%rd20];
$L__BB0_24:
mov.f32 %f92, 0f00000000;
mov.f32 %f91, %f92;
@%p6 bra $L__BB0_27;
add.s32 %r216, %r38, 2;
div.s32 %r217, %r216, %r33;
mul.lo.s32 %r218, %r217, %r33;
sub.s32 %r219, %r216, %r218;
div.s32 %r220, %r219, %r34;
mul.lo.s32 %r221, %r220, %r34;
sub.s32 %r222, %r219, %r221;
mad.lo.s32 %r223, %r220, %r39, %r222;
div.s32 %r224, %r223, %r75;
mad.lo.s32 %r43, %r217, %r37, %r224;
setp.ge.s32 %p27, %r43, %r36;
@%p27 bra $L__BB0_27;
add.s32 %r225, %r43, %r40;
mul.wide.s32 %rd21, %r225, 4;
add.s64 %rd22, %rd2, %rd21;
ld.global.f32 %f91, [%rd22];
$L__BB0_27:
@%p6 bra $L__BB0_30;
add.s32 %r226, %r38, 3;
div.s32 %r227, %r226, %r33;
mul.lo.s32 %r228, %r227, %r33;
sub.s32 %r229, %r226, %r228;
div.s32 %r230, %r229, %r34;
mul.lo.s32 %r231, %r230, %r34;
sub.s32 %r232, %r229, %r231;
mad.lo.s32 %r233, %r230, %r39, %r232;
div.s32 %r234, %r233, %r75;
mad.lo.s32 %r44, %r227, %r37, %r234;
setp.ge.s32 %p29, %r44, %r36;
@%p29 bra $L__BB0_30;
add.s32 %r235, %r44, %r40;
mul.wide.s32 %rd23, %r235, 4;
add.s64 %rd24, %rd2, %rd23;
ld.global.f32 %f92, [%rd24];
$L__BB0_30:
mov.f32 %f93, 0f00000000;
mov.f32 %f94, 0f00000000;
mov.f32 %f95, 0f00000000;
mov.f32 %f96, 0f00000000;
@%p6 bra $L__BB0_34;
div.s32 %r236, %r28, %r29;
setp.ge.s32 %p31, %r30, %r236;
@%p31 bra $L__BB0_34;
setp.ge.s32 %p32, %r31, %r27;
@%p32 bra $L__BB0_34;
mul.lo.s32 %r241, %r15, %r78;
mul.lo.s32 %r242, %r77, %r75;
mad.lo.s32 %r243, %r242, %r241, %r31;
add.s32 %r244, %r243, -3;
mul.wide.s32 %rd26, %r244, 4;
add.s64 %rd25, %rd3, %rd26;
// begin inline asm
ld.global.cs.v4.u32 {%r237,%r238,%r239,%r240}, [%rd25];
// end inline asm
mov.b32 %f96, %r237;
mov.b32 %f95, %r238;
mov.b32 %f94, %r239;
mov.b32 %f93, %r240;
$L__BB0_34:
fma.rn.f32 %f25, %f96, %f89, %f85;
fma.rn.f32 %f26, %f95, %f90, %f86;
fma.rn.f32 %f27, %f94, %f91, %f87;
fma.rn.f32 %f28, %f93, %f92, %f88;
@%p6 bra $L__BB0_38;
div.s32 %r245, %r28, %r29;
setp.ge.s32 %p34, %r30, %r245;
setp.ge.s32 %p35, %r31, %r27;
or.pred %p36, %p34, %p35;
@%p36 bra $L__BB0_38;
mad.lo.s32 %r250, %r27, %r14, %r31;
add.s32 %r251, %r250, -3;
mad.lo.s32 %r252, %r27, %r13, %r251;
mul.wide.s32 %rd28, %r252, 4;
add.s64 %rd27, %rd6, %rd28;
mov.b32 %r246, %f25;
mov.b32 %r247, %f26;
mov.b32 %r248, %f27;
mov.b32 %r249, %f28;
// begin inline asm
st.global.cs.v4.s32 [%rd27], {%r246,%r247,%r248,%r249};
// end inline asm
$L__BB0_38:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,552 +20,540 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1[48],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2[48],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3[48]
)
{
- .reg .pred %p<47>;
+ .reg .pred %p<37>;
.reg .f32 %f<97>;
- .reg .b32 %r<372>;
+ .reg .b32 %r<370>;
.reg .b64 %rd<49>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
- ld.param.v2.u32 {%r66, %r67}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+8];
- ld.param.v2.u32 {%r70, %r71}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+24];
- ld.param.v2.u32 {%r72, %r73}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+32];
- ld.param.v2.u32 {%r74, %r75}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+40];
- ld.param.v2.u32 {%r76, %r77}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+24];
- ld.param.v2.u32 {%r78, %r79}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+16];
+ ld.param.v2.u32 {%r65, %r66}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+8];
+ ld.param.v2.u32 {%r69, %r70}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+24];
+ ld.param.v2.u32 {%r71, %r72}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+32];
+ ld.param.v2.u32 {%r73, %r74}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0+40];
+ ld.param.v2.u32 {%r75, %r76}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+24];
+ ld.param.v2.u32 {%r77, %r78}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2+16];
ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_3];
ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_0];
ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_1];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd7;
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1__param_2];
mov.u32 %r4, %tid.x;
setp.ne.s32 %p1, %r4, 0;
@%p1 bra $L__BB0_2;
- mov.u32 %r90, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s], %r90;
+ mov.u32 %r89, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s], %r89;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd8, _ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s;
- atom.shared.min.s32 %r91, [%rd8], %r4;
+ atom.shared.min.s32 %r90, [%rd8], %r4;
ld.shared.u32 %r12, [_ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_S1_S1_E14nvfuser_zero_s];
- mov.u32 %r92, %ctaid.x;
- mov.u32 %r93, %ntid.y;
- mul.lo.s32 %r13, %r93, %r92;
+ mov.u32 %r91, %ctaid.x;
+ mov.u32 %r92, %ntid.y;
+ mul.lo.s32 %r13, %r92, %r91;
mov.u32 %r14, %tid.y;
add.s32 %r15, %r13, %r14;
- setp.lt.s32 %p2, %r15, %r66;
+ setp.lt.s32 %p2, %r15, %r65;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_6;
$L__BB0_3:
mov.u32 %r16, %ctaid.y;
- mul.lo.s32 %r17, %r70, 12;
- or.b32 %r94, %r17, 3;
- shr.s32 %r95, %r94, 31;
- shr.u32 %r96, %r95, 30;
- add.s32 %r97, %r94, %r96;
- shr.s32 %r98, %r97, 2;
+ mul.lo.s32 %r17, %r69, 12;
+ or.b32 %r93, %r17, 3;
+ shr.s32 %r94, %r93, 31;
+ shr.u32 %r95, %r94, 30;
+ add.s32 %r96, %r93, %r95;
+ shr.s32 %r97, %r96, 2;
mov.u32 %r18, %ntid.x;
- add.s32 %r99, %r18, %r98;
- add.s32 %r100, %r99, -1;
- div.s32 %r101, %r100, %r18;
- setp.ge.s32 %p3, %r16, %r101;
+ add.s32 %r98, %r18, %r97;
+ add.s32 %r99, %r98, -1;
+ div.s32 %r100, %r99, %r18;
+ setp.ge.s32 %p3, %r16, %r100;
@%p3 bra $L__BB0_6;
shl.b32 %r19, %r4, 2;
- mul.lo.s32 %r102, %r18, %r16;
- shl.b32 %r20, %r102, 2;
- add.s32 %r103, %r19, %r20;
- or.b32 %r21, %r103, 3;
+ mul.lo.s32 %r101, %r18, %r16;
+ shl.b32 %r20, %r101, 2;
+ add.s32 %r102, %r19, %r20;
+ or.b32 %r21, %r102, 3;
setp.ge.s32 %p4, %r21, %r17;
@%p4 bra $L__BB0_6;
- mul.lo.s32 %r22, %r78, %r79;
- add.s32 %r104, %r22, 1;
- shr.u32 %r105, %r104, 31;
- add.s32 %r106, %r104, %r105;
- shr.s32 %r23, %r106, 1;
- mul.lo.s32 %r24, %r70, 6;
- div.s32 %r107, %r21, %r24;
- mul.lo.s32 %r108, %r107, %r24;
- sub.s32 %r109, %r21, %r108;
- mul.lo.s32 %r25, %r70, 3;
- div.s32 %r110, %r109, %r25;
- mul.lo.s32 %r111, %r110, %r25;
- sub.s32 %r112, %r109, %r111;
- mul.lo.s32 %r113, %r23, %r76;
- add.s32 %r114, %r113, 1;
- shr.u32 %r115, %r114, 31;
- add.s32 %r116, %r114, %r115;
- shr.s32 %r26, %r116, 1;
- mad.lo.s32 %r117, %r110, %r26, %r112;
- div.s32 %r118, %r117, %r76;
- mad.lo.s32 %r119, %r107, %r23, %r118;
- setp.lt.s32 %p5, %r119, %r22;
- setp.lt.s32 %p6, %r117, %r113;
- and.pred %p7, %p5, %p6;
- @%p7 bra $L__BB0_37;
+ mul.lo.s32 %r22, %r77, %r78;
+ add.s32 %r103, %r22, 1;
+ shr.u32 %r104, %r103, 31;
+ add.s32 %r105, %r103, %r104;
+ shr.s32 %r23, %r105, 1;
+ mul.lo.s32 %r24, %r69, 6;
+ div.s32 %r106, %r21, %r24;
+ mul.lo.s32 %r107, %r106, %r24;
+ sub.s32 %r108, %r21, %r107;
+ mul.lo.s32 %r25, %r69, 3;
+ div.s32 %r109, %r108, %r25;
+ mul.lo.s32 %r110, %r109, %r25;
+ sub.s32 %r111, %r108, %r110;
+ mad.lo.s32 %r112, %r23, %r75, 1;
+ shr.u32 %r113, %r112, 31;
+ add.s32 %r114, %r112, %r113;
+ shr.s32 %r26, %r114, 1;
+ mad.lo.s32 %r115, %r109, %r26, %r111;
+ div.s32 %r116, %r115, %r75;
+ mad.lo.s32 %r117, %r106, %r23, %r116;
+ setp.lt.s32 %p5, %r117, %r22;
+ @%p5 bra $L__BB0_37;
bra.uni $L__BB0_6;
$L__BB0_37:
- shl.b32 %r263, %r12, 1;
- add.s32 %r264, %r20, %r19;
- add.s32 %r265, %r264, %r263;
- div.s32 %r266, %r265, %r24;
- mul.lo.s32 %r267, %r266, %r24;
- sub.s32 %r268, %r265, %r267;
- div.s32 %r269, %r268, %r25;
- mul.lo.s32 %r270, %r269, %r25;
- sub.s32 %r271, %r268, %r270;
- div.s32 %r272, %r271, %r70;
- mul.lo.s32 %r273, %r272, %r70;
- sub.s32 %r274, %r271, %r273;
- mul.lo.s32 %r275, %r15, %r71;
- mad.lo.s32 %r276, %r266, %r72, %r275;
- mad.lo.s32 %r277, %r269, %r73, %r276;
- mad.lo.s32 %r278, %r272, %r74, %r277;
- mad.lo.s32 %r279, %r274, %r75, %r278;
- mul.wide.s32 %rd31, %r279, 4;
+ shl.b32 %r261, %r12, 1;
+ add.s32 %r262, %r20, %r19;
+ add.s32 %r263, %r262, %r261;
+ div.s32 %r264, %r263, %r24;
+ mul.lo.s32 %r265, %r264, %r24;
+ sub.s32 %r266, %r263, %r265;
+ div.s32 %r267, %r266, %r25;
+ mul.lo.s32 %r268, %r267, %r25;
+ sub.s32 %r269, %r266, %r268;
+ div.s32 %r270, %r269, %r69;
+ mul.lo.s32 %r271, %r270, %r69;
+ sub.s32 %r272, %r269, %r271;
+ mul.lo.s32 %r273, %r15, %r70;
+ mad.lo.s32 %r274, %r264, %r71, %r273;
+ mad.lo.s32 %r275, %r267, %r72, %r274;
+ mad.lo.s32 %r276, %r270, %r73, %r275;
+ mad.lo.s32 %r277, %r272, %r74, %r276;
+ mul.wide.s32 %rd31, %r277, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.f32 %f57, [%rd32];
- add.s32 %r280, %r265, 1;
- div.s32 %r281, %r280, %r24;
- mul.lo.s32 %r282, %r281, %r24;
- sub.s32 %r283, %r280, %r282;
- div.s32 %r284, %r283, %r25;
- mul.lo.s32 %r285, %r284, %r25;
- sub.s32 %r286, %r283, %r285;
- div.s32 %r287, %r286, %r70;
- mul.lo.s32 %r288, %r287, %r70;
- sub.s32 %r289, %r286, %r288;
- mad.lo.s32 %r290, %r281, %r72, %r275;
- mad.lo.s32 %r291, %r284, %r73, %r290;
- mad.lo.s32 %r292, %r287, %r74, %r291;
- mad.lo.s32 %r293, %r289, %r75, %r292;
- mul.wide.s32 %rd33, %r293, 4;
+ add.s32 %r278, %r263, 1;
+ div.s32 %r279, %r278, %r24;
+ mul.lo.s32 %r280, %r279, %r24;
+ sub.s32 %r281, %r278, %r280;
+ div.s32 %r282, %r281, %r25;
+ mul.lo.s32 %r283, %r282, %r25;
+ sub.s32 %r284, %r281, %r283;
+ div.s32 %r285, %r284, %r69;
+ mul.lo.s32 %r286, %r285, %r69;
+ sub.s32 %r287, %r284, %r286;
+ mad.lo.s32 %r288, %r279, %r71, %r273;
+ mad.lo.s32 %r289, %r282, %r72, %r288;
+ mad.lo.s32 %r290, %r285, %r73, %r289;
+ mad.lo.s32 %r291, %r287, %r74, %r290;
+ mul.wide.s32 %rd33, %r291, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.f32 %f58, [%rd34];
- add.s32 %r294, %r265, 2;
- div.s32 %r295, %r294, %r24;
- mul.lo.s32 %r296, %r295, %r24;
- sub.s32 %r297, %r294, %r296;
- div.s32 %r298, %r297, %r25;
- mul.lo.s32 %r299, %r298, %r25;
- sub.s32 %r300, %r297, %r299;
- div.s32 %r301, %r300, %r70;
- mul.lo.s32 %r302, %r301, %r70;
- sub.s32 %r303, %r300, %r302;
- mad.lo.s32 %r304, %r295, %r72, %r275;
- mad.lo.s32 %r305, %r298, %r73, %r304;
- mad.lo.s32 %r306, %r301, %r74, %r305;
- mad.lo.s32 %r307, %r303, %r75, %r306;
- mul.wide.s32 %rd35, %r307, 4;
+ add.s32 %r292, %r263, 2;
+ div.s32 %r293, %r292, %r24;
+ mul.lo.s32 %r294, %r293, %r24;
+ sub.s32 %r295, %r292, %r294;
+ div.s32 %r296, %r295, %r25;
+ mul.lo.s32 %r297, %r296, %r25;
+ sub.s32 %r298, %r295, %r297;
+ div.s32 %r299, %r298, %r69;
+ mul.lo.s32 %r300, %r299, %r69;
+ sub.s32 %r301, %r298, %r300;
+ mad.lo.s32 %r302, %r293, %r71, %r273;
+ mad.lo.s32 %r303, %r296, %r72, %r302;
+ mad.lo.s32 %r304, %r299, %r73, %r303;
+ mad.lo.s32 %r305, %r301, %r74, %r304;
+ mul.wide.s32 %rd35, %r305, 4;
add.s64 %rd36, %rd1, %rd35;
ld.global.f32 %f59, [%rd36];
- add.s32 %r308, %r265, 3;
- div.s32 %r309, %r308, %r24;
- mul.lo.s32 %r310, %r309, %r24;
- sub.s32 %r311, %r308, %r310;
- div.s32 %r312, %r311, %r25;
- mul.lo.s32 %r313, %r312, %r25;
- sub.s32 %r314, %r311, %r313;
- div.s32 %r315, %r314, %r70;
- mul.lo.s32 %r316, %r315, %r70;
- sub.s32 %r317, %r314, %r316;
- mad.lo.s32 %r318, %r309, %r72, %r275;
- mad.lo.s32 %r319, %r312, %r73, %r318;
- mad.lo.s32 %r320, %r315, %r74, %r319;
- mad.lo.s32 %r321, %r317, %r75, %r320;
- mul.wide.s32 %rd37, %r321, 4;
+ add.s32 %r306, %r263, 3;
+ div.s32 %r307, %r306, %r24;
+ mul.lo.s32 %r308, %r307, %r24;
+ sub.s32 %r309, %r306, %r308;
+ div.s32 %r310, %r309, %r25;
+ mul.lo.s32 %r311, %r310, %r25;
+ sub.s32 %r312, %r309, %r311;
+ div.s32 %r313, %r312, %r69;
+ mul.lo.s32 %r314, %r313, %r69;
+ sub.s32 %r315, %r312, %r314;
+ mad.lo.s32 %r316, %r307, %r71, %r273;
+ mad.lo.s32 %r317, %r310, %r72, %r316;
+ mad.lo.s32 %r318, %r313, %r73, %r317;
+ mad.lo.s32 %r319, %r315, %r74, %r318;
+ mul.wide.s32 %rd37, %r319, 4;
add.s64 %rd38, %rd1, %rd37;
ld.global.f32 %f60, [%rd38];
- shl.b32 %r322, %r12, 3;
- add.s32 %r323, %r264, %r322;
- div.s32 %r324, %r323, %r24;
- mul.lo.s32 %r325, %r324, %r24;
- sub.s32 %r326, %r323, %r325;
- div.s32 %r327, %r326, %r25;
- mul.lo.s32 %r328, %r327, %r25;
- sub.s32 %r329, %r326, %r328;
- mad.lo.s32 %r330, %r327, %r26, %r329;
- div.s32 %r331, %r330, %r76;
- mul.lo.s32 %r332, %r22, %r15;
- mad.lo.s32 %r333, %r324, %r23, %r332;
- add.s32 %r334, %r333, %r331;
- mul.wide.s32 %rd39, %r334, 4;
+ shl.b32 %r320, %r12, 3;
+ add.s32 %r321, %r262, %r320;
+ div.s32 %r322, %r321, %r24;
+ mul.lo.s32 %r323, %r322, %r24;
+ sub.s32 %r324, %r321, %r323;
+ div.s32 %r325, %r324, %r25;
+ mul.lo.s32 %r326, %r325, %r25;
+ sub.s32 %r327, %r324, %r326;
+ mad.lo.s32 %r328, %r325, %r26, %r327;
+ div.s32 %r329, %r328, %r75;
+ mul.lo.s32 %r330, %r22, %r15;
+ mad.lo.s32 %r331, %r322, %r23, %r330;
+ add.s32 %r332, %r331, %r329;
+ mul.wide.s32 %rd39, %r332, 4;
add.s64 %rd40, %rd2, %rd39;
ld.global.f32 %f61, [%rd40];
- add.s32 %r335, %r323, 1;
- div.s32 %r336, %r335, %r24;
- mul.lo.s32 %r337, %r336, %r24;
- sub.s32 %r338, %r335, %r337;
- div.s32 %r339, %r338, %r25;
- mul.lo.s32 %r340, %r339, %r25;
- sub.s32 %r341, %r338, %r340;
- mad.lo.s32 %r342, %r339, %r26, %r341;
- div.s32 %r343, %r342, %r76;
- mad.lo.s32 %r344, %r336, %r23, %r332;
- add.s32 %r345, %r344, %r343;
- mul.wide.s32 %rd41, %r345, 4;
+ add.s32 %r333, %r321, 1;
+ div.s32 %r334, %r333, %r24;
+ mul.lo.s32 %r335, %r334, %r24;
+ sub.s32 %r336, %r333, %r335;
+ div.s32 %r337, %r336, %r25;
+ mul.lo.s32 %r338, %r337, %r25;
+ sub.s32 %r339, %r336, %r338;
+ mad.lo.s32 %r340, %r337, %r26, %r339;
+ div.s32 %r341, %r340, %r75;
+ mad.lo.s32 %r342, %r334, %r23, %r330;
+ add.s32 %r343, %r342, %r341;
+ mul.wide.s32 %rd41, %r343, 4;
add.s64 %rd42, %rd2, %rd41;
ld.global.f32 %f62, [%rd42];
- add.s32 %r346, %r323, 2;
- div.s32 %r347, %r346, %r24;
- mul.lo.s32 %r348, %r347, %r24;
- sub.s32 %r349, %r346, %r348;
- div.s32 %r350, %r349, %r25;
- mul.lo.s32 %r351, %r350, %r25;
- sub.s32 %r352, %r349, %r351;
- mad.lo.s32 %r353, %r350, %r26, %r352;
- div.s32 %r354, %r353, %r76;
- mad.lo.s32 %r355, %r347, %r23, %r332;
- add.s32 %r356, %r355, %r354;
- mul.wide.s32 %rd43, %r356, 4;
+ add.s32 %r344, %r321, 2;
+ div.s32 %r345, %r344, %r24;
+ mul.lo.s32 %r346, %r345, %r24;
+ sub.s32 %r347, %r344, %r346;
+ div.s32 %r348, %r347, %r25;
+ mul.lo.s32 %r349, %r348, %r25;
+ sub.s32 %r350, %r347, %r349;
+ mad.lo.s32 %r351, %r348, %r26, %r350;
+ div.s32 %r352, %r351, %r75;
+ mad.lo.s32 %r353, %r345, %r23, %r330;
+ add.s32 %r354, %r353, %r352;
+ mul.wide.s32 %rd43, %r354, 4;
add.s64 %rd44, %rd2, %rd43;
ld.global.f32 %f63, [%rd44];
- add.s32 %r357, %r323, 3;
- div.s32 %r358, %r357, %r24;
- mul.lo.s32 %r359, %r358, %r24;
- sub.s32 %r360, %r357, %r359;
- div.s32 %r361, %r360, %r25;
- mul.lo.s32 %r362, %r361, %r25;
- sub.s32 %r363, %r360, %r362;
- mad.lo.s32 %r364, %r361, %r26, %r363;
- div.s32 %r365, %r364, %r76;
- mad.lo.s32 %r366, %r358, %r23, %r332;
- add.s32 %r367, %r366, %r365;
- mul.wide.s32 %rd45, %r367, 4;
+ add.s32 %r355, %r321, 3;
+ div.s32 %r356, %r355, %r24;
+ mul.lo.s32 %r357, %r356, %r24;
+ sub.s32 %r358, %r355, %r357;
+ div.s32 %r359, %r358, %r25;
+ mul.lo.s32 %r360, %r359, %r25;
+ sub.s32 %r361, %r358, %r360;
+ mad.lo.s32 %r362, %r359, %r26, %r361;
+ div.s32 %r363, %r362, %r75;
+ mad.lo.s32 %r364, %r356, %r23, %r330;
+ add.s32 %r365, %r364, %r363;
+ mul.wide.s32 %rd45, %r365, 4;
add.s64 %rd46, %rd2, %rd45;
ld.global.f32 %f64, [%rd46];
- mul.lo.s32 %r368, %r78, %r76;
- mul.lo.s32 %r369, %r15, %r79;
- mad.lo.s32 %r370, %r368, %r369, %r264;
- mul.wide.s32 %rd47, %r370, 4;
+ mul.lo.s32 %r366, %r77, %r75;
+ mul.lo.s32 %r367, %r15, %r78;
+ mad.lo.s32 %r368, %r366, %r367, %r262;
+ mul.wide.s32 %rd47, %r368, 4;
add.s64 %rd29, %rd3, %rd47;
- ld.global.cs.v4.u32 {%r255,%r256,%r257,%r258}, [%rd29];
-
- mov.b32 %f65, %r255;
+ ld.global.cs.v4.u32 {%r253,%r254,%r255,%r256}, [%rd29];
+
+ mov.b32 %f65, %r253;
fma.rn.f32 %f66, %f61, %f65, %f57;
- mov.b32 %r259, %f66;
- mov.b32 %f67, %r256;
+ mov.b32 %r257, %f66;
+ mov.b32 %f67, %r254;
fma.rn.f32 %f68, %f62, %f67, %f58;
- mov.b32 %r260, %f68;
- mov.b32 %f69, %r257;
+ mov.b32 %r258, %f68;
+ mov.b32 %f69, %r255;
fma.rn.f32 %f70, %f63, %f69, %f59;
- mov.b32 %r261, %f70;
- mov.b32 %f71, %r258;
+ mov.b32 %r259, %f70;
+ mov.b32 %f71, %r256;
fma.rn.f32 %f72, %f64, %f71, %f60;
- mov.b32 %r262, %f72;
- mad.lo.s32 %r371, %r17, %r15, %r264;
- mul.wide.s32 %rd48, %r371, 4;
+ mov.b32 %r260, %f72;
+ mad.lo.s32 %r369, %r17, %r15, %r262;
+ mul.wide.s32 %rd48, %r369, 4;
add.s64 %rd30, %rd6, %rd48;
- st.global.cs.v4.s32 [%rd30], {%r259,%r260,%r261,%r262};
+ st.global.cs.v4.s32 [%rd30], {%r257,%r258,%r259,%r260};
bra.uni $L__BB0_38;
$L__BB0_6:
- setp.ge.s32 %p8, %r15, %r66;
- mul.lo.s32 %r27, %r70, 12;
- or.b32 %r120, %r27, 3;
- shr.s32 %r121, %r120, 31;
- shr.u32 %r122, %r121, 30;
- add.s32 %r123, %r120, %r122;
- shr.s32 %r124, %r123, 2;
+ setp.ge.s32 %p6, %r15, %r65;
+ mul.lo.s32 %r27, %r69, 12;
+ or.b32 %r118, %r27, 3;
+ shr.s32 %r119, %r118, 31;
+ shr.u32 %r120, %r119, 30;
+ add.s32 %r121, %r118, %r120;
+ shr.s32 %r122, %r121, 2;
mov.u32 %r29, %ntid.x;
- add.s32 %r125, %r29, %r124;
- add.s32 %r28, %r125, -1;
- shl.b32 %r126, %r4, 2;
- shl.b32 %r127, %r29, 2;
+ add.s32 %r123, %r29, %r122;
+ add.s32 %r28, %r123, -1;
+ shl.b32 %r124, %r4, 2;
+ shl.b32 %r125, %r29, 2;
mov.u32 %r30, %ctaid.y;
- mad.lo.s32 %r128, %r127, %r30, %r126;
- or.b32 %r31, %r128, 3;
- shl.b32 %r129, %r12, 1;
- add.s32 %r130, %r31, %r129;
- add.s32 %r32, %r130, -3;
- mul.lo.s32 %r33, %r70, 6;
- mul.lo.s32 %r34, %r70, 3;
- mul.lo.s32 %r35, %r15, %r71;
+ mad.lo.s32 %r126, %r125, %r30, %r124;
+ or.b32 %r31, %r126, 3;
+ shl.b32 %r127, %r12, 1;
+ add.s32 %r128, %r31, %r127;
+ add.s32 %r32, %r128, -3;
+ mul.lo.s32 %r33, %r69, 6;
+ mul.lo.s32 %r34, %r69, 3;
+ mul.lo.s32 %r35, %r15, %r70;
mov.f32 %f86, 0f00000000;
mov.f32 %f85, %f86;
- @%p8 bra $L__BB0_9;
-
- div.s32 %r131, %r28, %r29;
- setp.ge.s32 %p9, %r30, %r131;
- setp.ge.s32 %p10, %r31, %r27;
- or.pred %p11, %p9, %p10;
- @%p11 bra $L__BB0_9;
-
- div.s32 %r132, %r32, %r33;
- mul.lo.s32 %r133, %r132, %r33;
- sub.s32 %r134, %r32, %r133;
- div.s32 %r135, %r134, %r34;
- mul.lo.s32 %r136, %r135, %r34;
- sub.s32 %r137, %r134, %r136;
- div.s32 %r138, %r137, %r70;
- mul.lo.s32 %r139, %r138, %r70;
- sub.s32 %r140, %r137, %r139;
- mad.lo.s32 %r141, %r132, %r72, %r35;
- mad.lo.s32 %r142, %r135, %r73, %r141;
- mad.lo.s32 %r143, %r138, %r74, %r142;
- mad.lo.s32 %r144, %r140, %r75, %r143;
- mul.wide.s32 %rd9, %r144, 4;
+ @%p6 bra $L__BB0_9;
+
+ div.s32 %r129, %r28, %r29;
+ setp.ge.s32 %p7, %r30, %r129;
+ setp.ge.s32 %p8, %r31, %r27;
+ or.pred %p9, %p7, %p8;
+ @%p9 bra $L__BB0_9;
+
+ div.s32 %r130, %r32, %r33;
+ mul.lo.s32 %r131, %r130, %r33;
+ sub.s32 %r132, %r32, %r131;
+ div.s32 %r133, %r132, %r34;
+ mul.lo.s32 %r134, %r133, %r34;
+ sub.s32 %r135, %r132, %r134;
+ div.s32 %r136, %r135, %r69;
+ mul.lo.s32 %r137, %r136, %r69;
+ sub.s32 %r138, %r135, %r137;
+ mad.lo.s32 %r139, %r130, %r71, %r35;
+ mad.lo.s32 %r140, %r133, %r72, %r139;
+ mad.lo.s32 %r141, %r136, %r73, %r140;
+ mad.lo.s32 %r142, %r138, %r74, %r141;
+ mul.wide.s32 %rd9, %r142, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f85, [%rd10];
$L__BB0_9:
- @%p8 bra $L__BB0_12;
-
- div.s32 %r145, %r28, %r29;
- setp.ge.s32 %p13, %r30, %r145;
- setp.ge.s32 %p14, %r31, %r27;
- or.pred %p15, %p13, %p14;
- @%p15 bra $L__BB0_12;
-
- add.s32 %r146, %r32, 1;
- div.s32 %r147, %r146, %r33;
- mul.lo.s32 %r148, %r147, %r33;
- sub.s32 %r149, %r146, %r148;
- div.s32 %r150, %r149, %r34;
- mul.lo.s32 %r151, %r150, %r34;
- sub.s32 %r152, %r149, %r151;
- div.s32 %r153, %r152, %r70;
- mul.lo.s32 %r154, %r153, %r70;
- sub.s32 %r155, %r152, %r154;
- mad.lo.s32 %r156, %r147, %r72, %r35;
- mad.lo.s32 %r157, %r150, %r73, %r156;
- mad.lo.s32 %r158, %r153, %r74, %r157;
- mad.lo.s32 %r159, %r155, %r75, %r158;
- mul.wide.s32 %rd11, %r159, 4;
+ @%p6 bra $L__BB0_12;
+
+ div.s32 %r143, %r28, %r29;
+ setp.ge.s32 %p11, %r30, %r143;
+ setp.ge.s32 %p12, %r31, %r27;
+ or.pred %p13, %p11, %p12;
+ @%p13 bra $L__BB0_12;
+
+ add.s32 %r144, %r32, 1;
+ div.s32 %r145, %r144, %r33;
+ mul.lo.s32 %r146, %r145, %r33;
+ sub.s32 %r147, %r144, %r146;
+ div.s32 %r148, %r147, %r34;
+ mul.lo.s32 %r149, %r148, %r34;
+ sub.s32 %r150, %r147, %r149;
+ div.s32 %r151, %r150, %r69;
+ mul.lo.s32 %r152, %r151, %r69;
+ sub.s32 %r153, %r150, %r152;
+ mad.lo.s32 %r154, %r145, %r71, %r35;
+ mad.lo.s32 %r155, %r148, %r72, %r154;
+ mad.lo.s32 %r156, %r151, %r73, %r155;
+ mad.lo.s32 %r157, %r153, %r74, %r156;
+ mul.wide.s32 %rd11, %r157, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f86, [%rd12];
$L__BB0_12:
mov.f32 %f88, 0f00000000;
mov.f32 %f87, %f88;
- @%p8 bra $L__BB0_15;
-
- div.s32 %r160, %r28, %r29;
- setp.ge.s32 %p17, %r30, %r160;
- setp.ge.s32 %p18, %r31, %r27;
- or.pred %p19, %p17, %p18;
- @%p19 bra $L__BB0_15;
-
- add.s32 %r161, %r32, 2;
- div.s32 %r162, %r161, %r33;
- mul.lo.s32 %r163, %r162, %r33;
- sub.s32 %r164, %r161, %r163;
- div.s32 %r165, %r164, %r34;
- mul.lo.s32 %r166, %r165, %r34;
- sub.s32 %r167, %r164, %r166;
- div.s32 %r168, %r167, %r70;
- mul.lo.s32 %r169, %r168, %r70;
- sub.s32 %r170, %r167, %r169;
- mad.lo.s32 %r171, %r162, %r72, %r35;
- mad.lo.s32 %r172, %r165, %r73, %r171;
- mad.lo.s32 %r173, %r168, %r74, %r172;
- mad.lo.s32 %r174, %r170, %r75, %r173;
- mul.wide.s32 %rd13, %r174, 4;
+ @%p6 bra $L__BB0_15;
+
+ div.s32 %r158, %r28, %r29;
+ setp.ge.s32 %p15, %r30, %r158;
+ setp.ge.s32 %p16, %r31, %r27;
+ or.pred %p17, %p15, %p16;
+ @%p17 bra $L__BB0_15;
+
+ add.s32 %r159, %r32, 2;
+ div.s32 %r160, %r159, %r33;
+ mul.lo.s32 %r161, %r160, %r33;
+ sub.s32 %r162, %r159, %r161;
+ div.s32 %r163, %r162, %r34;
+ mul.lo.s32 %r164, %r163, %r34;
+ sub.s32 %r165, %r162, %r164;
+ div.s32 %r166, %r165, %r69;
+ mul.lo.s32 %r167, %r166, %r69;
+ sub.s32 %r168, %r165, %r167;
+ mad.lo.s32 %r169, %r160, %r71, %r35;
+ mad.lo.s32 %r170, %r163, %r72, %r169;
+ mad.lo.s32 %r171, %r166, %r73, %r170;
+ mad.lo.s32 %r172, %r168, %r74, %r171;
+ mul.wide.s32 %rd13, %r172, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f87, [%rd14];
$L__BB0_15:
- @%p8 bra $L__BB0_18;
-
- div.s32 %r175, %r28, %r29;
- setp.ge.s32 %p21, %r30, %r175;
- setp.ge.s32 %p22, %r31, %r27;
- or.pred %p23, %p21, %p22;
- @%p23 bra $L__BB0_18;
-
- add.s32 %r176, %r32, 3;
- div.s32 %r177, %r176, %r33;
- mul.lo.s32 %r178, %r177, %r33;
- sub.s32 %r179, %r176, %r178;
- div.s32 %r180, %r179, %r34;
- mul.lo.s32 %r181, %r180, %r34;
- sub.s32 %r182, %r179, %r181;
- div.s32 %r183, %r182, %r70;
- mul.lo.s32 %r184, %r183, %r70;
- sub.s32 %r185, %r182, %r184;
- mad.lo.s32 %r186, %r177, %r72, %r35;
- mad.lo.s32 %r187, %r180, %r73, %r186;
- mad.lo.s32 %r188, %r183, %r74, %r187;
- mad.lo.s32 %r189, %r185, %r75, %r188;
- mul.wide.s32 %rd15, %r189, 4;
+ @%p6 bra $L__BB0_18;
+
+ div.s32 %r173, %r28, %r29;
+ setp.ge.s32 %p19, %r30, %r173;
+ setp.ge.s32 %p20, %r31, %r27;
+ or.pred %p21, %p19, %p20;
+ @%p21 bra $L__BB0_18;
+
+ add.s32 %r174, %r32, 3;
+ div.s32 %r175, %r174, %r33;
+ mul.lo.s32 %r176, %r175, %r33;
+ sub.s32 %r177, %r174, %r176;
+ div.s32 %r178, %r177, %r34;
+ mul.lo.s32 %r179, %r178, %r34;
+ sub.s32 %r180, %r177, %r179;
+ div.s32 %r181, %r180, %r69;
+ mul.lo.s32 %r182, %r181, %r69;
+ sub.s32 %r183, %r180, %r182;
+ mad.lo.s32 %r184, %r175, %r71, %r35;
+ mad.lo.s32 %r185, %r178, %r72, %r184;
+ mad.lo.s32 %r186, %r181, %r73, %r185;
+ mad.lo.s32 %r187, %r183, %r74, %r186;
+ mul.wide.s32 %rd15, %r187, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f88, [%rd16];
$L__BB0_18:
- mul.lo.s32 %r36, %r78, %r79;
- add.s32 %r190, %r36, 1;
- shr.u32 %r191, %r190, 31;
- add.s32 %r192, %r190, %r191;
- shr.s32 %r37, %r192, 1;
- mad.lo.s32 %r193, %r30, %r29, %r4;
- shl.b32 %r194, %r193, 2;
- shl.b32 %r195, %r12, 3;
- add.s32 %r38, %r195, %r194;
- mul.lo.s32 %r39, %r37, %r76;
- add.s32 %r196, %r39, 1;
- shr.u32 %r197, %r196, 31;
- add.s32 %r198, %r196, %r197;
- shr.s32 %r40, %r198, 1;
- mul.lo.s32 %r41, %r36, %r15;
+ mul.lo.s32 %r36, %r77, %r78;
+ add.s32 %r188, %r36, 1;
+ shr.u32 %r189, %r188, 31;
+ add.s32 %r190, %r188, %r189;
+ shr.s32 %r37, %r190, 1;
+ mad.lo.s32 %r191, %r30, %r29, %r4;
+ shl.b32 %r192, %r191, 2;
+ shl.b32 %r193, %r12, 3;
+ add.s32 %r38, %r193, %r192;
+ mad.lo.s32 %r194, %r37, %r75, 1;
+ shr.u32 %r195, %r194, 31;
+ add.s32 %r196, %r194, %r195;
+ shr.s32 %r39, %r196, 1;
+ mul.lo.s32 %r40, %r36, %r15;
mov.f32 %f90, 0f00000000;
mov.f32 %f89, %f90;
- @%p8 bra $L__BB0_21;
-
- div.s32 %r199, %r38, %r33;
- mul.lo.s32 %r200, %r199, %r33;
- sub.s32 %r201, %r38, %r200;
- div.s32 %r202, %r201, %r34;
- mul.lo.s32 %r203, %r202, %r34;
- sub.s32 %r204, %r201, %r203;
- mad.lo.s32 %r205, %r202, %r40, %r204;
- div.s32 %r206, %r205, %r76;
- mad.lo.s32 %r42, %r199, %r37, %r206;
- setp.ge.s32 %p25, %r42, %r36;
- setp.ge.s32 %p26, %r205, %r39;
- or.pred %p27, %p25, %p26;
- @%p27 bra $L__BB0_21;
-
- add.s32 %r207, %r42, %r41;
- mul.wide.s32 %rd17, %r207, 4;
+ @%p6 bra $L__BB0_21;
+
+ div.s32 %r197, %r38, %r33;
+ mul.lo.s32 %r198, %r197, %r33;
+ sub.s32 %r199, %r38, %r198;
+ div.s32 %r200, %r199, %r34;
+ mul.lo.s32 %r201, %r200, %r34;
+ sub.s32 %r202, %r199, %r201;
+ mad.lo.s32 %r203, %r200, %r39, %r202;
+ div.s32 %r204, %r203, %r75;
+ mad.lo.s32 %r41, %r197, %r37, %r204;
+ setp.ge.s32 %p23, %r41, %r36;
+ @%p23 bra $L__BB0_21;
+
+ add.s32 %r205, %r41, %r40;
+ mul.wide.s32 %rd17, %r205, 4;
add.s64 %rd18, %rd2, %rd17;
ld.global.f32 %f89, [%rd18];
$L__BB0_21:
- @%p8 bra $L__BB0_24;
-
- add.s32 %r208, %r38, 1;
- div.s32 %r209, %r208, %r33;
- mul.lo.s32 %r210, %r209, %r33;
- sub.s32 %r211, %r208, %r210;
- div.s32 %r212, %r211, %r34;
- mul.lo.s32 %r213, %r212, %r34;
- sub.s32 %r214, %r211, %r213;
- mad.lo.s32 %r215, %r212, %r40, %r214;
- div.s32 %r216, %r215, %r76;
- mad.lo.s32 %r43, %r209, %r37, %r216;
- setp.ge.s32 %p29, %r43, %r36;
- setp.ge.s32 %p30, %r215, %r39;
- or.pred %p31, %p29, %p30;
- @%p31 bra $L__BB0_24;
-
- add.s32 %r217, %r43, %r41;
- mul.wide.s32 %rd19, %r217, 4;
+ @%p6 bra $L__BB0_24;
+
+ add.s32 %r206, %r38, 1;
+ div.s32 %r207, %r206, %r33;
+ mul.lo.s32 %r208, %r207, %r33;
+ sub.s32 %r209, %r206, %r208;
+ div.s32 %r210, %r209, %r34;
+ mul.lo.s32 %r211, %r210, %r34;
+ sub.s32 %r212, %r209, %r211;
+ mad.lo.s32 %r213, %r210, %r39, %r212;
+ div.s32 %r214, %r213, %r75;
+ mad.lo.s32 %r42, %r207, %r37, %r214;
+ setp.ge.s32 %p25, %r42, %r36;
+ @%p25 bra $L__BB0_24;
+
+ add.s32 %r215, %r42, %r40;
+ mul.wide.s32 %rd19, %r215, 4;
add.s64 %rd20, %rd2, %rd19;
ld.global.f32 %f90, [%rd20];
$L__BB0_24:
mov.f32 %f92, 0f00000000;
mov.f32 %f91, %f92;
- @%p8 bra $L__BB0_27;
-
- add.s32 %r218, %r38, 2;
- div.s32 %r219, %r218, %r33;
- mul.lo.s32 %r220, %r219, %r33;
- sub.s32 %r221, %r218, %r220;
- div.s32 %r222, %r221, %r34;
- mul.lo.s32 %r223, %r222, %r34;
- sub.s32 %r224, %r221, %r223;
- mad.lo.s32 %r225, %r222, %r40, %r224;
- div.s32 %r226, %r225, %r76;
- mad.lo.s32 %r44, %r219, %r37, %r226;
- setp.ge.s32 %p33, %r44, %r36;
- setp.ge.s32 %p34, %r225, %r39;
- or.pred %p35, %p33, %p34;
- @%p35 bra $L__BB0_27;
-
- add.s32 %r227, %r44, %r41;
- mul.wide.s32 %rd21, %r227, 4;
+ @%p6 bra $L__BB0_27;
+
+ add.s32 %r216, %r38, 2;
+ div.s32 %r217, %r216, %r33;
+ mul.lo.s32 %r218, %r217, %r33;
+ sub.s32 %r219, %r216, %r218;
+ div.s32 %r220, %r219, %r34;
+ mul.lo.s32 %r221, %r220, %r34;
+ sub.s32 %r222, %r219, %r221;
+ mad.lo.s32 %r223, %r220, %r39, %r222;
+ div.s32 %r224, %r223, %r75;
+ mad.lo.s32 %r43, %r217, %r37, %r224;
+ setp.ge.s32 %p27, %r43, %r36;
+ @%p27 bra $L__BB0_27;
+
+ add.s32 %r225, %r43, %r40;
+ mul.wide.s32 %rd21, %r225, 4;
add.s64 %rd22, %rd2, %rd21;
ld.global.f32 %f91, [%rd22];
$L__BB0_27:
- @%p8 bra $L__BB0_30;
-
- add.s32 %r228, %r38, 3;
- div.s32 %r229, %r228, %r33;
- mul.lo.s32 %r230, %r229, %r33;
- sub.s32 %r231, %r228, %r230;
- div.s32 %r232, %r231, %r34;
- mul.lo.s32 %r233, %r232, %r34;
- sub.s32 %r234, %r231, %r233;
- mad.lo.s32 %r235, %r232, %r40, %r234;
- div.s32 %r236, %r235, %r76;
- mad.lo.s32 %r45, %r229, %r37, %r236;
- setp.ge.s32 %p37, %r45, %r36;
- setp.ge.s32 %p38, %r235, %r39;
- or.pred %p39, %p37, %p38;
- @%p39 bra $L__BB0_30;
-
- add.s32 %r237, %r45, %r41;
- mul.wide.s32 %rd23, %r237, 4;
+ @%p6 bra $L__BB0_30;
+
+ add.s32 %r226, %r38, 3;
+ div.s32 %r227, %r226, %r33;
+ mul.lo.s32 %r228, %r227, %r33;
+ sub.s32 %r229, %r226, %r228;
+ div.s32 %r230, %r229, %r34;
+ mul.lo.s32 %r231, %r230, %r34;
+ sub.s32 %r232, %r229, %r231;
+ mad.lo.s32 %r233, %r230, %r39, %r232;
+ div.s32 %r234, %r233, %r75;
+ mad.lo.s32 %r44, %r227, %r37, %r234;
+ setp.ge.s32 %p29, %r44, %r36;
+ @%p29 bra $L__BB0_30;
+
+ add.s32 %r235, %r44, %r40;
+ mul.wide.s32 %rd23, %r235, 4;
add.s64 %rd24, %rd2, %rd23;
ld.global.f32 %f92, [%rd24];
$L__BB0_30:
mov.f32 %f93, 0f00000000;
mov.f32 %f94, 0f00000000;
mov.f32 %f95, 0f00000000;
mov.f32 %f96, 0f00000000;
- @%p8 bra $L__BB0_34;
-
- div.s32 %r238, %r28, %r29;
- setp.ge.s32 %p41, %r30, %r238;
- @%p41 bra $L__BB0_34;
-
- setp.ge.s32 %p42, %r31, %r27;
- @%p42 bra $L__BB0_34;
-
- mul.lo.s32 %r243, %r15, %r79;
- mul.lo.s32 %r244, %r78, %r76;
- mad.lo.s32 %r245, %r244, %r243, %r31;
- add.s32 %r246, %r245, -3;
- mul.wide.s32 %rd26, %r246, 4;
+ @%p6 bra $L__BB0_34;
+
+ div.s32 %r236, %r28, %r29;
+ setp.ge.s32 %p31, %r30, %r236;
+ @%p31 bra $L__BB0_34;
+
+ setp.ge.s32 %p32, %r31, %r27;
+ @%p32 bra $L__BB0_34;
+
+ mul.lo.s32 %r241, %r15, %r78;
+ mul.lo.s32 %r242, %r77, %r75;
+ mad.lo.s32 %r243, %r242, %r241, %r31;
+ add.s32 %r244, %r243, -3;
+ mul.wide.s32 %rd26, %r244, 4;
add.s64 %rd25, %rd3, %rd26;
- ld.global.cs.v4.u32 {%r239,%r240,%r241,%r242}, [%rd25];
-
- mov.b32 %f96, %r239;
- mov.b32 %f95, %r240;
- mov.b32 %f94, %r241;
- mov.b32 %f93, %r242;
+ ld.global.cs.v4.u32 {%r237,%r238,%r239,%r240}, [%rd25];
+
+ mov.b32 %f96, %r237;
+ mov.b32 %f95, %r238;
+ mov.b32 %f94, %r239;
+ mov.b32 %f93, %r240;
$L__BB0_34:
fma.rn.f32 %f25, %f96, %f89, %f85;
fma.rn.f32 %f26, %f95, %f90, %f86;
fma.rn.f32 %f27, %f94, %f91, %f87;
fma.rn.f32 %f28, %f93, %f92, %f88;
- @%p8 bra $L__BB0_38;
-
- div.s32 %r247, %r28, %r29;
- setp.ge.s32 %p44, %r30, %r247;
- setp.ge.s32 %p45, %r31, %r27;
- or.pred %p46, %p44, %p45;
- @%p46 bra $L__BB0_38;
-
- mad.lo.s32 %r252, %r27, %r14, %r31;
- add.s32 %r253, %r252, -3;
- mad.lo.s32 %r254, %r27, %r13, %r253;
- mul.wide.s32 %rd28, %r254, 4;
+ @%p6 bra $L__BB0_38;
+
+ div.s32 %r245, %r28, %r29;
+ setp.ge.s32 %p34, %r30, %r245;
+ setp.ge.s32 %p35, %r31, %r27;
+ or.pred %p36, %p34, %p35;
+ @%p36 bra $L__BB0_38;
+
+ mad.lo.s32 %r250, %r27, %r14, %r31;
+ add.s32 %r251, %r250, -3;
+ mad.lo.s32 %r252, %r27, %r13, %r251;
+ mul.wide.s32 %rd28, %r252, 4;
add.s64 %rd27, %rd6, %rd28;
- mov.b32 %r248, %f25;
- mov.b32 %r249, %f26;
- mov.b32 %r250, %f27;
- mov.b32 %r251, %f28;
-
- st.global.cs.v4.s32 [%rd27], {%r248,%r249,%r250,%r251};
+ mov.b32 %r246, %f25;
+ mov.b32 %r247, %f26;
+ mov.b32 %r248, %f27;
+ mov.b32 %r249, %f28;
+
+ st.global.cs.v4.s32 [%rd27], {%r246,%r247,%r248,%r249};
$L__BB0_38:
ret;
Kernel 171
CUDA
PTX
53997da5d
Diff
03a1b695e
-10
+10 index type: int
registers: 24
gmem: 3
static smem: 4
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 3, 3> T10) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T14;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0]
= T0[(((((2 * T0.alloc_stride[3LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T14[i0]
= T0[(((((2 * T0.alloc_stride[3LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T16;
T16[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T16[0] = fmax(
T16[0],
T14[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T16[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T5 = T14;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T14[i2]
- T3[0];
T5[i2]
= expf(T4[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T17;
T17[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T17[0]
= T17[0]
+ T5[i3];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T17[0]
= T17[0]
+ T5[i3];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T17[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
broadcast::blockBroadcast<true, false, false, true>(T7[0], T6[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T8;
T8[0]
= reciprocal(T7[0]);
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
= T5[i4]
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
} else {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
= T5[i4]
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 3, 3> T10) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T14;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0]
= T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T14[i0]
= T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T16;
T16[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T16[0] = fmax(
T16[0],
T14[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T16[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T5 = T14;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T14[i2]
- T3[0];
T5[i2]
= expf(T4[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T17;
T17[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T17[0]
= T17[0]
+ T5[i3];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T17[0]
= T17[0]
+ T5[i3];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T17[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
broadcast::blockBroadcast<true, false, false, true>(T7[0], T6[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T8;
T8[0]
= reciprocal(T7[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
= T5[i4]
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
} else {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
= T5[i4]
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -6,23 +6,23 @@
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0]
- = T0[(((((2 * T0.alloc_stride[3LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T14[i0]
- = T0[(((((2 * T0.alloc_stride[3LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T16;
@@ -51,22 +51,22 @@
= expf(T4[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T17;
T17[0] = 0.000000000e+00f;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T17[0]
= T17[0]
+ T5[i3];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T17[0]
= T17[0]
+ T5[i3];
}
}
@@ -78,11 +78,11 @@
Array<float, 1, 1> T7;
broadcast::blockBroadcast<true, false, false, true>(T7[0], T6[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T8;
T8[0]
= reciprocal(T7[0]);
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
@@ -90,11 +90,11 @@
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
} else {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
@@ -103,10 +103,10 @@
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
}
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1[32]
)
{
.reg .pred %p<47>;
.reg .f32 %f<81>;
.reg .b32 %r<100>;
.reg .b64 %rd<35>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r39, %r40}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+24];
ld.param.v2.u32 {%r41, %r42}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+32];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0];
cvta.to.global.u64 %rd1, %rd7;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p3, %r1, 0;
@%p3 bra $L__BB0_2;
mov.u32 %r49, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r49;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd9, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r50, [%rd9], %r1;
ld.shared.u32 %r51, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_1911011nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s];
shl.b32 %r4, %r51, 1;
setp.lt.s32 %p4, %r1, 1;
@%p4 bra $L__BB0_3;
bra.uni $L__BB0_4;
$L__BB0_3:
mov.u32 %r52, %tid.y;
mov.u32 %r53, %ctaid.x;
mov.u32 %r54, %ntid.y;
mad.lo.s32 %r5, %r54, %r53, %r52;
setp.lt.s32 %p5, %r5, 27454;
@%p5 bra $L__BB0_8;
bra.uni $L__BB0_4;
$L__BB0_8:
shl.b32 %r63, %r1, 1;
add.s32 %r64, %r4, %r63;
mul.lo.s32 %r65, %r5, %r40;
mad.lo.s32 %r66, %r64, %r42, %r65;
mul.wide.s32 %rd14, %r66, 4;
add.s64 %rd15, %rd1, %rd14;
ld.global.f32 %f76, [%rd15];
add.s32 %r67, %r66, %r42;
mul.wide.s32 %rd16, %r67, 4;
add.s64 %rd17, %rd1, %rd16;
ld.global.f32 %f77, [%rd17];
bra.uni $L__BB0_9;
$L__BB0_4:
mov.u32 %r55, %ntid.y;
mov.u32 %r56, %ctaid.x;
mov.u32 %r57, %tid.y;
mad.lo.s32 %r58, %r55, %r56, %r57;
setp.lt.s32 %p7, %r58, 27454;
shl.b32 %r59, %r1, 1;
add.s32 %r6, %r4, %r59;
mul.lo.s32 %r7, %r58, %r40;
and.pred %p1, %p4, %p7;
mov.f32 %f77, 0fFF800000;
not.pred %p8, %p1;
mov.f32 %f76, %f77;
@%p8 bra $L__BB0_6;
mad.lo.s32 %r60, %r6, %r42, %r7;
mul.wide.s32 %rd10, %r60, 4;
add.s64 %rd11, %rd1, %rd10;
ld.global.f32 %f76, [%rd11];
$L__BB0_6:
@%p8 bra $L__BB0_9;
add.s32 %r61, %r6, 1;
mad.lo.s32 %r62, %r61, %r42, %r7;
mul.wide.s32 %rd12, %r62, 4;
add.s64 %rd13, %rd1, %rd12;
ld.global.f32 %f77, [%rd13];
$L__BB0_9:
setp.gt.f32 %p10, %f76, %f77;
setp.nan.f32 %p11, %f76, %f76;
or.pred %p12, %p11, %p10;
selp.f32 %f24, %f76, %f77, %p12;
mov.u32 %r68, %tid.z;
mov.u32 %r8, %ntid.y;
mov.u32 %r9, %tid.y;
mad.lo.s32 %r10, %r8, %r68, %r9;
mov.u32 %r11, %ntid.x;
mad.lo.s32 %r12, %r10, %r11, %r1;
mul.wide.u32 %rd18, %r12, 4;
mov.u64 %rd19, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_102d4ebe_191105arrayE;
add.s64 %rd2, %rd19, %rd18;
st.shared.f32 [%rd2], %f24;
bar.sync 0;
clz.b32 %r69, %r11;
mov.u32 %r70, 31;
sub.s32 %r71, %r70, %r69;
mov.u32 %r72, 1;
shl.b32 %r13, %r72, %r71;
setp.lt.u32 %p13, %r1, %r13;
add.s32 %r73, %r13, %r1;
setp.lt.u32 %p14, %r73, %r11;
and.pred %p2, %p13, %p14;
add.s32 %r74, %r12, %r13;
mul.wide.s32 %rd20, %r74, 4;
add.s64 %rd3, %rd19, %rd20;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
ld.shared.f32 %f25, [%rd3];
ld.shared.f32 %f26, [%rd2];
setp.nan.f32 %p16, %f26, %f26;
setp.gt.f32 %p17, %f26, %f25;
or.pred %p18, %p16, %p17;
selp.f32 %f27, %f26, %f25, %p18;
st.shared.f32 [%rd2], %f27;
$L__BB0_11:
bar.sync 0;
shr.u32 %r75, %r13, 31;
add.s32 %r76, %r13, %r75;
shr.s32 %r99, %r76, 1;
setp.lt.s32 %p19, %r13, 4;
@%p19 bra $L__BB0_16;
mov.u32 %r98, %r99;
$L__BB0_13:
setp.ge.u32 %p20, %r1, %r98;
@%p20 bra $L__BB0_15;
add.s32 %r77, %r98, %r12;
mul.wide.s32 %rd21, %r77, 4;
add.s64 %rd23, %rd19, %rd21;
ld.shared.f32 %f28, [%rd2];
setp.nan.f32 %p21, %f28, %f28;
ld.shared.f32 %f29, [%rd23];
setp.gt.f32 %p22, %f28, %f29;
or.pred %p23, %p21, %p22;
selp.f32 %f30, %f28, %f29, %p23;
st.shared.f32 [%rd2], %f30;
$L__BB0_15:
bar.sync 0;
shr.u32 %r16, %r98, 1;
setp.gt.u32 %p24, %r98, 3;
mov.u32 %r98, %r16;
@%p24 bra $L__BB0_13;
$L__BB0_16:
add.s32 %r78, %r12, 1;
mul.wide.u32 %rd24, %r78, 4;
add.s64 %rd4, %rd19, %rd24;
mov.f32 %f78, 0fFF800000;
@%p3 bra $L__BB0_19;
ld.shared.f32 %f78, [%rd2];
setp.lt.u32 %p26, %r11, 2;
@%p26 bra $L__BB0_19;
ld.shared.f32 %f32, [%rd4];
setp.gt.f32 %p27, %f78, %f32;
setp.nan.f32 %p28, %f78, %f78;
or.pred %p29, %p28, %p27;
selp.f32 %f78, %f78, %f32, %p29;
$L__BB0_19:
bar.sync 0;
mul.wide.s32 %rd26, %r10, 4;
add.s64 %rd5, %rd19, %rd26;
setp.eq.s32 %p30, %r1, 0;
@%p30 bra $L__BB0_20;
bra.uni $L__BB0_21;
$L__BB0_20:
st.shared.f32 [%rd5], %f78;
$L__BB0_21:
bar.sync 0;
ld.shared.f32 %f33, [%rd5];
bar.sync 0;
sub.f32 %f34, %f76, %f33;
mov.f32 %f35, 0f3F000000;
mov.f32 %f36, 0f3BBB989D;
fma.rn.f32 %f37, %f34, %f36, %f35;
cvt.sat.f32.f32 %f38, %f37;
mov.f32 %f39, 0f4B400001;
mov.f32 %f40, 0f437C0000;
fma.rm.f32 %f41, %f38, %f40, %f39;
add.f32 %f42, %f41, 0fCB40007F;
neg.f32 %f43, %f42;
mov.f32 %f44, 0f3FB8AA3B;
fma.rn.f32 %f45, %f34, %f44, %f43;
mov.f32 %f46, 0f32A57060;
fma.rn.f32 %f47, %f34, %f46, %f45;
mov.b32 %r79, %f41;
shl.b32 %r80, %r79, 23;
mov.b32 %f48, %r80;
ex2.approx.ftz.f32 %f49, %f47;
mul.f32 %f11, %f49, %f48;
sub.f32 %f50, %f77, %f33;
fma.rn.f32 %f51, %f50, %f36, %f35;
cvt.sat.f32.f32 %f52, %f51;
fma.rm.f32 %f53, %f52, %f40, %f39;
add.f32 %f54, %f53, 0fCB40007F;
neg.f32 %f55, %f54;
fma.rn.f32 %f56, %f50, %f44, %f55;
fma.rn.f32 %f57, %f50, %f46, %f56;
mov.b32 %r81, %f53;
shl.b32 %r82, %r81, 23;
mov.b32 %f58, %r82;
ex2.approx.ftz.f32 %f59, %f57;
mul.f32 %f12, %f59, %f58;
@%p4 bra $L__BB0_22;
bra.uni $L__BB0_23;
$L__BB0_22:
mov.u32 %r83, %ctaid.x;
mad.lo.s32 %r84, %r8, %r83, %r9;
setp.lt.s32 %p32, %r84, 27454;
@%p32 bra $L__BB0_24;
bra.uni $L__BB0_23;
$L__BB0_24:
add.f32 %f62, %f11, 0f00000000;
add.f32 %f79, %f62, %f12;
bra.uni $L__BB0_25;
$L__BB0_23:
mov.u32 %r85, %ctaid.x;
mad.lo.s32 %r86, %r8, %r85, %r9;
setp.lt.s32 %p34, %r86, 27454;
and.pred %p35, %p4, %p34;
add.f32 %f60, %f11, 0f00000000;
add.f32 %f61, %f60, %f12;
selp.f32 %f79, %f61, 0f00000000, %p35;
$L__BB0_25:
st.shared.f32 [%rd2], %f79;
bar.sync 0;
@%p15 bra $L__BB0_27;
ld.shared.f32 %f63, [%rd3];
ld.shared.f32 %f64, [%rd2];
add.f32 %f65, %f63, %f64;
st.shared.f32 [%rd2], %f65;
$L__BB0_27:
bar.sync 0;
@%p19 bra $L__BB0_31;
$L__BB0_28:
setp.ge.u32 %p38, %r1, %r99;
@%p38 bra $L__BB0_30;
add.s32 %r87, %r99, %r12;
mul.wide.s32 %rd28, %r87, 4;
add.s64 %rd30, %rd19, %rd28;
ld.shared.f32 %f66, [%rd2];
ld.shared.f32 %f67, [%rd30];
add.f32 %f68, %f67, %f66;
st.shared.f32 [%rd2], %f68;
$L__BB0_30:
bar.sync 0;
shr.u32 %r18, %r99, 1;
setp.gt.u32 %p39, %r99, 3;
mov.u32 %r99, %r18;
@%p39 bra $L__BB0_28;
$L__BB0_31:
mov.f32 %f80, 0f00000000;
@%p3 bra $L__BB0_34;
ld.shared.f32 %f70, [%rd2];
add.f32 %f80, %f70, 0f00000000;
setp.lt.u32 %p41, %r11, 2;
@%p41 bra $L__BB0_34;
ld.shared.f32 %f71, [%rd4];
add.f32 %f80, %f80, %f71;
$L__BB0_34:
bar.sync 0;
@%p3 bra $L__BB0_36;
st.shared.f32 [%rd5], %f80;
$L__BB0_36:
setp.gt.s32 %p43, %r1, 0;
bar.sync 0;
ld.shared.f32 %f72, [%rd5];
bar.sync 0;
rcp.rn.f32 %f19, %f72;
@%p43 bra $L__BB0_38;
mov.u32 %r88, %ctaid.x;
mad.lo.s32 %r19, %r8, %r88, %r9;
setp.lt.s32 %p44, %r19, 27454;
@%p44 bra $L__BB0_41;
bra.uni $L__BB0_38;
$L__BB0_41:
mul.f32 %f73, %f19, %f11;
mov.b32 %r94, %f73;
mul.f32 %f74, %f19, %f12;
mov.b32 %r95, %f74;
add.s32 %r96, %r19, %r1;
shl.b32 %r97, %r96, 1;
mul.wide.s32 %rd34, %r97, 4;
add.s64 %rd33, %rd8, %rd34;
// begin inline asm
st.global.cs.v2.s32 [%rd33], {%r94,%r95};
// end inline asm
bra.uni $L__BB0_42;
$L__BB0_38:
mul.f32 %f20, %f19, %f11;
mul.f32 %f21, %f19, %f12;
@%p43 bra $L__BB0_42;
mov.u32 %r89, %ctaid.x;
mad.lo.s32 %r20, %r8, %r89, %r9;
setp.gt.s32 %p46, %r20, 27453;
@%p46 bra $L__BB0_42;
add.s32 %r92, %r20, %r1;
shl.b32 %r93, %r92, 1;
mul.wide.s32 %rd32, %r93, 4;
add.s64 %rd31, %rd8, %rd32;
mov.b32 %r90, %f20;
mov.b32 %r91, %f21;
// begin inline asm
st.global.cs.v2.s32 [%rd31], {%r90,%r91};
// end inline asm
$L__BB0_42:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1[32]
)
{
.reg .pred %p<34>;
.reg .f32 %f<71>;
.reg .b32 %r<71>;
.reg .b64 %rd<28>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r32, %r33}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+24];
ld.param.v2.u32 {%r34, %r35}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+32];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0];
mov.u32 %r1, %tid.x;
setp.ne.s32 %p2, %r1, 0;
@%p2 bra $L__BB0_2;
mov.u32 %r42, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r42;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd7, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r43, [%rd7], %r1;
mov.u32 %r44, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r44, %r3;
setp.gt.s32 %p3, %r4, 27453;
mov.f32 %f67, 0fFF800000;
mov.f32 %f68, %f67;
@%p3 bra $L__BB0_4;
cvta.to.global.u64 %rd8, %rd5;
ld.shared.u32 %r45, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_1601111nvfuser_176ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s];
shl.b32 %r46, %r45, 1;
mul.lo.s32 %r47, %r4, %r33;
mad.lo.s32 %r48, %r46, %r35, %r47;
mul.wide.s32 %rd9, %r48, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.f32 %f67, [%rd10];
add.s32 %r49, %r48, %r35;
mul.wide.s32 %rd11, %r49, 4;
add.s64 %rd12, %rd8, %rd11;
ld.global.f32 %f68, [%rd12];
$L__BB0_4:
setp.gt.f32 %p4, %f67, %f68;
setp.nan.f32 %p5, %f67, %f67;
or.pred %p6, %p5, %p4;
selp.f32 %f16, %f67, %f68, %p6;
mov.u32 %r50, %tid.z;
mad.lo.s32 %r5, %r2, %r50, %r3;
mov.u32 %r6, %ntid.x;
mad.lo.s32 %r7, %r5, %r6, %r1;
mul.wide.u32 %rd13, %r7, 4;
mov.u64 %rd14, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_176_cu_fa7e611e_160115arrayE;
add.s64 %rd1, %rd14, %rd13;
st.shared.f32 [%rd1], %f16;
bar.sync 0;
clz.b32 %r51, %r6;
mov.u32 %r52, 31;
sub.s32 %r53, %r52, %r51;
mov.u32 %r54, 1;
shl.b32 %r8, %r54, %r53;
setp.lt.u32 %p7, %r1, %r8;
add.s32 %r55, %r8, %r1;
setp.lt.u32 %p8, %r55, %r6;
and.pred %p1, %p7, %p8;
add.s32 %r56, %r7, %r8;
mul.wide.s32 %rd15, %r56, 4;
add.s64 %rd2, %rd14, %rd15;
not.pred %p9, %p1;
@%p9 bra $L__BB0_6;
ld.shared.f32 %f17, [%rd2];
ld.shared.f32 %f18, [%rd1];
setp.nan.f32 %p10, %f18, %f18;
setp.gt.f32 %p11, %f18, %f17;
or.pred %p12, %p10, %p11;
selp.f32 %f19, %f18, %f17, %p12;
st.shared.f32 [%rd1], %f19;
$L__BB0_6:
bar.sync 0;
shr.u32 %r57, %r8, 31;
add.s32 %r58, %r8, %r57;
shr.s32 %r70, %r58, 1;
setp.lt.s32 %p13, %r8, 4;
@%p13 bra $L__BB0_11;
mov.u32 %r69, %r70;
$L__BB0_8:
setp.ge.u32 %p14, %r1, %r69;
@%p14 bra $L__BB0_10;
add.s32 %r59, %r69, %r7;
mul.wide.s32 %rd16, %r59, 4;
add.s64 %rd18, %rd14, %rd16;
ld.shared.f32 %f20, [%rd1];
setp.nan.f32 %p15, %f20, %f20;
ld.shared.f32 %f21, [%rd18];
setp.gt.f32 %p16, %f20, %f21;
or.pred %p17, %p15, %p16;
selp.f32 %f22, %f20, %f21, %p17;
st.shared.f32 [%rd1], %f22;
$L__BB0_10:
bar.sync 0;
shr.u32 %r11, %r69, 1;
setp.gt.u32 %p18, %r69, 3;
mov.u32 %r69, %r11;
@%p18 bra $L__BB0_8;
$L__BB0_11:
add.s32 %r60, %r7, 1;
mul.wide.u32 %rd19, %r60, 4;
add.s64 %rd3, %rd14, %rd19;
mov.f32 %f69, 0fFF800000;
@%p2 bra $L__BB0_14;
ld.shared.f32 %f69, [%rd1];
setp.lt.u32 %p20, %r6, 2;
@%p20 bra $L__BB0_14;
ld.shared.f32 %f24, [%rd3];
setp.gt.f32 %p21, %f69, %f24;
setp.nan.f32 %p22, %f69, %f69;
or.pred %p23, %p22, %p21;
selp.f32 %f69, %f69, %f24, %p23;
$L__BB0_14:
bar.sync 0;
mul.wide.s32 %rd21, %r5, 4;
add.s64 %rd4, %rd14, %rd21;
setp.eq.s32 %p24, %r1, 0;
@%p24 bra $L__BB0_15;
bra.uni $L__BB0_16;
$L__BB0_15:
st.shared.f32 [%rd4], %f69;
$L__BB0_16:
setp.lt.s32 %p25, %r4, 27454;
bar.sync 0;
ld.shared.f32 %f25, [%rd4];
bar.sync 0;
sub.f32 %f26, %f67, %f25;
mov.f32 %f27, 0f3F000000;
mov.f32 %f28, 0f3BBB989D;
fma.rn.f32 %f29, %f26, %f28, %f27;
cvt.sat.f32.f32 %f30, %f29;
mov.f32 %f31, 0f4B400001;
mov.f32 %f32, 0f437C0000;
fma.rm.f32 %f33, %f30, %f32, %f31;
add.f32 %f34, %f33, 0fCB40007F;
neg.f32 %f35, %f34;
mov.f32 %f36, 0f3FB8AA3B;
fma.rn.f32 %f37, %f26, %f36, %f35;
mov.f32 %f38, 0f32A57060;
fma.rn.f32 %f39, %f26, %f38, %f37;
mov.b32 %r61, %f33;
shl.b32 %r62, %r61, 23;
mov.b32 %f40, %r62;
ex2.approx.ftz.f32 %f41, %f39;
mul.f32 %f8, %f41, %f40;
sub.f32 %f42, %f68, %f25;
fma.rn.f32 %f43, %f42, %f28, %f27;
cvt.sat.f32.f32 %f44, %f43;
fma.rm.f32 %f45, %f44, %f32, %f31;
add.f32 %f46, %f45, 0fCB40007F;
neg.f32 %f47, %f46;
fma.rn.f32 %f48, %f42, %f36, %f47;
fma.rn.f32 %f49, %f42, %f38, %f48;
mov.b32 %r63, %f45;
shl.b32 %r64, %r63, 23;
mov.b32 %f50, %r64;
ex2.approx.ftz.f32 %f51, %f49;
mul.f32 %f9, %f51, %f50;
add.f32 %f52, %f8, 0f00000000;
add.f32 %f53, %f52, %f9;
selp.f32 %f54, %f53, 0f00000000, %p25;
st.shared.f32 [%rd1], %f54;
bar.sync 0;
@%p9 bra $L__BB0_18;
ld.shared.f32 %f55, [%rd2];
ld.shared.f32 %f56, [%rd1];
add.f32 %f57, %f55, %f56;
st.shared.f32 [%rd1], %f57;
$L__BB0_18:
bar.sync 0;
@%p13 bra $L__BB0_22;
$L__BB0_19:
setp.ge.u32 %p28, %r1, %r70;
@%p28 bra $L__BB0_21;
add.s32 %r65, %r70, %r7;
mul.wide.s32 %rd23, %r65, 4;
add.s64 %rd25, %rd14, %rd23;
ld.shared.f32 %f58, [%rd1];
ld.shared.f32 %f59, [%rd25];
add.f32 %f60, %f59, %f58;
st.shared.f32 [%rd1], %f60;
$L__BB0_21:
bar.sync 0;
shr.u32 %r13, %r70, 1;
setp.gt.u32 %p29, %r70, 3;
mov.u32 %r70, %r13;
@%p29 bra $L__BB0_19;
$L__BB0_22:
mov.f32 %f70, 0f00000000;
@%p2 bra $L__BB0_25;
ld.shared.f32 %f62, [%rd1];
add.f32 %f70, %f62, 0f00000000;
setp.lt.u32 %p31, %r6, 2;
@%p31 bra $L__BB0_25;
ld.shared.f32 %f63, [%rd3];
add.f32 %f70, %f70, %f63;
$L__BB0_25:
bar.sync 0;
@%p2 bra $L__BB0_27;
st.shared.f32 [%rd4], %f70;
$L__BB0_27:
bar.sync 0;
ld.shared.f32 %f13, [%rd4];
bar.sync 0;
@%p3 bra $L__BB0_29;
rcp.rn.f32 %f64, %f13;
mul.f32 %f65, %f64, %f8;
mov.b32 %r66, %f65;
mul.f32 %f66, %f64, %f9;
mov.b32 %r67, %f66;
shl.b32 %r68, %r4, 1;
mul.wide.s32 %rd27, %r68, 4;
add.s64 %rd26, %rd6, %rd27;
// begin inline asm
st.global.cs.v2.s32 [%rd26], {%r66,%r67};
// end inline asm
$L__BB0_29:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -19,350 +19,260 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0[40],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1[32]
)
{
- .reg .pred %p<47>;
- .reg .f32 %f<81>;
- .reg .b32 %r<100>;
- .reg .b64 %rd<35>;
+ .reg .pred %p<34>;
+ .reg .f32 %f<71>;
+ .reg .b32 %r<71>;
+ .reg .b64 %rd<28>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r39, %r40}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+24];
- ld.param.v2.u32 {%r41, %r42}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+32];
- ld.param.u64 %rd8, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1];
- ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0];
- cvta.to.global.u64 %rd1, %rd7;
+ ld.param.v2.u32 {%r32, %r33}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+24];
+ ld.param.v2.u32 {%r34, %r35}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+32];
+ ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1];
+ ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0];
mov.u32 %r1, %tid.x;
- setp.ne.s32 %p3, %r1, 0;
- @%p3 bra $L__BB0_2;
-
- mov.u32 %r49, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r49;
+ setp.ne.s32 %p2, %r1, 0;
+ @%p2 bra $L__BB0_2;
+
+ mov.u32 %r42, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r42;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd9, _ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r50, [%rd9], %r1;
- ld.shared.u32 %r51, [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s];
- shl.b32 %r4, %r51, 1;
- setp.lt.s32 %p4, %r1, 1;
- @%p4 bra $L__BB0_3;
- bra.uni $L__BB0_4;
-
-$L__BB0_3:
- mov.u32 %r52, %tid.y;
- mov.u32 %r53, %ctaid.x;
- mov.u32 %r54, %ntid.y;
- mad.lo.s32 %r5, %r54, %r53, %r52;
- setp.lt.s32 %p5, %r5, 27454;
- @%p5 bra $L__BB0_8;
- bra.uni $L__BB0_4;
+ mov.u64 %rd7, _ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
+ atom.shared.min.s32 %r43, [%rd7], %r1;
+ mov.u32 %r44, %ctaid.x;
+ mov.u32 %r2, %ntid.y;
+ mov.u32 %r3, %tid.y;
+ mad.lo.s32 %r4, %r2, %r44, %r3;
+ setp.gt.s32 %p3, %r4, 27453;
+ mov.f32 %f67, 0fFF800000;
+ mov.f32 %f68, %f67;
+ @%p3 bra $L__BB0_4;
+
+ cvta.to.global.u64 %rd8, %rd5;
+ ld.shared.u32 %r45, [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s];
+ shl.b32 %r46, %r45, 1;
+ mul.lo.s32 %r47, %r4, %r33;
+ mad.lo.s32 %r48, %r46, %r35, %r47;
+ mul.wide.s32 %rd9, %r48, 4;
+ add.s64 %rd10, %rd8, %rd9;
+ ld.global.f32 %f67, [%rd10];
+ add.s32 %r49, %r48, %r35;
+ mul.wide.s32 %rd11, %r49, 4;
+ add.s64 %rd12, %rd8, %rd11;
+ ld.global.f32 %f68, [%rd12];
+
+$L__BB0_4:
+ setp.gt.f32 %p4, %f67, %f68;
+ setp.nan.f32 %p5, %f67, %f67;
+ or.pred %p6, %p5, %p4;
+ selp.f32 %f16, %f67, %f68, %p6;
+ mov.u32 %r50, %tid.z;
+ mad.lo.s32 %r5, %r2, %r50, %r3;
+ mov.u32 %r6, %ntid.x;
+ mad.lo.s32 %r7, %r5, %r6, %r1;
+ mul.wide.u32 %rd13, %r7, 4;
+ mov.u64 %rd14, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd14, %rd13;
+ st.shared.f32 [%rd1], %f16;
+ bar.sync 0;
+ clz.b32 %r51, %r6;
+ mov.u32 %r52, 31;
+ sub.s32 %r53, %r52, %r51;
+ mov.u32 %r54, 1;
+ shl.b32 %r8, %r54, %r53;
+ setp.lt.u32 %p7, %r1, %r8;
+ add.s32 %r55, %r8, %r1;
+ setp.lt.u32 %p8, %r55, %r6;
+ and.pred %p1, %p7, %p8;
+ add.s32 %r56, %r7, %r8;
+ mul.wide.s32 %rd15, %r56, 4;
+ add.s64 %rd2, %rd14, %rd15;
+ not.pred %p9, %p1;
+ @%p9 bra $L__BB0_6;
+
+ ld.shared.f32 %f17, [%rd2];
+ ld.shared.f32 %f18, [%rd1];
+ setp.nan.f32 %p10, %f18, %f18;
+ setp.gt.f32 %p11, %f18, %f17;
+ or.pred %p12, %p10, %p11;
+ selp.f32 %f19, %f18, %f17, %p12;
+ st.shared.f32 [%rd1], %f19;
+
+$L__BB0_6:
+ bar.sync 0;
+ shr.u32 %r57, %r8, 31;
+ add.s32 %r58, %r8, %r57;
+ shr.s32 %r70, %r58, 1;
+ setp.lt.s32 %p13, %r8, 4;
+ @%p13 bra $L__BB0_11;
+
+ mov.u32 %r69, %r70;
$L__BB0_8:
- shl.b32 %r63, %r1, 1;
- add.s32 %r64, %r4, %r63;
- mul.lo.s32 %r65, %r5, %r40;
- mad.lo.s32 %r66, %r64, %r42, %r65;
- mul.wide.s32 %rd14, %r66, 4;
- add.s64 %rd15, %rd1, %rd14;
- ld.global.f32 %f76, [%rd15];
- add.s32 %r67, %r66, %r42;
- mul.wide.s32 %rd16, %r67, 4;
- add.s64 %rd17, %rd1, %rd16;
- ld.global.f32 %f77, [%rd17];
- bra.uni $L__BB0_9;
-
-$L__BB0_4:
- mov.u32 %r55, %ntid.y;
- mov.u32 %r56, %ctaid.x;
- mov.u32 %r57, %tid.y;
- mad.lo.s32 %r58, %r55, %r56, %r57;
- setp.lt.s32 %p7, %r58, 27454;
- shl.b32 %r59, %r1, 1;
- add.s32 %r6, %r4, %r59;
- mul.lo.s32 %r7, %r58, %r40;
- and.pred %p1, %p4, %p7;
- mov.f32 %f77, 0fFF800000;
- not.pred %p8, %p1;
- mov.f32 %f76, %f77;
- @%p8 bra $L__BB0_6;
-
- mad.lo.s32 %r60, %r6, %r42, %r7;
- mul.wide.s32 %rd10, %r60, 4;
- add.s64 %rd11, %rd1, %rd10;
- ld.global.f32 %f76, [%rd11];
-
-$L__BB0_6:
- @%p8 bra $L__BB0_9;
-
- add.s32 %r61, %r6, 1;
- mad.lo.s32 %r62, %r61, %r42, %r7;
- mul.wide.s32 %rd12, %r62, 4;
- add.s64 %rd13, %rd1, %rd12;
- ld.global.f32 %f77, [%rd13];
-
-$L__BB0_9:
- setp.gt.f32 %p10, %f76, %f77;
- setp.nan.f32 %p11, %f76, %f76;
- or.pred %p12, %p11, %p10;
- selp.f32 %f24, %f76, %f77, %p12;
- mov.u32 %r68, %tid.z;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %tid.y;
- mad.lo.s32 %r10, %r8, %r68, %r9;
- mov.u32 %r11, %ntid.x;
- mad.lo.s32 %r12, %r10, %r11, %r1;
- mul.wide.u32 %rd18, %r12, 4;
- mov.u64 %rd19, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd19, %rd18;
- st.shared.f32 [%rd2], %f24;
- bar.sync 0;
- clz.b32 %r69, %r11;
- mov.u32 %r70, 31;
- sub.s32 %r71, %r70, %r69;
- mov.u32 %r72, 1;
- shl.b32 %r13, %r72, %r71;
- setp.lt.u32 %p13, %r1, %r13;
- add.s32 %r73, %r13, %r1;
- setp.lt.u32 %p14, %r73, %r11;
- and.pred %p2, %p13, %p14;
- add.s32 %r74, %r12, %r13;
- mul.wide.s32 %rd20, %r74, 4;
- add.s64 %rd3, %rd19, %rd20;
- not.pred %p15, %p2;
- @%p15 bra $L__BB0_11;
-
- ld.shared.f32 %f25, [%rd3];
- ld.shared.f32 %f26, [%rd2];
- setp.nan.f32 %p16, %f26, %f26;
- setp.gt.f32 %p17, %f26, %f25;
- or.pred %p18, %p16, %p17;
- selp.f32 %f27, %f26, %f25, %p18;
- st.shared.f32 [%rd2], %f27;
+ setp.ge.u32 %p14, %r1, %r69;
+ @%p14 bra $L__BB0_10;
+
+ add.s32 %r59, %r69, %r7;
+ mul.wide.s32 %rd16, %r59, 4;
+ add.s64 %rd18, %rd14, %rd16;
+ ld.shared.f32 %f20, [%rd1];
+ setp.nan.f32 %p15, %f20, %f20;
+ ld.shared.f32 %f21, [%rd18];
+ setp.gt.f32 %p16, %f20, %f21;
+ or.pred %p17, %p15, %p16;
+ selp.f32 %f22, %f20, %f21, %p17;
+ st.shared.f32 [%rd1], %f22;
+
+$L__BB0_10:
+ bar.sync 0;
+ shr.u32 %r11, %r69, 1;
+ setp.gt.u32 %p18, %r69, 3;
+ mov.u32 %r69, %r11;
+ @%p18 bra $L__BB0_8;
$L__BB0_11:
- bar.sync 0;
- shr.u32 %r75, %r13, 31;
- add.s32 %r76, %r13, %r75;
- shr.s32 %r99, %r76, 1;
- setp.lt.s32 %p19, %r13, 4;
- @%p19 bra $L__BB0_16;
-
- mov.u32 %r98, %r99;
-
-$L__BB0_13:
- setp.ge.u32 %p20, %r1, %r98;
- @%p20 bra $L__BB0_15;
-
- add.s32 %r77, %r98, %r12;
- mul.wide.s32 %rd21, %r77, 4;
- add.s64 %rd23, %rd19, %rd21;
- ld.shared.f32 %f28, [%rd2];
- setp.nan.f32 %p21, %f28, %f28;
- ld.shared.f32 %f29, [%rd23];
- setp.gt.f32 %p22, %f28, %f29;
- or.pred %p23, %p21, %p22;
- selp.f32 %f30, %f28, %f29, %p23;
- st.shared.f32 [%rd2], %f30;
+ add.s32 %r60, %r7, 1;
+ mul.wide.u32 %rd19, %r60, 4;
+ add.s64 %rd3, %rd14, %rd19;
+ mov.f32 %f69, 0fFF800000;
+ @%p2 bra $L__BB0_14;
+
+ ld.shared.f32 %f69, [%rd1];
+ setp.lt.u32 %p20, %r6, 2;
+ @%p20 bra $L__BB0_14;
+
+ ld.shared.f32 %f24, [%rd3];
+ setp.gt.f32 %p21, %f69, %f24;
+ setp.nan.f32 %p22, %f69, %f69;
+ or.pred %p23, %p22, %p21;
+ selp.f32 %f69, %f69, %f24, %p23;
+
+$L__BB0_14:
+ bar.sync 0;
+ mul.wide.s32 %rd21, %r5, 4;
+ add.s64 %rd4, %rd14, %rd21;
+ setp.eq.s32 %p24, %r1, 0;
+ @%p24 bra $L__BB0_15;
+ bra.uni $L__BB0_16;
$L__BB0_15:
- bar.sync 0;
- shr.u32 %r16, %r98, 1;
- setp.gt.u32 %p24, %r98, 3;
- mov.u32 %r98, %r16;
- @%p24 bra $L__BB0_13;
+ st.shared.f32 [%rd4], %f69;
$L__BB0_16:
- add.s32 %r78, %r12, 1;
- mul.wide.u32 %rd24, %r78, 4;
- add.s64 %rd4, %rd19, %rd24;
- mov.f32 %f78, 0fFF800000;
- @%p3 bra $L__BB0_19;
-
- ld.shared.f32 %f78, [%rd2];
- setp.lt.u32 %p26, %r11, 2;
- @%p26 bra $L__BB0_19;
-
- ld.shared.f32 %f32, [%rd4];
- setp.gt.f32 %p27, %f78, %f32;
- setp.nan.f32 %p28, %f78, %f78;
- or.pred %p29, %p28, %p27;
- selp.f32 %f78, %f78, %f32, %p29;
+ setp.lt.s32 %p25, %r4, 27454;
+ bar.sync 0;
+ ld.shared.f32 %f25, [%rd4];
+ bar.sync 0;
+ sub.f32 %f26, %f67, %f25;
+ mov.f32 %f27, 0f3F000000;
+ mov.f32 %f28, 0f3BBB989D;
+ fma.rn.f32 %f29, %f26, %f28, %f27;
+ cvt.sat.f32.f32 %f30, %f29;
+ mov.f32 %f31, 0f4B400001;
+ mov.f32 %f32, 0f437C0000;
+ fma.rm.f32 %f33, %f30, %f32, %f31;
+ add.f32 %f34, %f33, 0fCB40007F;
+ neg.f32 %f35, %f34;
+ mov.f32 %f36, 0f3FB8AA3B;
+ fma.rn.f32 %f37, %f26, %f36, %f35;
+ mov.f32 %f38, 0f32A57060;
+ fma.rn.f32 %f39, %f26, %f38, %f37;
+ mov.b32 %r61, %f33;
+ shl.b32 %r62, %r61, 23;
+ mov.b32 %f40, %r62;
+ ex2.approx.ftz.f32 %f41, %f39;
+ mul.f32 %f8, %f41, %f40;
+ sub.f32 %f42, %f68, %f25;
+ fma.rn.f32 %f43, %f42, %f28, %f27;
+ cvt.sat.f32.f32 %f44, %f43;
+ fma.rm.f32 %f45, %f44, %f32, %f31;
+ add.f32 %f46, %f45, 0fCB40007F;
+ neg.f32 %f47, %f46;
+ fma.rn.f32 %f48, %f42, %f36, %f47;
+ fma.rn.f32 %f49, %f42, %f38, %f48;
+ mov.b32 %r63, %f45;
+ shl.b32 %r64, %r63, 23;
+ mov.b32 %f50, %r64;
+ ex2.approx.ftz.f32 %f51, %f49;
+ mul.f32 %f9, %f51, %f50;
+ add.f32 %f52, %f8, 0f00000000;
+ add.f32 %f53, %f52, %f9;
+ selp.f32 %f54, %f53, 0f00000000, %p25;
+ st.shared.f32 [%rd1], %f54;
+ bar.sync 0;
+ @%p9 bra $L__BB0_18;
+
+ ld.shared.f32 %f55, [%rd2];
+ ld.shared.f32 %f56, [%rd1];
+ add.f32 %f57, %f55, %f56;
+ st.shared.f32 [%rd1], %f57;
+
+$L__BB0_18:
+ bar.sync 0;
+ @%p13 bra $L__BB0_22;
$L__BB0_19:
- bar.sync 0;
- mul.wide.s32 %rd26, %r10, 4;
- add.s64 %rd5, %rd19, %rd26;
- setp.eq.s32 %p30, %r1, 0;
- @%p30 bra $L__BB0_20;
- bra.uni $L__BB0_21;
-
-$L__BB0_20:
- st.shared.f32 [%rd5], %f78;
+ setp.ge.u32 %p28, %r1, %r70;
+ @%p28 bra $L__BB0_21;
+
+ add.s32 %r65, %r70, %r7;
+ mul.wide.s32 %rd23, %r65, 4;
+ add.s64 %rd25, %rd14, %rd23;
+ ld.shared.f32 %f58, [%rd1];
+ ld.shared.f32 %f59, [%rd25];
+ add.f32 %f60, %f59, %f58;
+ st.shared.f32 [%rd1], %f60;
$L__BB0_21:
bar.sync 0;
- ld.shared.f32 %f33, [%rd5];
- bar.sync 0;
- sub.f32 %f34, %f76, %f33;
- mov.f32 %f35, 0f3F000000;
- mov.f32 %f36, 0f3BBB989D;
- fma.rn.f32 %f37, %f34, %f36, %f35;
- cvt.sat.f32.f32 %f38, %f37;
- mov.f32 %f39, 0f4B400001;
- mov.f32 %f40, 0f437C0000;
- fma.rm.f32 %f41, %f38, %f40, %f39;
- add.f32 %f42, %f41, 0fCB40007F;
- neg.f32 %f43, %f42;
- mov.f32 %f44, 0f3FB8AA3B;
- fma.rn.f32 %f45, %f34, %f44, %f43;
- mov.f32 %f46, 0f32A57060;
- fma.rn.f32 %f47, %f34, %f46, %f45;
- mov.b32 %r79, %f41;
- shl.b32 %r80, %r79, 23;
- mov.b32 %f48, %r80;
- ex2.approx.ftz.f32 %f49, %f47;
- mul.f32 %f11, %f49, %f48;
- sub.f32 %f50, %f77, %f33;
- fma.rn.f32 %f51, %f50, %f36, %f35;
- cvt.sat.f32.f32 %f52, %f51;
- fma.rm.f32 %f53, %f52, %f40, %f39;
- add.f32 %f54, %f53, 0fCB40007F;
- neg.f32 %f55, %f54;
- fma.rn.f32 %f56, %f50, %f44, %f55;
- fma.rn.f32 %f57, %f50, %f46, %f56;
- mov.b32 %r81, %f53;
- shl.b32 %r82, %r81, 23;
- mov.b32 %f58, %r82;
- ex2.approx.ftz.f32 %f59, %f57;
- mul.f32 %f12, %f59, %f58;
- @%p4 bra $L__BB0_22;
- bra.uni $L__BB0_23;
+ shr.u32 %r13, %r70, 1;
+ setp.gt.u32 %p29, %r70, 3;
+ mov.u32 %r70, %r13;
+ @%p29 bra $L__BB0_19;
$L__BB0_22:
- mov.u32 %r83, %ctaid.x;
- mad.lo.s32 %r84, %r8, %r83, %r9;
- setp.lt.s32 %p32, %r84, 27454;
- @%p32 bra $L__BB0_24;
- bra.uni $L__BB0_23;
-
-$L__BB0_24:
- add.f32 %f62, %f11, 0f00000000;
- add.f32 %f79, %f62, %f12;
- bra.uni $L__BB0_25;
-
-$L__BB0_23:
- mov.u32 %r85, %ctaid.x;
- mad.lo.s32 %r86, %r8, %r85, %r9;
- setp.lt.s32 %p34, %r86, 27454;
- and.pred %p35, %p4, %p34;
- add.f32 %f60, %f11, 0f00000000;
- add.f32 %f61, %f60, %f12;
- selp.f32 %f79, %f61, 0f00000000, %p35;
+ mov.f32 %f70, 0f00000000;
+ @%p2 bra $L__BB0_25;
+
+ ld.shared.f32 %f62, [%rd1];
+ add.f32 %f70, %f62, 0f00000000;
+ setp.lt.u32 %p31, %r6, 2;
+ @%p31 bra $L__BB0_25;
+
+ ld.shared.f32 %f63, [%rd3];
+ add.f32 %f70, %f70, %f63;
$L__BB0_25:
- st.shared.f32 [%rd2], %f79;
- bar.sync 0;
- @%p15 bra $L__BB0_27;
-
- ld.shared.f32 %f63, [%rd3];
- ld.shared.f32 %f64, [%rd2];
- add.f32 %f65, %f63, %f64;
- st.shared.f32 [%rd2], %f65;
+ bar.sync 0;
+ @%p2 bra $L__BB0_27;
+
+ st.shared.f32 [%rd4], %f70;
$L__BB0_27:
bar.sync 0;
- @%p19 bra $L__BB0_31;
-
-$L__BB0_28:
- setp.ge.u32 %p38, %r1, %r99;
- @%p38 bra $L__BB0_30;
-
- add.s32 %r87, %r99, %r12;
- mul.wide.s32 %rd28, %r87, 4;
- add.s64 %rd30, %rd19, %rd28;
- ld.shared.f32 %f66, [%rd2];
- ld.shared.f32 %f67, [%rd30];
- add.f32 %f68, %f67, %f66;
- st.shared.f32 [%rd2], %f68;
-
-$L__BB0_30:
- bar.sync 0;
- shr.u32 %r18, %r99, 1;
- setp.gt.u32 %p39, %r99, 3;
- mov.u32 %r99, %r18;
- @%p39 bra $L__BB0_28;
-
-$L__BB0_31:
- mov.f32 %f80, 0f00000000;
- @%p3 bra $L__BB0_34;
-
- ld.shared.f32 %f70, [%rd2];
- add.f32 %f80, %f70, 0f00000000;
- setp.lt.u32 %p41, %r11, 2;
- @%p41 bra $L__BB0_34;
-
- ld.shared.f32 %f71, [%rd4];
- add.f32 %f80, %f80, %f71;
-
-$L__BB0_34:
- bar.sync 0;
- @%p3 bra $L__BB0_36;
-
- st.shared.f32 [%rd5], %f80;
-
-$L__BB0_36:
- setp.gt.s32 %p43, %r1, 0;
- bar.sync 0;
- ld.shared.f32 %f72, [%rd5];
- bar.sync 0;
- rcp.rn.f32 %f19, %f72;
- @%p43 bra $L__BB0_38;
-
- mov.u32 %r88, %ctaid.x;
- mad.lo.s32 %r19, %r8, %r88, %r9;
- setp.lt.s32 %p44, %r19, 27454;
- @%p44 bra $L__BB0_41;
- bra.uni $L__BB0_38;
-
-$L__BB0_41:
- mul.f32 %f73, %f19, %f11;
- mov.b32 %r94, %f73;
- mul.f32 %f74, %f19, %f12;
- mov.b32 %r95, %f74;
- add.s32 %r96, %r19, %r1;
- shl.b32 %r97, %r96, 1;
- mul.wide.s32 %rd34, %r97, 4;
- add.s64 %rd33, %rd8, %rd34;
+ ld.shared.f32 %f13, [%rd4];
+ bar.sync 0;
+ @%p3 bra $L__BB0_29;
+
+ rcp.rn.f32 %f64, %f13;
+ mul.f32 %f65, %f64, %f8;
+ mov.b32 %r66, %f65;
+ mul.f32 %f66, %f64, %f9;
+ mov.b32 %r67, %f66;
+ shl.b32 %r68, %r4, 1;
+ mul.wide.s32 %rd27, %r68, 4;
+ add.s64 %rd26, %rd6, %rd27;
- st.global.cs.v2.s32 [%rd33], {%r94,%r95};
+ st.global.cs.v2.s32 [%rd26], {%r66,%r67};
- bra.uni $L__BB0_42;
-
-$L__BB0_38:
- mul.f32 %f20, %f19, %f11;
- mul.f32 %f21, %f19, %f12;
- @%p43 bra $L__BB0_42;
-
- mov.u32 %r89, %ctaid.x;
- mad.lo.s32 %r20, %r8, %r89, %r9;
- setp.gt.s32 %p46, %r20, 27453;
- @%p46 bra $L__BB0_42;
-
- add.s32 %r92, %r20, %r1;
- shl.b32 %r93, %r92, 1;
- mul.wide.s32 %rd32, %r93, 4;
- add.s64 %rd31, %rd8, %rd32;
- mov.b32 %r90, %f20;
- mov.b32 %r91, %f21;
-
- st.global.cs.v2.s32 [%rd31], {%r90,%r91};
-
-
-$L__BB0_42:
+
+$L__BB0_29:
ret;
}
Kernel 178
CUDA
PTX
53997da5d
Diff
03a1b695e
-9
+9 index type: int
registers: 0
gmem: 0
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 3, 3> T8, Tensor<float, 3, 3> T14) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T12[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T12[i0]
= T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T12[i0]
= T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T16;
T16[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T16[0] = fmax(
T16[0],
T12[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T16[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T17;
T17[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T12[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
T17[0]
= T17[0]
+ T5[0];
T15[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T14[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
} else {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T12[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T17[0]
= T17[0]
+ T5[0];
}
T15[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T14[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
}
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T17[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T13;
T13[0]
= reciprocal(T7[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T8[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T13[0];
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 3, 3> T8, Tensor<float, 3, 3> T14) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T12;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T12[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T12[i0]
= T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T12[i0]
= T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T16;
T16[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T16[0] = fmax(
T16[0],
T12[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T16[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T17;
T17[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T12[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
T17[0]
= T17[0]
+ T5[0];
T15[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T14[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
} else {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T12[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T17[0]
= T17[0]
+ T5[0];
}
T15[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T14[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
}
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T17[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T13;
T13[0]
= reciprocal(T7[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T8[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T13[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -6,23 +6,23 @@
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T12[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T12[i0]
- = T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T12[i0]
- = T0[(((((2 * T0.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[2LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T16;
@@ -39,11 +39,11 @@
blockReduce<true, false, false, true>(T2[0], T16[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T17;
T17[0] = 0.000000000e+00f;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
@@ -57,11 +57,11 @@
+ T5[0];
T15[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T14[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T14[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
} else {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
@@ -69,21 +69,21 @@
= T12[i2]
- T3[0];
Array<float, 1, 1> T5;
T5[0]
= expf(T4[0]);
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T17[0]
= T17[0]
+ T5[0];
}
T15[i2]
= T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T14[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T14[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
}
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T17[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_2[32]
)
{
.reg .pred %p<44>;
.reg .f32 %f<105>;
.reg .b32 %r<112>;
.reg .b64 %rd<39>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r43, %r44}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
ld.param.u64 %rd9, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_2];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_1];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_0];
cvta.to.global.u64 %rd1, %rd7;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p4, %r1, 0;
@%p4 bra $L__BB0_2;
mov.u32 %r57, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r57;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd10, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r58, [%rd10], %r1;
ld.shared.u32 %r59, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_1911011nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
shl.b32 %r4, %r59, 1;
setp.lt.s32 %p5, %r1, 1;
@%p5 bra $L__BB0_3;
bra.uni $L__BB0_4;
$L__BB0_3:
mov.u32 %r60, %tid.y;
mov.u32 %r61, %ctaid.x;
mov.u32 %r62, %ntid.y;
mad.lo.s32 %r5, %r62, %r61, %r60;
setp.lt.s32 %p6, %r5, 27454;
@%p6 bra $L__BB0_8;
bra.uni $L__BB0_4;
$L__BB0_8:
shl.b32 %r71, %r1, 1;
add.s32 %r72, %r4, %r71;
mul.lo.s32 %r73, %r5, %r43;
mad.lo.s32 %r74, %r72, %r44, %r73;
mul.wide.s32 %rd15, %r74, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f100, [%rd16];
add.s32 %r75, %r74, %r44;
mul.wide.s32 %rd17, %r75, 4;
add.s64 %rd18, %rd1, %rd17;
ld.global.f32 %f101, [%rd18];
bra.uni $L__BB0_9;
$L__BB0_4:
mov.u32 %r63, %ntid.y;
mov.u32 %r64, %ctaid.x;
mov.u32 %r65, %tid.y;
mad.lo.s32 %r66, %r63, %r64, %r65;
setp.lt.s32 %p8, %r66, 27454;
shl.b32 %r67, %r1, 1;
add.s32 %r6, %r4, %r67;
mul.lo.s32 %r7, %r66, %r43;
and.pred %p1, %p5, %p8;
mov.f32 %f101, 0fFF800000;
not.pred %p9, %p1;
mov.f32 %f100, %f101;
@%p9 bra $L__BB0_6;
mad.lo.s32 %r68, %r6, %r44, %r7;
mul.wide.s32 %rd11, %r68, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.f32 %f100, [%rd12];
$L__BB0_6:
@%p9 bra $L__BB0_9;
add.s32 %r69, %r6, 1;
mad.lo.s32 %r70, %r69, %r44, %r7;
mul.wide.s32 %rd13, %r70, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f101, [%rd14];
$L__BB0_9:
setp.gt.f32 %p11, %f100, %f101;
setp.nan.f32 %p12, %f100, %f100;
or.pred %p13, %p12, %p11;
selp.f32 %f23, %f100, %f101, %p13;
mov.u32 %r76, %tid.z;
mov.u32 %r8, %ntid.y;
mov.u32 %r9, %tid.y;
mad.lo.s32 %r10, %r8, %r76, %r9;
mov.u32 %r11, %ntid.x;
mad.lo.s32 %r12, %r10, %r11, %r1;
mul.wide.u32 %rd19, %r12, 4;
mov.u64 %rd20, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_aba0e25b_191105arrayE;
add.s64 %rd2, %rd20, %rd19;
st.shared.f32 [%rd2], %f23;
bar.sync 0;
clz.b32 %r77, %r11;
mov.u32 %r78, 31;
sub.s32 %r79, %r78, %r77;
mov.u32 %r80, 1;
shl.b32 %r13, %r80, %r79;
setp.lt.u32 %p14, %r1, %r13;
add.s32 %r81, %r13, %r1;
setp.lt.u32 %p15, %r81, %r11;
and.pred %p2, %p14, %p15;
add.s32 %r82, %r12, %r13;
mul.wide.s32 %rd21, %r82, 4;
add.s64 %rd3, %rd20, %rd21;
not.pred %p16, %p2;
@%p16 bra $L__BB0_11;
ld.shared.f32 %f24, [%rd3];
ld.shared.f32 %f25, [%rd2];
setp.nan.f32 %p17, %f25, %f25;
setp.gt.f32 %p18, %f25, %f24;
or.pred %p19, %p17, %p18;
selp.f32 %f26, %f25, %f24, %p19;
st.shared.f32 [%rd2], %f26;
$L__BB0_11:
bar.sync 0;
shr.u32 %r83, %r13, 31;
add.s32 %r84, %r13, %r83;
shr.s32 %r111, %r84, 1;
setp.lt.s32 %p20, %r13, 4;
@%p20 bra $L__BB0_16;
mov.u32 %r110, %r111;
$L__BB0_13:
setp.ge.u32 %p21, %r1, %r110;
@%p21 bra $L__BB0_15;
add.s32 %r85, %r110, %r12;
mul.wide.s32 %rd22, %r85, 4;
add.s64 %rd24, %rd20, %rd22;
ld.shared.f32 %f27, [%rd2];
setp.nan.f32 %p22, %f27, %f27;
ld.shared.f32 %f28, [%rd24];
setp.gt.f32 %p23, %f27, %f28;
or.pred %p24, %p22, %p23;
selp.f32 %f29, %f27, %f28, %p24;
st.shared.f32 [%rd2], %f29;
$L__BB0_15:
bar.sync 0;
shr.u32 %r16, %r110, 1;
setp.gt.u32 %p25, %r110, 3;
mov.u32 %r110, %r16;
@%p25 bra $L__BB0_13;
$L__BB0_16:
add.s32 %r86, %r12, 1;
mul.wide.u32 %rd25, %r86, 4;
add.s64 %rd4, %rd20, %rd25;
mov.f32 %f102, 0fFF800000;
@%p4 bra $L__BB0_19;
ld.shared.f32 %f102, [%rd2];
setp.lt.u32 %p27, %r11, 2;
@%p27 bra $L__BB0_19;
ld.shared.f32 %f31, [%rd4];
setp.gt.f32 %p28, %f102, %f31;
setp.nan.f32 %p29, %f102, %f102;
or.pred %p30, %p29, %p28;
selp.f32 %f102, %f102, %f31, %p30;
$L__BB0_19:
bar.sync 0;
mul.wide.s32 %rd27, %r10, 4;
add.s64 %rd5, %rd20, %rd27;
setp.eq.s32 %p31, %r1, 0;
@%p31 bra $L__BB0_20;
bra.uni $L__BB0_21;
$L__BB0_20:
st.shared.f32 [%rd5], %f102;
$L__BB0_21:
bar.sync 0;
ld.shared.f32 %f11, [%rd5];
bar.sync 0;
@%p5 bra $L__BB0_22;
bra.uni $L__BB0_23;
$L__BB0_22:
mov.u32 %r87, %ctaid.x;
mad.lo.s32 %r17, %r8, %r87, %r9;
setp.lt.s32 %p33, %r17, 27454;
@%p33 bra $L__BB0_25;
bra.uni $L__BB0_23;
$L__BB0_25:
sub.f32 %f60, %f100, %f11;
mov.f32 %f61, 0f3F000000;
mov.f32 %f62, 0f3BBB989D;
fma.rn.f32 %f63, %f60, %f62, %f61;
cvt.sat.f32.f32 %f64, %f63;
mov.f32 %f65, 0f4B400001;
mov.f32 %f66, 0f437C0000;
fma.rm.f32 %f67, %f64, %f66, %f65;
add.f32 %f68, %f67, 0fCB40007F;
neg.f32 %f69, %f68;
mov.f32 %f70, 0f3FB8AA3B;
fma.rn.f32 %f71, %f60, %f70, %f69;
mov.f32 %f72, 0f32A57060;
fma.rn.f32 %f73, %f60, %f72, %f71;
mov.b32 %r102, %f67;
shl.b32 %r103, %r102, 23;
mov.b32 %f74, %r103;
ex2.approx.ftz.f32 %f75, %f73;
mul.f32 %f76, %f75, %f74;
add.f32 %f77, %f76, 0f00000000;
mov.b32 %r100, %f76;
sub.f32 %f78, %f101, %f11;
fma.rn.f32 %f79, %f78, %f62, %f61;
cvt.sat.f32.f32 %f80, %f79;
fma.rm.f32 %f81, %f80, %f66, %f65;
add.f32 %f82, %f81, 0fCB40007F;
neg.f32 %f83, %f82;
fma.rn.f32 %f84, %f78, %f70, %f83;
fma.rn.f32 %f85, %f78, %f72, %f84;
mov.b32 %r104, %f81;
shl.b32 %r105, %r104, 23;
mov.b32 %f86, %r105;
ex2.approx.ftz.f32 %f87, %f85;
mul.f32 %f88, %f87, %f86;
add.f32 %f103, %f77, %f88;
mov.b32 %r101, %f88;
add.s32 %r106, %r17, %r1;
shl.b32 %r107, %r106, 1;
mul.wide.s32 %rd32, %r107, 4;
add.s64 %rd31, %rd9, %rd32;
// begin inline asm
st.global.cs.v2.s32 [%rd31], {%r100,%r101};
// end inline asm
bra.uni $L__BB0_26;
$L__BB0_23:
mov.u32 %r88, %ctaid.x;
mad.lo.s32 %r89, %r8, %r88, %r9;
setp.lt.s32 %p35, %r89, 27454;
sub.f32 %f33, %f100, %f11;
mov.f32 %f34, 0f3F000000;
mov.f32 %f35, 0f3BBB989D;
fma.rn.f32 %f36, %f33, %f35, %f34;
cvt.sat.f32.f32 %f37, %f36;
mov.f32 %f38, 0f4B400001;
mov.f32 %f39, 0f437C0000;
fma.rm.f32 %f40, %f37, %f39, %f38;
add.f32 %f41, %f40, 0fCB40007F;
neg.f32 %f42, %f41;
mov.f32 %f43, 0f3FB8AA3B;
fma.rn.f32 %f44, %f33, %f43, %f42;
mov.f32 %f45, 0f32A57060;
fma.rn.f32 %f46, %f33, %f45, %f44;
mov.b32 %r90, %f40;
shl.b32 %r91, %r90, 23;
mov.b32 %f47, %r91;
ex2.approx.ftz.f32 %f48, %f46;
mul.f32 %f12, %f48, %f47;
add.f32 %f49, %f12, 0f00000000;
mov.f32 %f103, 0f00000000;
and.pred %p3, %p5, %p35;
sub.f32 %f50, %f101, %f11;
fma.rn.f32 %f51, %f50, %f35, %f34;
cvt.sat.f32.f32 %f52, %f51;
fma.rm.f32 %f53, %f52, %f39, %f38;
add.f32 %f54, %f53, 0fCB40007F;
neg.f32 %f55, %f54;
fma.rn.f32 %f56, %f50, %f43, %f55;
fma.rn.f32 %f57, %f50, %f45, %f56;
mov.b32 %r92, %f53;
shl.b32 %r93, %r92, 23;
mov.b32 %f58, %r93;
ex2.approx.ftz.f32 %f59, %f57;
mul.f32 %f13, %f59, %f58;
add.f32 %f14, %f49, %f13;
not.pred %p36, %p3;
@%p36 bra $L__BB0_26;
mov.b32 %r95, %f13;
add.s32 %r96, %r1, %r9;
mad.lo.s32 %r98, %r8, %r88, %r96;
shl.b32 %r99, %r98, 1;
mul.wide.s32 %rd30, %r99, 4;
add.s64 %rd29, %rd9, %rd30;
mov.b32 %r94, %f12;
// begin inline asm
st.global.cs.v2.s32 [%rd29], {%r94,%r95};
// end inline asm
selp.f32 %f103, %f14, 0f00000000, %p3;
$L__BB0_26:
st.shared.f32 [%rd2], %f103;
bar.sync 0;
@%p16 bra $L__BB0_28;
ld.shared.f32 %f89, [%rd3];
ld.shared.f32 %f90, [%rd2];
add.f32 %f91, %f89, %f90;
st.shared.f32 [%rd2], %f91;
$L__BB0_28:
bar.sync 0;
@%p20 bra $L__BB0_32;
$L__BB0_29:
setp.ge.u32 %p39, %r1, %r111;
@%p39 bra $L__BB0_31;
add.s32 %r108, %r111, %r12;
mul.wide.s32 %rd33, %r108, 4;
add.s64 %rd35, %rd20, %rd33;
ld.shared.f32 %f92, [%rd2];
ld.shared.f32 %f93, [%rd35];
add.f32 %f94, %f93, %f92;
st.shared.f32 [%rd2], %f94;
$L__BB0_31:
bar.sync 0;
shr.u32 %r19, %r111, 1;
setp.gt.u32 %p40, %r111, 3;
mov.u32 %r111, %r19;
@%p40 bra $L__BB0_29;
$L__BB0_32:
mov.f32 %f104, 0f00000000;
@%p4 bra $L__BB0_35;
ld.shared.f32 %f96, [%rd2];
add.f32 %f104, %f96, 0f00000000;
setp.lt.u32 %p42, %r11, 2;
@%p42 bra $L__BB0_35;
ld.shared.f32 %f97, [%rd4];
add.f32 %f104, %f104, %f97;
$L__BB0_35:
bar.sync 0;
mov.u32 %r109, %ctaid.x;
mad.lo.s32 %r20, %r8, %r109, %r9;
setp.gt.s32 %p43, %r20, 27453;
@%p43 bra $L__BB0_37;
rcp.rn.f32 %f98, %f104;
cvta.to.global.u64 %rd36, %rd8;
mul.wide.s32 %rd37, %r20, 4;
add.s64 %rd38, %rd36, %rd37;
st.global.f32 [%rd38], %f98;
$L__BB0_37:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_2[32]
)
{
.reg .pred %p<33>;
.reg .f32 %f<70>;
.reg .b32 %r<79>;
.reg .b64 %rd<32>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r36, %r37}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_2];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_1];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1__param_0];
mov.u32 %r1, %tid.x;
setp.ne.s32 %p2, %r1, 0;
@%p2 bra $L__BB0_2;
mov.u32 %r50, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r50;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd8, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r51, [%rd8], %r1;
mov.u32 %r52, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r52, %r3;
setp.gt.s32 %p3, %r4, 27453;
mov.f32 %f65, 0fFF800000;
mov.f32 %f66, %f65;
@%p3 bra $L__BB0_4;
cvta.to.global.u64 %rd9, %rd5;
ld.shared.u32 %r53, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_1601111nvfuser_183ENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
shl.b32 %r54, %r53, 1;
mul.lo.s32 %r55, %r4, %r36;
mad.lo.s32 %r56, %r54, %r37, %r55;
mul.wide.s32 %rd10, %r56, 4;
add.s64 %rd11, %rd9, %rd10;
ld.global.f32 %f65, [%rd11];
add.s32 %r57, %r56, %r37;
mul.wide.s32 %rd12, %r57, 4;
add.s64 %rd13, %rd9, %rd12;
ld.global.f32 %f66, [%rd13];
$L__BB0_4:
setp.gt.f32 %p4, %f65, %f66;
setp.nan.f32 %p5, %f65, %f65;
or.pred %p6, %p5, %p4;
selp.f32 %f16, %f65, %f66, %p6;
mov.u32 %r58, %tid.z;
mad.lo.s32 %r5, %r2, %r58, %r3;
mov.u32 %r6, %ntid.x;
mad.lo.s32 %r7, %r5, %r6, %r1;
mul.wide.u32 %rd14, %r7, 4;
mov.u64 %rd15, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_183_cu_41f3cdfb_160115arrayE;
add.s64 %rd1, %rd15, %rd14;
st.shared.f32 [%rd1], %f16;
bar.sync 0;
clz.b32 %r59, %r6;
mov.u32 %r60, 31;
sub.s32 %r61, %r60, %r59;
mov.u32 %r62, 1;
shl.b32 %r8, %r62, %r61;
setp.lt.u32 %p7, %r1, %r8;
add.s32 %r63, %r8, %r1;
setp.lt.u32 %p8, %r63, %r6;
and.pred %p1, %p7, %p8;
add.s32 %r64, %r7, %r8;
mul.wide.s32 %rd16, %r64, 4;
add.s64 %rd2, %rd15, %rd16;
not.pred %p9, %p1;
@%p9 bra $L__BB0_6;
ld.shared.f32 %f17, [%rd2];
ld.shared.f32 %f18, [%rd1];
setp.nan.f32 %p10, %f18, %f18;
setp.gt.f32 %p11, %f18, %f17;
or.pred %p12, %p10, %p11;
selp.f32 %f19, %f18, %f17, %p12;
st.shared.f32 [%rd1], %f19;
$L__BB0_6:
bar.sync 0;
shr.u32 %r65, %r8, 31;
add.s32 %r66, %r8, %r65;
shr.s32 %r78, %r66, 1;
setp.lt.s32 %p13, %r8, 4;
@%p13 bra $L__BB0_11;
mov.u32 %r77, %r78;
$L__BB0_8:
setp.ge.u32 %p14, %r1, %r77;
@%p14 bra $L__BB0_10;
add.s32 %r67, %r77, %r7;
mul.wide.s32 %rd17, %r67, 4;
add.s64 %rd19, %rd15, %rd17;
ld.shared.f32 %f20, [%rd1];
setp.nan.f32 %p15, %f20, %f20;
ld.shared.f32 %f21, [%rd19];
setp.gt.f32 %p16, %f20, %f21;
or.pred %p17, %p15, %p16;
selp.f32 %f22, %f20, %f21, %p17;
st.shared.f32 [%rd1], %f22;
$L__BB0_10:
bar.sync 0;
shr.u32 %r11, %r77, 1;
setp.gt.u32 %p18, %r77, 3;
mov.u32 %r77, %r11;
@%p18 bra $L__BB0_8;
$L__BB0_11:
add.s32 %r68, %r7, 1;
mul.wide.u32 %rd20, %r68, 4;
add.s64 %rd3, %rd15, %rd20;
mov.f32 %f67, 0fFF800000;
@%p2 bra $L__BB0_14;
ld.shared.f32 %f67, [%rd1];
setp.lt.u32 %p20, %r6, 2;
@%p20 bra $L__BB0_14;
ld.shared.f32 %f24, [%rd3];
setp.gt.f32 %p21, %f67, %f24;
setp.nan.f32 %p22, %f67, %f67;
or.pred %p23, %p22, %p21;
selp.f32 %f67, %f67, %f24, %p23;
$L__BB0_14:
bar.sync 0;
mul.wide.s32 %rd22, %r5, 4;
add.s64 %rd4, %rd15, %rd22;
setp.eq.s32 %p24, %r1, 0;
@%p24 bra $L__BB0_15;
bra.uni $L__BB0_16;
$L__BB0_15:
st.shared.f32 [%rd4], %f67;
$L__BB0_16:
bar.sync 0;
ld.shared.f32 %f8, [%rd4];
bar.sync 0;
mov.f32 %f68, 0f00000000;
@%p3 bra $L__BB0_18;
sub.f32 %f26, %f65, %f8;
mov.f32 %f27, 0f3F000000;
mov.f32 %f28, 0f3BBB989D;
fma.rn.f32 %f29, %f26, %f28, %f27;
cvt.sat.f32.f32 %f30, %f29;
mov.f32 %f31, 0f4B400001;
mov.f32 %f32, 0f437C0000;
fma.rm.f32 %f33, %f30, %f32, %f31;
add.f32 %f34, %f33, 0fCB40007F;
neg.f32 %f35, %f34;
mov.f32 %f36, 0f3FB8AA3B;
fma.rn.f32 %f37, %f26, %f36, %f35;
mov.f32 %f38, 0f32A57060;
fma.rn.f32 %f39, %f26, %f38, %f37;
mov.b32 %r71, %f33;
shl.b32 %r72, %r71, 23;
mov.b32 %f40, %r72;
ex2.approx.ftz.f32 %f41, %f39;
mul.f32 %f42, %f41, %f40;
add.f32 %f43, %f42, 0f00000000;
mov.b32 %r69, %f42;
sub.f32 %f44, %f66, %f8;
fma.rn.f32 %f45, %f44, %f28, %f27;
cvt.sat.f32.f32 %f46, %f45;
fma.rm.f32 %f47, %f46, %f32, %f31;
add.f32 %f48, %f47, 0fCB40007F;
neg.f32 %f49, %f48;
fma.rn.f32 %f50, %f44, %f36, %f49;
fma.rn.f32 %f51, %f44, %f38, %f50;
mov.b32 %r73, %f47;
shl.b32 %r74, %r73, 23;
mov.b32 %f52, %r74;
ex2.approx.ftz.f32 %f53, %f51;
mul.f32 %f54, %f53, %f52;
add.f32 %f68, %f43, %f54;
mov.b32 %r70, %f54;
shl.b32 %r75, %r4, 1;
mul.wide.s32 %rd25, %r75, 4;
add.s64 %rd24, %rd7, %rd25;
// begin inline asm
st.global.cs.v2.s32 [%rd24], {%r69,%r70};
// end inline asm
$L__BB0_18:
st.shared.f32 [%rd1], %f68;
bar.sync 0;
@%p9 bra $L__BB0_20;
ld.shared.f32 %f55, [%rd2];
ld.shared.f32 %f56, [%rd1];
add.f32 %f57, %f55, %f56;
st.shared.f32 [%rd1], %f57;
$L__BB0_20:
bar.sync 0;
@%p13 bra $L__BB0_24;
$L__BB0_21:
setp.ge.u32 %p28, %r1, %r78;
@%p28 bra $L__BB0_23;
add.s32 %r76, %r78, %r7;
mul.wide.s32 %rd26, %r76, 4;
add.s64 %rd28, %rd15, %rd26;
ld.shared.f32 %f58, [%rd1];
ld.shared.f32 %f59, [%rd28];
add.f32 %f60, %f59, %f58;
st.shared.f32 [%rd1], %f60;
$L__BB0_23:
bar.sync 0;
shr.u32 %r13, %r78, 1;
setp.gt.u32 %p29, %r78, 3;
mov.u32 %r78, %r13;
@%p29 bra $L__BB0_21;
$L__BB0_24:
mov.f32 %f69, 0f00000000;
@%p2 bra $L__BB0_27;
ld.shared.f32 %f62, [%rd1];
add.f32 %f69, %f62, 0f00000000;
setp.lt.u32 %p31, %r6, 2;
@%p31 bra $L__BB0_27;
ld.shared.f32 %f63, [%rd3];
add.f32 %f69, %f69, %f63;
$L__BB0_27:
bar.sync 0;
@%p3 bra $L__BB0_29;
rcp.rn.f32 %f64, %f69;
cvta.to.global.u64 %rd29, %rd6;
mul.wide.s32 %rd30, %r4, 4;
add.s64 %rd31, %rd29, %rd30;
st.global.f32 [%rd31], %f64;
$L__BB0_29:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,364 +20,257 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_1[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_2[32]
)
{
- .reg .pred %p<44>;
- .reg .f32 %f<105>;
- .reg .b32 %r<112>;
- .reg .b64 %rd<39>;
+ .reg .pred %p<33>;
+ .reg .f32 %f<70>;
+ .reg .b32 %r<79>;
+ .reg .b64 %rd<32>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
- ld.param.v2.u32 {%r43, %r44}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
- ld.param.u64 %rd9, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_2];
- ld.param.u64 %rd8, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_1];
- ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0];
- cvta.to.global.u64 %rd1, %rd7;
+ ld.param.v2.u32 {%r36, %r37}, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0+24];
+ ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_2];
+ ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_1];
+ ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1__param_0];
mov.u32 %r1, %tid.x;
- setp.ne.s32 %p4, %r1, 0;
- @%p4 bra $L__BB0_2;
-
- mov.u32 %r57, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r57;
+ setp.ne.s32 %p2, %r1, 0;
+ @%p2 bra $L__BB0_2;
+
+ mov.u32 %r50, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s], %r50;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd10, _ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
- atom.shared.min.s32 %r58, [%rd10], %r1;
- ld.shared.u32 %r59, [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
- shl.b32 %r4, %r59, 1;
- setp.lt.s32 %p5, %r1, 1;
- @%p5 bra $L__BB0_3;
- bra.uni $L__BB0_4;
-
-$L__BB0_3:
- mov.u32 %r60, %tid.y;
- mov.u32 %r61, %ctaid.x;
- mov.u32 %r62, %ntid.y;
- mad.lo.s32 %r5, %r62, %r61, %r60;
- setp.lt.s32 %p6, %r5, 27454;
- @%p6 bra $L__BB0_8;
- bra.uni $L__BB0_4;
+ mov.u64 %rd8, _ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s;
+ atom.shared.min.s32 %r51, [%rd8], %r1;
+ mov.u32 %r52, %ctaid.x;
+ mov.u32 %r2, %ntid.y;
+ mov.u32 %r3, %tid.y;
+ mad.lo.s32 %r4, %r2, %r52, %r3;
+ setp.gt.s32 %p3, %r4, 27453;
+ mov.f32 %f65, 0fFF800000;
+ mov.f32 %f66, %f65;
+ @%p3 bra $L__BB0_4;
+
+ cvta.to.global.u64 %rd9, %rd5;
+ ld.shared.u32 %r53, [_ZZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEES1_S1_E14nvfuser_zero_s];
+ shl.b32 %r54, %r53, 1;
+ mul.lo.s32 %r55, %r4, %r36;
+ mad.lo.s32 %r56, %r54, %r37, %r55;
+ mul.wide.s32 %rd10, %r56, 4;
+ add.s64 %rd11, %rd9, %rd10;
+ ld.global.f32 %f65, [%rd11];
+ add.s32 %r57, %r56, %r37;
+ mul.wide.s32 %rd12, %r57, 4;
+ add.s64 %rd13, %rd9, %rd12;
+ ld.global.f32 %f66, [%rd13];
+
+$L__BB0_4:
+ setp.gt.f32 %p4, %f65, %f66;
+ setp.nan.f32 %p5, %f65, %f65;
+ or.pred %p6, %p5, %p4;
+ selp.f32 %f16, %f65, %f66, %p6;
+ mov.u32 %r58, %tid.z;
+ mad.lo.s32 %r5, %r2, %r58, %r3;
+ mov.u32 %r6, %ntid.x;
+ mad.lo.s32 %r7, %r5, %r6, %r1;
+ mul.wide.u32 %rd14, %r7, 4;
+ mov.u64 %rd15, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd15, %rd14;
+ st.shared.f32 [%rd1], %f16;
+ bar.sync 0;
+ clz.b32 %r59, %r6;
+ mov.u32 %r60, 31;
+ sub.s32 %r61, %r60, %r59;
+ mov.u32 %r62, 1;
+ shl.b32 %r8, %r62, %r61;
+ setp.lt.u32 %p7, %r1, %r8;
+ add.s32 %r63, %r8, %r1;
+ setp.lt.u32 %p8, %r63, %r6;
+ and.pred %p1, %p7, %p8;
+ add.s32 %r64, %r7, %r8;
+ mul.wide.s32 %rd16, %r64, 4;
+ add.s64 %rd2, %rd15, %rd16;
+ not.pred %p9, %p1;
+ @%p9 bra $L__BB0_6;
+
+ ld.shared.f32 %f17, [%rd2];
+ ld.shared.f32 %f18, [%rd1];
+ setp.nan.f32 %p10, %f18, %f18;
+ setp.gt.f32 %p11, %f18, %f17;
+ or.pred %p12, %p10, %p11;
+ selp.f32 %f19, %f18, %f17, %p12;
+ st.shared.f32 [%rd1], %f19;
+
+$L__BB0_6:
+ bar.sync 0;
+ shr.u32 %r65, %r8, 31;
+ add.s32 %r66, %r8, %r65;
+ shr.s32 %r78, %r66, 1;
+ setp.lt.s32 %p13, %r8, 4;
+ @%p13 bra $L__BB0_11;
+
+ mov.u32 %r77, %r78;
$L__BB0_8:
- shl.b32 %r71, %r1, 1;
- add.s32 %r72, %r4, %r71;
- mul.lo.s32 %r73, %r5, %r43;
- mad.lo.s32 %r74, %r72, %r44, %r73;
- mul.wide.s32 %rd15, %r74, 4;
- add.s64 %rd16, %rd1, %rd15;
- ld.global.f32 %f100, [%rd16];
- add.s32 %r75, %r74, %r44;
- mul.wide.s32 %rd17, %r75, 4;
- add.s64 %rd18, %rd1, %rd17;
- ld.global.f32 %f101, [%rd18];
- bra.uni $L__BB0_9;
-
-$L__BB0_4:
- mov.u32 %r63, %ntid.y;
- mov.u32 %r64, %ctaid.x;
- mov.u32 %r65, %tid.y;
- mad.lo.s32 %r66, %r63, %r64, %r65;
- setp.lt.s32 %p8, %r66, 27454;
- shl.b32 %r67, %r1, 1;
- add.s32 %r6, %r4, %r67;
- mul.lo.s32 %r7, %r66, %r43;
- and.pred %p1, %p5, %p8;
- mov.f32 %f101, 0fFF800000;
- not.pred %p9, %p1;
- mov.f32 %f100, %f101;
- @%p9 bra $L__BB0_6;
-
- mad.lo.s32 %r68, %r6, %r44, %r7;
- mul.wide.s32 %rd11, %r68, 4;
- add.s64 %rd12, %rd1, %rd11;
- ld.global.f32 %f100, [%rd12];
-
-$L__BB0_6:
- @%p9 bra $L__BB0_9;
-
- add.s32 %r69, %r6, 1;
- mad.lo.s32 %r70, %r69, %r44, %r7;
- mul.wide.s32 %rd13, %r70, 4;
- add.s64 %rd14, %rd1, %rd13;
- ld.global.f32 %f101, [%rd14];
-
-$L__BB0_9:
- setp.gt.f32 %p11, %f100, %f101;
- setp.nan.f32 %p12, %f100, %f100;
- or.pred %p13, %p12, %p11;
- selp.f32 %f23, %f100, %f101, %p13;
- mov.u32 %r76, %tid.z;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %tid.y;
- mad.lo.s32 %r10, %r8, %r76, %r9;
- mov.u32 %r11, %ntid.x;
- mad.lo.s32 %r12, %r10, %r11, %r1;
- mul.wide.u32 %rd19, %r12, 4;
- mov.u64 %rd20, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd20, %rd19;
- st.shared.f32 [%rd2], %f23;
- bar.sync 0;
- clz.b32 %r77, %r11;
- mov.u32 %r78, 31;
- sub.s32 %r79, %r78, %r77;
- mov.u32 %r80, 1;
- shl.b32 %r13, %r80, %r79;
- setp.lt.u32 %p14, %r1, %r13;
- add.s32 %r81, %r13, %r1;
- setp.lt.u32 %p15, %r81, %r11;
- and.pred %p2, %p14, %p15;
- add.s32 %r82, %r12, %r13;
- mul.wide.s32 %rd21, %r82, 4;
- add.s64 %rd3, %rd20, %rd21;
- not.pred %p16, %p2;
- @%p16 bra $L__BB0_11;
+ setp.ge.u32 %p14, %r1, %r77;
+ @%p14 bra $L__BB0_10;
+
+ add.s32 %r67, %r77, %r7;
+ mul.wide.s32 %rd17, %r67, 4;
+ add.s64 %rd19, %rd15, %rd17;
+ ld.shared.f32 %f20, [%rd1];
+ setp.nan.f32 %p15, %f20, %f20;
+ ld.shared.f32 %f21, [%rd19];
+ setp.gt.f32 %p16, %f20, %f21;
+ or.pred %p17, %p15, %p16;
+ selp.f32 %f22, %f20, %f21, %p17;
+ st.shared.f32 [%rd1], %f22;
+
+$L__BB0_10:
+ bar.sync 0;
+ shr.u32 %r11, %r77, 1;
+ setp.gt.u32 %p18, %r77, 3;
+ mov.u32 %r77, %r11;
+ @%p18 bra $L__BB0_8;
+
+$L__BB0_11:
+ add.s32 %r68, %r7, 1;
+ mul.wide.u32 %rd20, %r68, 4;
+ add.s64 %rd3, %rd15, %rd20;
+ mov.f32 %f67, 0fFF800000;
+ @%p2 bra $L__BB0_14;
+
+ ld.shared.f32 %f67, [%rd1];
+ setp.lt.u32 %p20, %r6, 2;
+ @%p20 bra $L__BB0_14;
ld.shared.f32 %f24, [%rd3];
- ld.shared.f32 %f25, [%rd2];
- setp.nan.f32 %p17, %f25, %f25;
- setp.gt.f32 %p18, %f25, %f24;
- or.pred %p19, %p17, %p18;
- selp.f32 %f26, %f25, %f24, %p19;
- st.shared.f32 [%rd2], %f26;
-
-$L__BB0_11:
- bar.sync 0;
- shr.u32 %r83, %r13, 31;
- add.s32 %r84, %r13, %r83;
- shr.s32 %r111, %r84, 1;
- setp.lt.s32 %p20, %r13, 4;
- @%p20 bra $L__BB0_16;
-
- mov.u32 %r110, %r111;
-
-$L__BB0_13:
- setp.ge.u32 %p21, %r1, %r110;
- @%p21 bra $L__BB0_15;
-
- add.s32 %r85, %r110, %r12;
- mul.wide.s32 %rd22, %r85, 4;
- add.s64 %rd24, %rd20, %rd22;
- ld.shared.f32 %f27, [%rd2];
- setp.nan.f32 %p22, %f27, %f27;
- ld.shared.f32 %f28, [%rd24];
- setp.gt.f32 %p23, %f27, %f28;
- or.pred %p24, %p22, %p23;
- selp.f32 %f29, %f27, %f28, %p24;
- st.shared.f32 [%rd2], %f29;
+ setp.gt.f32 %p21, %f67, %f24;
+ setp.nan.f32 %p22, %f67, %f67;
+ or.pred %p23, %p22, %p21;
+ selp.f32 %f67, %f67, %f24, %p23;
+
+$L__BB0_14:
+ bar.sync 0;
+ mul.wide.s32 %rd22, %r5, 4;
+ add.s64 %rd4, %rd15, %rd22;
+ setp.eq.s32 %p24, %r1, 0;
+ @%p24 bra $L__BB0_15;
+ bra.uni $L__BB0_16;
$L__BB0_15:
- bar.sync 0;
- shr.u32 %r16, %r110, 1;
- setp.gt.u32 %p25, %r110, 3;
- mov.u32 %r110, %r16;
- @%p25 bra $L__BB0_13;
+ st.shared.f32 [%rd4], %f67;
$L__BB0_16:
- add.s32 %r86, %r12, 1;
- mul.wide.u32 %rd25, %r86, 4;
- add.s64 %rd4, %rd20, %rd25;
- mov.f32 %f102, 0fFF800000;
- @%p4 bra $L__BB0_19;
-
- ld.shared.f32 %f102, [%rd2];
- setp.lt.u32 %p27, %r11, 2;
- @%p27 bra $L__BB0_19;
-
- ld.shared.f32 %f31, [%rd4];
- setp.gt.f32 %p28, %f102, %f31;
- setp.nan.f32 %p29, %f102, %f102;
- or.pred %p30, %p29, %p28;
- selp.f32 %f102, %f102, %f31, %p30;
-
-$L__BB0_19:
- bar.sync 0;
- mul.wide.s32 %rd27, %r10, 4;
- add.s64 %rd5, %rd20, %rd27;
- setp.eq.s32 %p31, %r1, 0;
- @%p31 bra $L__BB0_20;
- bra.uni $L__BB0_21;
+ bar.sync 0;
+ ld.shared.f32 %f8, [%rd4];
+ bar.sync 0;
+ mov.f32 %f68, 0f00000000;
+ @%p3 bra $L__BB0_18;
+
+ sub.f32 %f26, %f65, %f8;
+ mov.f32 %f27, 0f3F000000;
+ mov.f32 %f28, 0f3BBB989D;
+ fma.rn.f32 %f29, %f26, %f28, %f27;
+ cvt.sat.f32.f32 %f30, %f29;
+ mov.f32 %f31, 0f4B400001;
+ mov.f32 %f32, 0f437C0000;
+ fma.rm.f32 %f33, %f30, %f32, %f31;
+ add.f32 %f34, %f33, 0fCB40007F;
+ neg.f32 %f35, %f34;
+ mov.f32 %f36, 0f3FB8AA3B;
+ fma.rn.f32 %f37, %f26, %f36, %f35;
+ mov.f32 %f38, 0f32A57060;
+ fma.rn.f32 %f39, %f26, %f38, %f37;
+ mov.b32 %r71, %f33;
+ shl.b32 %r72, %r71, 23;
+ mov.b32 %f40, %r72;
+ ex2.approx.ftz.f32 %f41, %f39;
+ mul.f32 %f42, %f41, %f40;
+ add.f32 %f43, %f42, 0f00000000;
+ mov.b32 %r69, %f42;
+ sub.f32 %f44, %f66, %f8;
+ fma.rn.f32 %f45, %f44, %f28, %f27;
+ cvt.sat.f32.f32 %f46, %f45;
+ fma.rm.f32 %f47, %f46, %f32, %f31;
+ add.f32 %f48, %f47, 0fCB40007F;
+ neg.f32 %f49, %f48;
+ fma.rn.f32 %f50, %f44, %f36, %f49;
+ fma.rn.f32 %f51, %f44, %f38, %f50;
+ mov.b32 %r73, %f47;
+ shl.b32 %r74, %r73, 23;
+ mov.b32 %f52, %r74;
+ ex2.approx.ftz.f32 %f53, %f51;
+ mul.f32 %f54, %f53, %f52;
+ add.f32 %f68, %f43, %f54;
+ mov.b32 %r70, %f54;
+ shl.b32 %r75, %r4, 1;
+ mul.wide.s32 %rd25, %r75, 4;
+ add.s64 %rd24, %rd7, %rd25;
+
+ st.global.cs.v2.s32 [%rd24], {%r69,%r70};
+
+
+$L__BB0_18:
+ st.shared.f32 [%rd1], %f68;
+ bar.sync 0;
+ @%p9 bra $L__BB0_20;
+
+ ld.shared.f32 %f55, [%rd2];
+ ld.shared.f32 %f56, [%rd1];
+ add.f32 %f57, %f55, %f56;
+ st.shared.f32 [%rd1], %f57;
$L__BB0_20:
- st.shared.f32 [%rd5], %f102;
+ bar.sync 0;
+ @%p13 bra $L__BB0_24;
$L__BB0_21:
- bar.sync 0;
- ld.shared.f32 %f11, [%rd5];
- bar.sync 0;
- @%p5 bra $L__BB0_22;
- bra.uni $L__BB0_23;
-
-$L__BB0_22:
- mov.u32 %r87, %ctaid.x;
- mad.lo.s32 %r17, %r8, %r87, %r9;
- setp.lt.s32 %p33, %r17, 27454;
- @%p33 bra $L__BB0_25;
- bra.uni $L__BB0_23;
-
-$L__BB0_25:
- sub.f32 %f60, %f100, %f11;
- mov.f32 %f61, 0f3F000000;
- mov.f32 %f62, 0f3BBB989D;
- fma.rn.f32 %f63, %f60, %f62, %f61;
- cvt.sat.f32.f32 %f64, %f63;
- mov.f32 %f65, 0f4B400001;
- mov.f32 %f66, 0f437C0000;
- fma.rm.f32 %f67, %f64, %f66, %f65;
- add.f32 %f68, %f67, 0fCB40007F;
- neg.f32 %f69, %f68;
- mov.f32 %f70, 0f3FB8AA3B;
- fma.rn.f32 %f71, %f60, %f70, %f69;
- mov.f32 %f72, 0f32A57060;
- fma.rn.f32 %f73, %f60, %f72, %f71;
- mov.b32 %r102, %f67;
- shl.b32 %r103, %r102, 23;
- mov.b32 %f74, %r103;
- ex2.approx.ftz.f32 %f75, %f73;
- mul.f32 %f76, %f75, %f74;
- add.f32 %f77, %f76, 0f00000000;
- mov.b32 %r100, %f76;
- sub.f32 %f78, %f101, %f11;
- fma.rn.f32 %f79, %f78, %f62, %f61;
- cvt.sat.f32.f32 %f80, %f79;
- fma.rm.f32 %f81, %f80, %f66, %f65;
- add.f32 %f82, %f81, 0fCB40007F;
- neg.f32 %f83, %f82;
- fma.rn.f32 %f84, %f78, %f70, %f83;
- fma.rn.f32 %f85, %f78, %f72, %f84;
- mov.b32 %r104, %f81;
- shl.b32 %r105, %r104, 23;
- mov.b32 %f86, %r105;
- ex2.approx.ftz.f32 %f87, %f85;
- mul.f32 %f88, %f87, %f86;
- add.f32 %f103, %f77, %f88;
- mov.b32 %r101, %f88;
- add.s32 %r106, %r17, %r1;
- shl.b32 %r107, %r106, 1;
- mul.wide.s32 %rd32, %r107, 4;
- add.s64 %rd31, %rd9, %rd32;
-
- st.global.cs.v2.s32 [%rd31], {%r100,%r101};
-
- bra.uni $L__BB0_26;
+ setp.ge.u32 %p28, %r1, %r78;
+ @%p28 bra $L__BB0_23;
+
+ add.s32 %r76, %r78, %r7;
+ mul.wide.s32 %rd26, %r76, 4;
+ add.s64 %rd28, %rd15, %rd26;
+ ld.shared.f32 %f58, [%rd1];
+ ld.shared.f32 %f59, [%rd28];
+ add.f32 %f60, %f59, %f58;
+ st.shared.f32 [%rd1], %f60;
$L__BB0_23:
- mov.u32 %r88, %ctaid.x;
- mad.lo.s32 %r89, %r8, %r88, %r9;
- setp.lt.s32 %p35, %r89, 27454;
- sub.f32 %f33, %f100, %f11;
- mov.f32 %f34, 0f3F000000;
- mov.f32 %f35, 0f3BBB989D;
- fma.rn.f32 %f36, %f33, %f35, %f34;
- cvt.sat.f32.f32 %f37, %f36;
- mov.f32 %f38, 0f4B400001;
- mov.f32 %f39, 0f437C0000;
- fma.rm.f32 %f40, %f37, %f39, %f38;
- add.f32 %f41, %f40, 0fCB40007F;
- neg.f32 %f42, %f41;
- mov.f32 %f43, 0f3FB8AA3B;
- fma.rn.f32 %f44, %f33, %f43, %f42;
- mov.f32 %f45, 0f32A57060;
- fma.rn.f32 %f46, %f33, %f45, %f44;
- mov.b32 %r90, %f40;
- shl.b32 %r91, %r90, 23;
- mov.b32 %f47, %r91;
- ex2.approx.ftz.f32 %f48, %f46;
- mul.f32 %f12, %f48, %f47;
- add.f32 %f49, %f12, 0f00000000;
- mov.f32 %f103, 0f00000000;
- and.pred %p3, %p5, %p35;
- sub.f32 %f50, %f101, %f11;
- fma.rn.f32 %f51, %f50, %f35, %f34;
- cvt.sat.f32.f32 %f52, %f51;
- fma.rm.f32 %f53, %f52, %f39, %f38;
- add.f32 %f54, %f53, 0fCB40007F;
- neg.f32 %f55, %f54;
- fma.rn.f32 %f56, %f50, %f43, %f55;
- fma.rn.f32 %f57, %f50, %f45, %f56;
- mov.b32 %r92, %f53;
- shl.b32 %r93, %r92, 23;
- mov.b32 %f58, %r93;
- ex2.approx.ftz.f32 %f59, %f57;
- mul.f32 %f13, %f59, %f58;
- add.f32 %f14, %f49, %f13;
- not.pred %p36, %p3;
- @%p36 bra $L__BB0_26;
-
- mov.b32 %r95, %f13;
- add.s32 %r96, %r1, %r9;
- mad.lo.s32 %r98, %r8, %r88, %r96;
- shl.b32 %r99, %r98, 1;
- mul.wide.s32 %rd30, %r99, 4;
- add.s64 %rd29, %rd9, %rd30;
- mov.b32 %r94, %f12;
-
- st.global.cs.v2.s32 [%rd29], {%r94,%r95};
-
- selp.f32 %f103, %f14, 0f00000000, %p3;
-
-$L__BB0_26:
- st.shared.f32 [%rd2], %f103;
- bar.sync 0;
- @%p16 bra $L__BB0_28;
-
- ld.shared.f32 %f89, [%rd3];
- ld.shared.f32 %f90, [%rd2];
- add.f32 %f91, %f89, %f90;
- st.shared.f32 [%rd2], %f91;
-
-$L__BB0_28:
- bar.sync 0;
- @%p20 bra $L__BB0_32;
+ bar.sync 0;
+ shr.u32 %r13, %r78, 1;
+ setp.gt.u32 %p29, %r78, 3;
+ mov.u32 %r78, %r13;
+ @%p29 bra $L__BB0_21;
+
+$L__BB0_24:
+ mov.f32 %f69, 0f00000000;
+ @%p2 bra $L__BB0_27;
+
+ ld.shared.f32 %f62, [%rd1];
+ add.f32 %f69, %f62, 0f00000000;
+ setp.lt.u32 %p31, %r6, 2;
+ @%p31 bra $L__BB0_27;
+
+ ld.shared.f32 %f63, [%rd3];
+ add.f32 %f69, %f69, %f63;
+
+$L__BB0_27:
+ bar.sync 0;
+ @%p3 bra $L__BB0_29;
+
+ rcp.rn.f32 %f64, %f69;
+ cvta.to.global.u64 %rd29, %rd6;
+ mul.wide.s32 %rd30, %r4, 4;
+ add.s64 %rd31, %rd29, %rd30;
+ st.global.f32 [%rd31], %f64;
$L__BB0_29:
- setp.ge.u32 %p39, %r1, %r111;
- @%p39 bra $L__BB0_31;
-
- add.s32 %r108, %r111, %r12;
- mul.wide.s32 %rd33, %r108, 4;
- add.s64 %rd35, %rd20, %rd33;
- ld.shared.f32 %f92, [%rd2];
- ld.shared.f32 %f93, [%rd35];
- add.f32 %f94, %f93, %f92;
- st.shared.f32 [%rd2], %f94;
-
-$L__BB0_31:
- bar.sync 0;
- shr.u32 %r19, %r111, 1;
- setp.gt.u32 %p40, %r111, 3;
- mov.u32 %r111, %r19;
- @%p40 bra $L__BB0_29;
-
-$L__BB0_32:
- mov.f32 %f104, 0f00000000;
- @%p4 bra $L__BB0_35;
-
- ld.shared.f32 %f96, [%rd2];
- add.f32 %f104, %f96, 0f00000000;
- setp.lt.u32 %p42, %r11, 2;
- @%p42 bra $L__BB0_35;
-
- ld.shared.f32 %f97, [%rd4];
- add.f32 %f104, %f104, %f97;
-
-$L__BB0_35:
- bar.sync 0;
- mov.u32 %r109, %ctaid.x;
- mad.lo.s32 %r20, %r8, %r109, %r9;
- setp.gt.s32 %p43, %r20, 27453;
- @%p43 bra $L__BB0_37;
-
- rcp.rn.f32 %f98, %f104;
- cvta.to.global.u64 %rd36, %rd8;
- mul.wide.s32 %rd37, %r20, 4;
- add.s64 %rd38, %rd36, %rd37;
- st.global.f32 [%rd38], %f98;
-
-$L__BB0_37:
ret;
}
Kernel 187
CUDA
PTX
53997da5d
Diff
03a1b695e
-10
+10 index type: int
registers: 24
gmem: 3
static smem: 4
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 3, 3> T10) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T14;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0]
= T0[(((((2 * T0.alloc_stride[3LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T14[i0]
= T0[(((((2 * T0.alloc_stride[3LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T16;
T16[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T16[0] = fmax(
T16[0],
T14[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T16[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T5 = T14;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T14[i2]
- T3[0];
T5[i2]
= expf(T4[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T17;
T17[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T17[0]
= T17[0]
+ T5[i3];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T17[0]
= T17[0]
+ T5[i3];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T17[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
broadcast::blockBroadcast<true, false, false, true>(T7[0], T6[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T8;
T8[0]
= reciprocal(T7[0]);
if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
= T5[i4]
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
} else {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
= T5[i4]
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 3, 3> T10) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 2, 1> T14;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0]
= T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T14[i0]
= T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T16;
T16[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T16[0] = fmax(
T16[0],
T14[i1]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T2;
T2[0] = NEG_INFINITY;
blockReduce<true, false, false, true>(T2[0], T16[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, true, float(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T3;
broadcast::blockBroadcast<true, false, false, true>(T3[0], T2[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Alias Allocation - register
auto& T5 = T14;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
Array<float, 1, 1> T4;
T4[0]
= T14[i2]
- T3[0];
T5[i2]
= expf(T4[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T17;
T17[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T17[0]
= T17[0]
+ T5[i3];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T17[0]
= T17[0]
+ T5[i3];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T6[0], T17[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T7;
broadcast::blockBroadcast<true, false, false, true>(T7[0], T6[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T8;
T8[0]
= reciprocal(T7[0]);
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
= T5[i4]
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
} else {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
= T5[i4]
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -6,23 +6,23 @@
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0] = NEG_INFINITY;
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
T14[i0]
- = T0[(((((2 * T0.alloc_stride[3LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 2; ++i0) {
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T14[i0]
- = T0[(((((2 * T0.alloc_stride[3LL]) * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
+ = T0[(((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.y)) + ((((nvfuser_index_t)blockDim.y) * T0.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[3LL] * (i0 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T16;
@@ -51,22 +51,22 @@
= expf(T4[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T17;
T17[0] = 0.000000000e+00f;
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
T17[0]
= T17[0]
+ T5[i3];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 2; ++i3) {
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
T17[0]
= T17[0]
+ T5[i3];
}
}
@@ -78,11 +78,11 @@
Array<float, 1, 1> T7;
broadcast::blockBroadcast<true, false, false, true>(T7[0], T6[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T8;
T8[0]
= reciprocal(T7[0]);
- if (((((((nvfuser_index_t)threadIdx.x) * 2) + 1) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
T9[0]
@@ -90,11 +90,11 @@
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
} else {
Array<float, 2, 2> T15;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
Array<float, 1, 1> T9;
@@ -103,10 +103,10 @@
* T8[0];
T15[i4]
= T9[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
- if ((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < 2) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
- loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[(((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
+ loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T10[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))], &T15[0]);
}
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1[32]
)
{
.reg .pred %p<47>;
.reg .f32 %f<81>;
.reg .b32 %r<100>;
.reg .b64 %rd<35>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r39, %r40}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+24];
ld.param.v2.u32 {%r41, %r42}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+32];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0];
cvta.to.global.u64 %rd1, %rd7;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p3, %r1, 0;
@%p3 bra $L__BB0_2;
mov.u32 %r49, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r49;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd9, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r50, [%rd9], %r1;
ld.shared.u32 %r51, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_1911011nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s];
shl.b32 %r4, %r51, 1;
setp.lt.s32 %p4, %r1, 1;
@%p4 bra $L__BB0_3;
bra.uni $L__BB0_4;
$L__BB0_3:
mov.u32 %r52, %tid.y;
mov.u32 %r53, %ctaid.x;
mov.u32 %r54, %ntid.y;
mad.lo.s32 %r5, %r54, %r53, %r52;
setp.lt.s32 %p5, %r5, 27454;
@%p5 bra $L__BB0_8;
bra.uni $L__BB0_4;
$L__BB0_8:
shl.b32 %r63, %r1, 1;
add.s32 %r64, %r4, %r63;
mul.lo.s32 %r65, %r5, %r40;
mad.lo.s32 %r66, %r64, %r42, %r65;
mul.wide.s32 %rd14, %r66, 4;
add.s64 %rd15, %rd1, %rd14;
ld.global.f32 %f76, [%rd15];
add.s32 %r67, %r66, %r42;
mul.wide.s32 %rd16, %r67, 4;
add.s64 %rd17, %rd1, %rd16;
ld.global.f32 %f77, [%rd17];
bra.uni $L__BB0_9;
$L__BB0_4:
mov.u32 %r55, %ntid.y;
mov.u32 %r56, %ctaid.x;
mov.u32 %r57, %tid.y;
mad.lo.s32 %r58, %r55, %r56, %r57;
setp.lt.s32 %p7, %r58, 27454;
shl.b32 %r59, %r1, 1;
add.s32 %r6, %r4, %r59;
mul.lo.s32 %r7, %r58, %r40;
and.pred %p1, %p4, %p7;
mov.f32 %f77, 0fFF800000;
not.pred %p8, %p1;
mov.f32 %f76, %f77;
@%p8 bra $L__BB0_6;
mad.lo.s32 %r60, %r6, %r42, %r7;
mul.wide.s32 %rd10, %r60, 4;
add.s64 %rd11, %rd1, %rd10;
ld.global.f32 %f76, [%rd11];
$L__BB0_6:
@%p8 bra $L__BB0_9;
add.s32 %r61, %r6, 1;
mad.lo.s32 %r62, %r61, %r42, %r7;
mul.wide.s32 %rd12, %r62, 4;
add.s64 %rd13, %rd1, %rd12;
ld.global.f32 %f77, [%rd13];
$L__BB0_9:
setp.gt.f32 %p10, %f76, %f77;
setp.nan.f32 %p11, %f76, %f76;
or.pred %p12, %p11, %p10;
selp.f32 %f24, %f76, %f77, %p12;
mov.u32 %r68, %tid.z;
mov.u32 %r8, %ntid.y;
mov.u32 %r9, %tid.y;
mad.lo.s32 %r10, %r8, %r68, %r9;
mov.u32 %r11, %ntid.x;
mad.lo.s32 %r12, %r10, %r11, %r1;
mul.wide.u32 %rd18, %r12, 4;
mov.u64 %rd19, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_24f8cd48_191105arrayE;
add.s64 %rd2, %rd19, %rd18;
st.shared.f32 [%rd2], %f24;
bar.sync 0;
clz.b32 %r69, %r11;
mov.u32 %r70, 31;
sub.s32 %r71, %r70, %r69;
mov.u32 %r72, 1;
shl.b32 %r13, %r72, %r71;
setp.lt.u32 %p13, %r1, %r13;
add.s32 %r73, %r13, %r1;
setp.lt.u32 %p14, %r73, %r11;
and.pred %p2, %p13, %p14;
add.s32 %r74, %r12, %r13;
mul.wide.s32 %rd20, %r74, 4;
add.s64 %rd3, %rd19, %rd20;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
ld.shared.f32 %f25, [%rd3];
ld.shared.f32 %f26, [%rd2];
setp.nan.f32 %p16, %f26, %f26;
setp.gt.f32 %p17, %f26, %f25;
or.pred %p18, %p16, %p17;
selp.f32 %f27, %f26, %f25, %p18;
st.shared.f32 [%rd2], %f27;
$L__BB0_11:
bar.sync 0;
shr.u32 %r75, %r13, 31;
add.s32 %r76, %r13, %r75;
shr.s32 %r99, %r76, 1;
setp.lt.s32 %p19, %r13, 4;
@%p19 bra $L__BB0_16;
mov.u32 %r98, %r99;
$L__BB0_13:
setp.ge.u32 %p20, %r1, %r98;
@%p20 bra $L__BB0_15;
add.s32 %r77, %r98, %r12;
mul.wide.s32 %rd21, %r77, 4;
add.s64 %rd23, %rd19, %rd21;
ld.shared.f32 %f28, [%rd2];
setp.nan.f32 %p21, %f28, %f28;
ld.shared.f32 %f29, [%rd23];
setp.gt.f32 %p22, %f28, %f29;
or.pred %p23, %p21, %p22;
selp.f32 %f30, %f28, %f29, %p23;
st.shared.f32 [%rd2], %f30;
$L__BB0_15:
bar.sync 0;
shr.u32 %r16, %r98, 1;
setp.gt.u32 %p24, %r98, 3;
mov.u32 %r98, %r16;
@%p24 bra $L__BB0_13;
$L__BB0_16:
add.s32 %r78, %r12, 1;
mul.wide.u32 %rd24, %r78, 4;
add.s64 %rd4, %rd19, %rd24;
mov.f32 %f78, 0fFF800000;
@%p3 bra $L__BB0_19;
ld.shared.f32 %f78, [%rd2];
setp.lt.u32 %p26, %r11, 2;
@%p26 bra $L__BB0_19;
ld.shared.f32 %f32, [%rd4];
setp.gt.f32 %p27, %f78, %f32;
setp.nan.f32 %p28, %f78, %f78;
or.pred %p29, %p28, %p27;
selp.f32 %f78, %f78, %f32, %p29;
$L__BB0_19:
bar.sync 0;
mul.wide.s32 %rd26, %r10, 4;
add.s64 %rd5, %rd19, %rd26;
setp.eq.s32 %p30, %r1, 0;
@%p30 bra $L__BB0_20;
bra.uni $L__BB0_21;
$L__BB0_20:
st.shared.f32 [%rd5], %f78;
$L__BB0_21:
bar.sync 0;
ld.shared.f32 %f33, [%rd5];
bar.sync 0;
sub.f32 %f34, %f76, %f33;
mov.f32 %f35, 0f3F000000;
mov.f32 %f36, 0f3BBB989D;
fma.rn.f32 %f37, %f34, %f36, %f35;
cvt.sat.f32.f32 %f38, %f37;
mov.f32 %f39, 0f4B400001;
mov.f32 %f40, 0f437C0000;
fma.rm.f32 %f41, %f38, %f40, %f39;
add.f32 %f42, %f41, 0fCB40007F;
neg.f32 %f43, %f42;
mov.f32 %f44, 0f3FB8AA3B;
fma.rn.f32 %f45, %f34, %f44, %f43;
mov.f32 %f46, 0f32A57060;
fma.rn.f32 %f47, %f34, %f46, %f45;
mov.b32 %r79, %f41;
shl.b32 %r80, %r79, 23;
mov.b32 %f48, %r80;
ex2.approx.ftz.f32 %f49, %f47;
mul.f32 %f11, %f49, %f48;
sub.f32 %f50, %f77, %f33;
fma.rn.f32 %f51, %f50, %f36, %f35;
cvt.sat.f32.f32 %f52, %f51;
fma.rm.f32 %f53, %f52, %f40, %f39;
add.f32 %f54, %f53, 0fCB40007F;
neg.f32 %f55, %f54;
fma.rn.f32 %f56, %f50, %f44, %f55;
fma.rn.f32 %f57, %f50, %f46, %f56;
mov.b32 %r81, %f53;
shl.b32 %r82, %r81, 23;
mov.b32 %f58, %r82;
ex2.approx.ftz.f32 %f59, %f57;
mul.f32 %f12, %f59, %f58;
@%p4 bra $L__BB0_22;
bra.uni $L__BB0_23;
$L__BB0_22:
mov.u32 %r83, %ctaid.x;
mad.lo.s32 %r84, %r8, %r83, %r9;
setp.lt.s32 %p32, %r84, 27454;
@%p32 bra $L__BB0_24;
bra.uni $L__BB0_23;
$L__BB0_24:
add.f32 %f62, %f11, 0f00000000;
add.f32 %f79, %f62, %f12;
bra.uni $L__BB0_25;
$L__BB0_23:
mov.u32 %r85, %ctaid.x;
mad.lo.s32 %r86, %r8, %r85, %r9;
setp.lt.s32 %p34, %r86, 27454;
and.pred %p35, %p4, %p34;
add.f32 %f60, %f11, 0f00000000;
add.f32 %f61, %f60, %f12;
selp.f32 %f79, %f61, 0f00000000, %p35;
$L__BB0_25:
st.shared.f32 [%rd2], %f79;
bar.sync 0;
@%p15 bra $L__BB0_27;
ld.shared.f32 %f63, [%rd3];
ld.shared.f32 %f64, [%rd2];
add.f32 %f65, %f63, %f64;
st.shared.f32 [%rd2], %f65;
$L__BB0_27:
bar.sync 0;
@%p19 bra $L__BB0_31;
$L__BB0_28:
setp.ge.u32 %p38, %r1, %r99;
@%p38 bra $L__BB0_30;
add.s32 %r87, %r99, %r12;
mul.wide.s32 %rd28, %r87, 4;
add.s64 %rd30, %rd19, %rd28;
ld.shared.f32 %f66, [%rd2];
ld.shared.f32 %f67, [%rd30];
add.f32 %f68, %f67, %f66;
st.shared.f32 [%rd2], %f68;
$L__BB0_30:
bar.sync 0;
shr.u32 %r18, %r99, 1;
setp.gt.u32 %p39, %r99, 3;
mov.u32 %r99, %r18;
@%p39 bra $L__BB0_28;
$L__BB0_31:
mov.f32 %f80, 0f00000000;
@%p3 bra $L__BB0_34;
ld.shared.f32 %f70, [%rd2];
add.f32 %f80, %f70, 0f00000000;
setp.lt.u32 %p41, %r11, 2;
@%p41 bra $L__BB0_34;
ld.shared.f32 %f71, [%rd4];
add.f32 %f80, %f80, %f71;
$L__BB0_34:
bar.sync 0;
@%p3 bra $L__BB0_36;
st.shared.f32 [%rd5], %f80;
$L__BB0_36:
setp.gt.s32 %p43, %r1, 0;
bar.sync 0;
ld.shared.f32 %f72, [%rd5];
bar.sync 0;
rcp.rn.f32 %f19, %f72;
@%p43 bra $L__BB0_38;
mov.u32 %r88, %ctaid.x;
mad.lo.s32 %r19, %r8, %r88, %r9;
setp.lt.s32 %p44, %r19, 27454;
@%p44 bra $L__BB0_41;
bra.uni $L__BB0_38;
$L__BB0_41:
mul.f32 %f73, %f19, %f11;
mov.b32 %r94, %f73;
mul.f32 %f74, %f19, %f12;
mov.b32 %r95, %f74;
add.s32 %r96, %r19, %r1;
shl.b32 %r97, %r96, 1;
mul.wide.s32 %rd34, %r97, 4;
add.s64 %rd33, %rd8, %rd34;
// begin inline asm
st.global.cs.v2.s32 [%rd33], {%r94,%r95};
// end inline asm
bra.uni $L__BB0_42;
$L__BB0_38:
mul.f32 %f20, %f19, %f11;
mul.f32 %f21, %f19, %f12;
@%p43 bra $L__BB0_42;
mov.u32 %r89, %ctaid.x;
mad.lo.s32 %r20, %r8, %r89, %r9;
setp.gt.s32 %p46, %r20, 27453;
@%p46 bra $L__BB0_42;
add.s32 %r92, %r20, %r1;
shl.b32 %r93, %r92, 1;
mul.wide.s32 %rd32, %r93, 4;
add.s64 %rd31, %rd8, %rd32;
mov.b32 %r90, %f20;
mov.b32 %r91, %f21;
// begin inline asm
st.global.cs.v2.s32 [%rd31], {%r90,%r91};
// end inline asm
$L__BB0_42:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1[32]
)
{
.reg .pred %p<34>;
.reg .f32 %f<71>;
.reg .b32 %r<71>;
.reg .b64 %rd<28>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r32, %r33}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+24];
ld.param.v2.u32 {%r34, %r35}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+32];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0];
mov.u32 %r1, %tid.x;
setp.ne.s32 %p2, %r1, 0;
@%p2 bra $L__BB0_2;
mov.u32 %r42, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r42;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd7, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r43, [%rd7], %r1;
mov.u32 %r44, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r44, %r3;
setp.gt.s32 %p3, %r4, 27453;
mov.f32 %f67, 0fFF800000;
mov.f32 %f68, %f67;
@%p3 bra $L__BB0_4;
cvta.to.global.u64 %rd8, %rd5;
ld.shared.u32 %r45, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_1601111nvfuser_192ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s];
shl.b32 %r46, %r45, 1;
mul.lo.s32 %r47, %r4, %r33;
mad.lo.s32 %r48, %r46, %r35, %r47;
mul.wide.s32 %rd9, %r48, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.f32 %f67, [%rd10];
add.s32 %r49, %r48, %r35;
mul.wide.s32 %rd11, %r49, 4;
add.s64 %rd12, %rd8, %rd11;
ld.global.f32 %f68, [%rd12];
$L__BB0_4:
setp.gt.f32 %p4, %f67, %f68;
setp.nan.f32 %p5, %f67, %f67;
or.pred %p6, %p5, %p4;
selp.f32 %f16, %f67, %f68, %p6;
mov.u32 %r50, %tid.z;
mad.lo.s32 %r5, %r2, %r50, %r3;
mov.u32 %r6, %ntid.x;
mad.lo.s32 %r7, %r5, %r6, %r1;
mul.wide.u32 %rd13, %r7, 4;
mov.u64 %rd14, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_192_cu_ceabe2e8_160115arrayE;
add.s64 %rd1, %rd14, %rd13;
st.shared.f32 [%rd1], %f16;
bar.sync 0;
clz.b32 %r51, %r6;
mov.u32 %r52, 31;
sub.s32 %r53, %r52, %r51;
mov.u32 %r54, 1;
shl.b32 %r8, %r54, %r53;
setp.lt.u32 %p7, %r1, %r8;
add.s32 %r55, %r8, %r1;
setp.lt.u32 %p8, %r55, %r6;
and.pred %p1, %p7, %p8;
add.s32 %r56, %r7, %r8;
mul.wide.s32 %rd15, %r56, 4;
add.s64 %rd2, %rd14, %rd15;
not.pred %p9, %p1;
@%p9 bra $L__BB0_6;
ld.shared.f32 %f17, [%rd2];
ld.shared.f32 %f18, [%rd1];
setp.nan.f32 %p10, %f18, %f18;
setp.gt.f32 %p11, %f18, %f17;
or.pred %p12, %p10, %p11;
selp.f32 %f19, %f18, %f17, %p12;
st.shared.f32 [%rd1], %f19;
$L__BB0_6:
bar.sync 0;
shr.u32 %r57, %r8, 31;
add.s32 %r58, %r8, %r57;
shr.s32 %r70, %r58, 1;
setp.lt.s32 %p13, %r8, 4;
@%p13 bra $L__BB0_11;
mov.u32 %r69, %r70;
$L__BB0_8:
setp.ge.u32 %p14, %r1, %r69;
@%p14 bra $L__BB0_10;
add.s32 %r59, %r69, %r7;
mul.wide.s32 %rd16, %r59, 4;
add.s64 %rd18, %rd14, %rd16;
ld.shared.f32 %f20, [%rd1];
setp.nan.f32 %p15, %f20, %f20;
ld.shared.f32 %f21, [%rd18];
setp.gt.f32 %p16, %f20, %f21;
or.pred %p17, %p15, %p16;
selp.f32 %f22, %f20, %f21, %p17;
st.shared.f32 [%rd1], %f22;
$L__BB0_10:
bar.sync 0;
shr.u32 %r11, %r69, 1;
setp.gt.u32 %p18, %r69, 3;
mov.u32 %r69, %r11;
@%p18 bra $L__BB0_8;
$L__BB0_11:
add.s32 %r60, %r7, 1;
mul.wide.u32 %rd19, %r60, 4;
add.s64 %rd3, %rd14, %rd19;
mov.f32 %f69, 0fFF800000;
@%p2 bra $L__BB0_14;
ld.shared.f32 %f69, [%rd1];
setp.lt.u32 %p20, %r6, 2;
@%p20 bra $L__BB0_14;
ld.shared.f32 %f24, [%rd3];
setp.gt.f32 %p21, %f69, %f24;
setp.nan.f32 %p22, %f69, %f69;
or.pred %p23, %p22, %p21;
selp.f32 %f69, %f69, %f24, %p23;
$L__BB0_14:
bar.sync 0;
mul.wide.s32 %rd21, %r5, 4;
add.s64 %rd4, %rd14, %rd21;
setp.eq.s32 %p24, %r1, 0;
@%p24 bra $L__BB0_15;
bra.uni $L__BB0_16;
$L__BB0_15:
st.shared.f32 [%rd4], %f69;
$L__BB0_16:
setp.lt.s32 %p25, %r4, 27454;
bar.sync 0;
ld.shared.f32 %f25, [%rd4];
bar.sync 0;
sub.f32 %f26, %f67, %f25;
mov.f32 %f27, 0f3F000000;
mov.f32 %f28, 0f3BBB989D;
fma.rn.f32 %f29, %f26, %f28, %f27;
cvt.sat.f32.f32 %f30, %f29;
mov.f32 %f31, 0f4B400001;
mov.f32 %f32, 0f437C0000;
fma.rm.f32 %f33, %f30, %f32, %f31;
add.f32 %f34, %f33, 0fCB40007F;
neg.f32 %f35, %f34;
mov.f32 %f36, 0f3FB8AA3B;
fma.rn.f32 %f37, %f26, %f36, %f35;
mov.f32 %f38, 0f32A57060;
fma.rn.f32 %f39, %f26, %f38, %f37;
mov.b32 %r61, %f33;
shl.b32 %r62, %r61, 23;
mov.b32 %f40, %r62;
ex2.approx.ftz.f32 %f41, %f39;
mul.f32 %f8, %f41, %f40;
sub.f32 %f42, %f68, %f25;
fma.rn.f32 %f43, %f42, %f28, %f27;
cvt.sat.f32.f32 %f44, %f43;
fma.rm.f32 %f45, %f44, %f32, %f31;
add.f32 %f46, %f45, 0fCB40007F;
neg.f32 %f47, %f46;
fma.rn.f32 %f48, %f42, %f36, %f47;
fma.rn.f32 %f49, %f42, %f38, %f48;
mov.b32 %r63, %f45;
shl.b32 %r64, %r63, 23;
mov.b32 %f50, %r64;
ex2.approx.ftz.f32 %f51, %f49;
mul.f32 %f9, %f51, %f50;
add.f32 %f52, %f8, 0f00000000;
add.f32 %f53, %f52, %f9;
selp.f32 %f54, %f53, 0f00000000, %p25;
st.shared.f32 [%rd1], %f54;
bar.sync 0;
@%p9 bra $L__BB0_18;
ld.shared.f32 %f55, [%rd2];
ld.shared.f32 %f56, [%rd1];
add.f32 %f57, %f55, %f56;
st.shared.f32 [%rd1], %f57;
$L__BB0_18:
bar.sync 0;
@%p13 bra $L__BB0_22;
$L__BB0_19:
setp.ge.u32 %p28, %r1, %r70;
@%p28 bra $L__BB0_21;
add.s32 %r65, %r70, %r7;
mul.wide.s32 %rd23, %r65, 4;
add.s64 %rd25, %rd14, %rd23;
ld.shared.f32 %f58, [%rd1];
ld.shared.f32 %f59, [%rd25];
add.f32 %f60, %f59, %f58;
st.shared.f32 [%rd1], %f60;
$L__BB0_21:
bar.sync 0;
shr.u32 %r13, %r70, 1;
setp.gt.u32 %p29, %r70, 3;
mov.u32 %r70, %r13;
@%p29 bra $L__BB0_19;
$L__BB0_22:
mov.f32 %f70, 0f00000000;
@%p2 bra $L__BB0_25;
ld.shared.f32 %f62, [%rd1];
add.f32 %f70, %f62, 0f00000000;
setp.lt.u32 %p31, %r6, 2;
@%p31 bra $L__BB0_25;
ld.shared.f32 %f63, [%rd3];
add.f32 %f70, %f70, %f63;
$L__BB0_25:
bar.sync 0;
@%p2 bra $L__BB0_27;
st.shared.f32 [%rd4], %f70;
$L__BB0_27:
bar.sync 0;
ld.shared.f32 %f13, [%rd4];
bar.sync 0;
@%p3 bra $L__BB0_29;
rcp.rn.f32 %f64, %f13;
mul.f32 %f65, %f64, %f8;
mov.b32 %r66, %f65;
mul.f32 %f66, %f64, %f9;
mov.b32 %r67, %f66;
shl.b32 %r68, %r4, 1;
mul.wide.s32 %rd27, %r68, 4;
add.s64 %rd26, %rd6, %rd27;
// begin inline asm
st.global.cs.v2.s32 [%rd26], {%r66,%r67};
// end inline asm
$L__BB0_29:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -19,350 +19,260 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0[40],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1[32]
)
{
- .reg .pred %p<47>;
- .reg .f32 %f<81>;
- .reg .b32 %r<100>;
- .reg .b64 %rd<35>;
+ .reg .pred %p<34>;
+ .reg .f32 %f<71>;
+ .reg .b32 %r<71>;
+ .reg .b64 %rd<28>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r39, %r40}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+24];
- ld.param.v2.u32 {%r41, %r42}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+32];
- ld.param.u64 %rd8, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1];
- ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0];
- cvta.to.global.u64 %rd1, %rd7;
+ ld.param.v2.u32 {%r32, %r33}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+24];
+ ld.param.v2.u32 {%r34, %r35}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0+32];
+ ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_1];
+ ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEE_param_0];
mov.u32 %r1, %tid.x;
- setp.ne.s32 %p3, %r1, 0;
- @%p3 bra $L__BB0_2;
-
- mov.u32 %r49, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r49;
+ setp.ne.s32 %p2, %r1, 0;
+ @%p2 bra $L__BB0_2;
+
+ mov.u32 %r42, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r42;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd9, _ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r50, [%rd9], %r1;
- ld.shared.u32 %r51, [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s];
- shl.b32 %r4, %r51, 1;
- setp.lt.s32 %p4, %r1, 1;
- @%p4 bra $L__BB0_3;
- bra.uni $L__BB0_4;
-
-$L__BB0_3:
- mov.u32 %r52, %tid.y;
- mov.u32 %r53, %ctaid.x;
- mov.u32 %r54, %ntid.y;
- mad.lo.s32 %r5, %r54, %r53, %r52;
- setp.lt.s32 %p5, %r5, 27454;
- @%p5 bra $L__BB0_8;
- bra.uni $L__BB0_4;
+ mov.u64 %rd7, _ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s;
+ atom.shared.min.s32 %r43, [%rd7], %r1;
+ mov.u32 %r44, %ctaid.x;
+ mov.u32 %r2, %ntid.y;
+ mov.u32 %r3, %tid.y;
+ mad.lo.s32 %r4, %r2, %r44, %r3;
+ setp.gt.s32 %p3, %r4, 27453;
+ mov.f32 %f67, 0fFF800000;
+ mov.f32 %f68, %f67;
+ @%p3 bra $L__BB0_4;
+
+ cvta.to.global.u64 %rd8, %rd5;
+ ld.shared.u32 %r45, [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEEE14nvfuser_zero_s];
+ shl.b32 %r46, %r45, 1;
+ mul.lo.s32 %r47, %r4, %r33;
+ mad.lo.s32 %r48, %r46, %r35, %r47;
+ mul.wide.s32 %rd9, %r48, 4;
+ add.s64 %rd10, %rd8, %rd9;
+ ld.global.f32 %f67, [%rd10];
+ add.s32 %r49, %r48, %r35;
+ mul.wide.s32 %rd11, %r49, 4;
+ add.s64 %rd12, %rd8, %rd11;
+ ld.global.f32 %f68, [%rd12];
+
+$L__BB0_4:
+ setp.gt.f32 %p4, %f67, %f68;
+ setp.nan.f32 %p5, %f67, %f67;
+ or.pred %p6, %p5, %p4;
+ selp.f32 %f16, %f67, %f68, %p6;
+ mov.u32 %r50, %tid.z;
+ mad.lo.s32 %r5, %r2, %r50, %r3;
+ mov.u32 %r6, %ntid.x;
+ mad.lo.s32 %r7, %r5, %r6, %r1;
+ mul.wide.u32 %rd13, %r7, 4;
+ mov.u64 %rd14, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd14, %rd13;
+ st.shared.f32 [%rd1], %f16;
+ bar.sync 0;
+ clz.b32 %r51, %r6;
+ mov.u32 %r52, 31;
+ sub.s32 %r53, %r52, %r51;
+ mov.u32 %r54, 1;
+ shl.b32 %r8, %r54, %r53;
+ setp.lt.u32 %p7, %r1, %r8;
+ add.s32 %r55, %r8, %r1;
+ setp.lt.u32 %p8, %r55, %r6;
+ and.pred %p1, %p7, %p8;
+ add.s32 %r56, %r7, %r8;
+ mul.wide.s32 %rd15, %r56, 4;
+ add.s64 %rd2, %rd14, %rd15;
+ not.pred %p9, %p1;
+ @%p9 bra $L__BB0_6;
+
+ ld.shared.f32 %f17, [%rd2];
+ ld.shared.f32 %f18, [%rd1];
+ setp.nan.f32 %p10, %f18, %f18;
+ setp.gt.f32 %p11, %f18, %f17;
+ or.pred %p12, %p10, %p11;
+ selp.f32 %f19, %f18, %f17, %p12;
+ st.shared.f32 [%rd1], %f19;
+
+$L__BB0_6:
+ bar.sync 0;
+ shr.u32 %r57, %r8, 31;
+ add.s32 %r58, %r8, %r57;
+ shr.s32 %r70, %r58, 1;
+ setp.lt.s32 %p13, %r8, 4;
+ @%p13 bra $L__BB0_11;
+
+ mov.u32 %r69, %r70;
$L__BB0_8:
- shl.b32 %r63, %r1, 1;
- add.s32 %r64, %r4, %r63;
- mul.lo.s32 %r65, %r5, %r40;
- mad.lo.s32 %r66, %r64, %r42, %r65;
- mul.wide.s32 %rd14, %r66, 4;
- add.s64 %rd15, %rd1, %rd14;
- ld.global.f32 %f76, [%rd15];
- add.s32 %r67, %r66, %r42;
- mul.wide.s32 %rd16, %r67, 4;
- add.s64 %rd17, %rd1, %rd16;
- ld.global.f32 %f77, [%rd17];
- bra.uni $L__BB0_9;
-
-$L__BB0_4:
- mov.u32 %r55, %ntid.y;
- mov.u32 %r56, %ctaid.x;
- mov.u32 %r57, %tid.y;
- mad.lo.s32 %r58, %r55, %r56, %r57;
- setp.lt.s32 %p7, %r58, 27454;
- shl.b32 %r59, %r1, 1;
- add.s32 %r6, %r4, %r59;
- mul.lo.s32 %r7, %r58, %r40;
- and.pred %p1, %p4, %p7;
- mov.f32 %f77, 0fFF800000;
- not.pred %p8, %p1;
- mov.f32 %f76, %f77;
- @%p8 bra $L__BB0_6;
-
- mad.lo.s32 %r60, %r6, %r42, %r7;
- mul.wide.s32 %rd10, %r60, 4;
- add.s64 %rd11, %rd1, %rd10;
- ld.global.f32 %f76, [%rd11];
-
-$L__BB0_6:
- @%p8 bra $L__BB0_9;
-
- add.s32 %r61, %r6, 1;
- mad.lo.s32 %r62, %r61, %r42, %r7;
- mul.wide.s32 %rd12, %r62, 4;
- add.s64 %rd13, %rd1, %rd12;
- ld.global.f32 %f77, [%rd13];
-
-$L__BB0_9:
- setp.gt.f32 %p10, %f76, %f77;
- setp.nan.f32 %p11, %f76, %f76;
- or.pred %p12, %p11, %p10;
- selp.f32 %f24, %f76, %f77, %p12;
- mov.u32 %r68, %tid.z;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %tid.y;
- mad.lo.s32 %r10, %r8, %r68, %r9;
- mov.u32 %r11, %ntid.x;
- mad.lo.s32 %r12, %r10, %r11, %r1;
- mul.wide.u32 %rd18, %r12, 4;
- mov.u64 %rd19, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd19, %rd18;
- st.shared.f32 [%rd2], %f24;
- bar.sync 0;
- clz.b32 %r69, %r11;
- mov.u32 %r70, 31;
- sub.s32 %r71, %r70, %r69;
- mov.u32 %r72, 1;
- shl.b32 %r13, %r72, %r71;
- setp.lt.u32 %p13, %r1, %r13;
- add.s32 %r73, %r13, %r1;
- setp.lt.u32 %p14, %r73, %r11;
- and.pred %p2, %p13, %p14;
- add.s32 %r74, %r12, %r13;
- mul.wide.s32 %rd20, %r74, 4;
- add.s64 %rd3, %rd19, %rd20;
- not.pred %p15, %p2;
- @%p15 bra $L__BB0_11;
-
- ld.shared.f32 %f25, [%rd3];
- ld.shared.f32 %f26, [%rd2];
- setp.nan.f32 %p16, %f26, %f26;
- setp.gt.f32 %p17, %f26, %f25;
- or.pred %p18, %p16, %p17;
- selp.f32 %f27, %f26, %f25, %p18;
- st.shared.f32 [%rd2], %f27;
+ setp.ge.u32 %p14, %r1, %r69;
+ @%p14 bra $L__BB0_10;
+
+ add.s32 %r59, %r69, %r7;
+ mul.wide.s32 %rd16, %r59, 4;
+ add.s64 %rd18, %rd14, %rd16;
+ ld.shared.f32 %f20, [%rd1];
+ setp.nan.f32 %p15, %f20, %f20;
+ ld.shared.f32 %f21, [%rd18];
+ setp.gt.f32 %p16, %f20, %f21;
+ or.pred %p17, %p15, %p16;
+ selp.f32 %f22, %f20, %f21, %p17;
+ st.shared.f32 [%rd1], %f22;
+
+$L__BB0_10:
+ bar.sync 0;
+ shr.u32 %r11, %r69, 1;
+ setp.gt.u32 %p18, %r69, 3;
+ mov.u32 %r69, %r11;
+ @%p18 bra $L__BB0_8;
$L__BB0_11:
- bar.sync 0;
- shr.u32 %r75, %r13, 31;
- add.s32 %r76, %r13, %r75;
- shr.s32 %r99, %r76, 1;
- setp.lt.s32 %p19, %r13, 4;
- @%p19 bra $L__BB0_16;
-
- mov.u32 %r98, %r99;
-
-$L__BB0_13:
- setp.ge.u32 %p20, %r1, %r98;
- @%p20 bra $L__BB0_15;
-
- add.s32 %r77, %r98, %r12;
- mul.wide.s32 %rd21, %r77, 4;
- add.s64 %rd23, %rd19, %rd21;
- ld.shared.f32 %f28, [%rd2];
- setp.nan.f32 %p21, %f28, %f28;
- ld.shared.f32 %f29, [%rd23];
- setp.gt.f32 %p22, %f28, %f29;
- or.pred %p23, %p21, %p22;
- selp.f32 %f30, %f28, %f29, %p23;
- st.shared.f32 [%rd2], %f30;
+ add.s32 %r60, %r7, 1;
+ mul.wide.u32 %rd19, %r60, 4;
+ add.s64 %rd3, %rd14, %rd19;
+ mov.f32 %f69, 0fFF800000;
+ @%p2 bra $L__BB0_14;
+
+ ld.shared.f32 %f69, [%rd1];
+ setp.lt.u32 %p20, %r6, 2;
+ @%p20 bra $L__BB0_14;
+
+ ld.shared.f32 %f24, [%rd3];
+ setp.gt.f32 %p21, %f69, %f24;
+ setp.nan.f32 %p22, %f69, %f69;
+ or.pred %p23, %p22, %p21;
+ selp.f32 %f69, %f69, %f24, %p23;
+
+$L__BB0_14:
+ bar.sync 0;
+ mul.wide.s32 %rd21, %r5, 4;
+ add.s64 %rd4, %rd14, %rd21;
+ setp.eq.s32 %p24, %r1, 0;
+ @%p24 bra $L__BB0_15;
+ bra.uni $L__BB0_16;
$L__BB0_15:
- bar.sync 0;
- shr.u32 %r16, %r98, 1;
- setp.gt.u32 %p24, %r98, 3;
- mov.u32 %r98, %r16;
- @%p24 bra $L__BB0_13;
+ st.shared.f32 [%rd4], %f69;
$L__BB0_16:
- add.s32 %r78, %r12, 1;
- mul.wide.u32 %rd24, %r78, 4;
- add.s64 %rd4, %rd19, %rd24;
- mov.f32 %f78, 0fFF800000;
- @%p3 bra $L__BB0_19;
-
- ld.shared.f32 %f78, [%rd2];
- setp.lt.u32 %p26, %r11, 2;
- @%p26 bra $L__BB0_19;
-
- ld.shared.f32 %f32, [%rd4];
- setp.gt.f32 %p27, %f78, %f32;
- setp.nan.f32 %p28, %f78, %f78;
- or.pred %p29, %p28, %p27;
- selp.f32 %f78, %f78, %f32, %p29;
+ setp.lt.s32 %p25, %r4, 27454;
+ bar.sync 0;
+ ld.shared.f32 %f25, [%rd4];
+ bar.sync 0;
+ sub.f32 %f26, %f67, %f25;
+ mov.f32 %f27, 0f3F000000;
+ mov.f32 %f28, 0f3BBB989D;
+ fma.rn.f32 %f29, %f26, %f28, %f27;
+ cvt.sat.f32.f32 %f30, %f29;
+ mov.f32 %f31, 0f4B400001;
+ mov.f32 %f32, 0f437C0000;
+ fma.rm.f32 %f33, %f30, %f32, %f31;
+ add.f32 %f34, %f33, 0fCB40007F;
+ neg.f32 %f35, %f34;
+ mov.f32 %f36, 0f3FB8AA3B;
+ fma.rn.f32 %f37, %f26, %f36, %f35;
+ mov.f32 %f38, 0f32A57060;
+ fma.rn.f32 %f39, %f26, %f38, %f37;
+ mov.b32 %r61, %f33;
+ shl.b32 %r62, %r61, 23;
+ mov.b32 %f40, %r62;
+ ex2.approx.ftz.f32 %f41, %f39;
+ mul.f32 %f8, %f41, %f40;
+ sub.f32 %f42, %f68, %f25;
+ fma.rn.f32 %f43, %f42, %f28, %f27;
+ cvt.sat.f32.f32 %f44, %f43;
+ fma.rm.f32 %f45, %f44, %f32, %f31;
+ add.f32 %f46, %f45, 0fCB40007F;
+ neg.f32 %f47, %f46;
+ fma.rn.f32 %f48, %f42, %f36, %f47;
+ fma.rn.f32 %f49, %f42, %f38, %f48;
+ mov.b32 %r63, %f45;
+ shl.b32 %r64, %r63, 23;
+ mov.b32 %f50, %r64;
+ ex2.approx.ftz.f32 %f51, %f49;
+ mul.f32 %f9, %f51, %f50;
+ add.f32 %f52, %f8, 0f00000000;
+ add.f32 %f53, %f52, %f9;
+ selp.f32 %f54, %f53, 0f00000000, %p25;
+ st.shared.f32 [%rd1], %f54;
+ bar.sync 0;
+ @%p9 bra $L__BB0_18;
+
+ ld.shared.f32 %f55, [%rd2];
+ ld.shared.f32 %f56, [%rd1];
+ add.f32 %f57, %f55, %f56;
+ st.shared.f32 [%rd1], %f57;
+
+$L__BB0_18:
+ bar.sync 0;
+ @%p13 bra $L__BB0_22;
$L__BB0_19:
- bar.sync 0;
- mul.wide.s32 %rd26, %r10, 4;
- add.s64 %rd5, %rd19, %rd26;
- setp.eq.s32 %p30, %r1, 0;
- @%p30 bra $L__BB0_20;
- bra.uni $L__BB0_21;
-
-$L__BB0_20:
- st.shared.f32 [%rd5], %f78;
+ setp.ge.u32 %p28, %r1, %r70;
+ @%p28 bra $L__BB0_21;
+
+ add.s32 %r65, %r70, %r7;
+ mul.wide.s32 %rd23, %r65, 4;
+ add.s64 %rd25, %rd14, %rd23;
+ ld.shared.f32 %f58, [%rd1];
+ ld.shared.f32 %f59, [%rd25];
+ add.f32 %f60, %f59, %f58;
+ st.shared.f32 [%rd1], %f60;
$L__BB0_21:
bar.sync 0;
- ld.shared.f32 %f33, [%rd5];
- bar.sync 0;
- sub.f32 %f34, %f76, %f33;
- mov.f32 %f35, 0f3F000000;
- mov.f32 %f36, 0f3BBB989D;
- fma.rn.f32 %f37, %f34, %f36, %f35;
- cvt.sat.f32.f32 %f38, %f37;
- mov.f32 %f39, 0f4B400001;
- mov.f32 %f40, 0f437C0000;
- fma.rm.f32 %f41, %f38, %f40, %f39;
- add.f32 %f42, %f41, 0fCB40007F;
- neg.f32 %f43, %f42;
- mov.f32 %f44, 0f3FB8AA3B;
- fma.rn.f32 %f45, %f34, %f44, %f43;
- mov.f32 %f46, 0f32A57060;
- fma.rn.f32 %f47, %f34, %f46, %f45;
- mov.b32 %r79, %f41;
- shl.b32 %r80, %r79, 23;
- mov.b32 %f48, %r80;
- ex2.approx.ftz.f32 %f49, %f47;
- mul.f32 %f11, %f49, %f48;
- sub.f32 %f50, %f77, %f33;
- fma.rn.f32 %f51, %f50, %f36, %f35;
- cvt.sat.f32.f32 %f52, %f51;
- fma.rm.f32 %f53, %f52, %f40, %f39;
- add.f32 %f54, %f53, 0fCB40007F;
- neg.f32 %f55, %f54;
- fma.rn.f32 %f56, %f50, %f44, %f55;
- fma.rn.f32 %f57, %f50, %f46, %f56;
- mov.b32 %r81, %f53;
- shl.b32 %r82, %r81, 23;
- mov.b32 %f58, %r82;
- ex2.approx.ftz.f32 %f59, %f57;
- mul.f32 %f12, %f59, %f58;
- @%p4 bra $L__BB0_22;
- bra.uni $L__BB0_23;
+ shr.u32 %r13, %r70, 1;
+ setp.gt.u32 %p29, %r70, 3;
+ mov.u32 %r70, %r13;
+ @%p29 bra $L__BB0_19;
$L__BB0_22:
- mov.u32 %r83, %ctaid.x;
- mad.lo.s32 %r84, %r8, %r83, %r9;
- setp.lt.s32 %p32, %r84, 27454;
- @%p32 bra $L__BB0_24;
- bra.uni $L__BB0_23;
-
-$L__BB0_24:
- add.f32 %f62, %f11, 0f00000000;
- add.f32 %f79, %f62, %f12;
- bra.uni $L__BB0_25;
-
-$L__BB0_23:
- mov.u32 %r85, %ctaid.x;
- mad.lo.s32 %r86, %r8, %r85, %r9;
- setp.lt.s32 %p34, %r86, 27454;
- and.pred %p35, %p4, %p34;
- add.f32 %f60, %f11, 0f00000000;
- add.f32 %f61, %f60, %f12;
- selp.f32 %f79, %f61, 0f00000000, %p35;
+ mov.f32 %f70, 0f00000000;
+ @%p2 bra $L__BB0_25;
+
+ ld.shared.f32 %f62, [%rd1];
+ add.f32 %f70, %f62, 0f00000000;
+ setp.lt.u32 %p31, %r6, 2;
+ @%p31 bra $L__BB0_25;
+
+ ld.shared.f32 %f63, [%rd3];
+ add.f32 %f70, %f70, %f63;
$L__BB0_25:
- st.shared.f32 [%rd2], %f79;
- bar.sync 0;
- @%p15 bra $L__BB0_27;
-
- ld.shared.f32 %f63, [%rd3];
- ld.shared.f32 %f64, [%rd2];
- add.f32 %f65, %f63, %f64;
- st.shared.f32 [%rd2], %f65;
+ bar.sync 0;
+ @%p2 bra $L__BB0_27;
+
+ st.shared.f32 [%rd4], %f70;
$L__BB0_27:
bar.sync 0;
- @%p19 bra $L__BB0_31;
-
-$L__BB0_28:
- setp.ge.u32 %p38, %r1, %r99;
- @%p38 bra $L__BB0_30;
-
- add.s32 %r87, %r99, %r12;
- mul.wide.s32 %rd28, %r87, 4;
- add.s64 %rd30, %rd19, %rd28;
- ld.shared.f32 %f66, [%rd2];
- ld.shared.f32 %f67, [%rd30];
- add.f32 %f68, %f67, %f66;
- st.shared.f32 [%rd2], %f68;
-
-$L__BB0_30:
- bar.sync 0;
- shr.u32 %r18, %r99, 1;
- setp.gt.u32 %p39, %r99, 3;
- mov.u32 %r99, %r18;
- @%p39 bra $L__BB0_28;
-
-$L__BB0_31:
- mov.f32 %f80, 0f00000000;
- @%p3 bra $L__BB0_34;
-
- ld.shared.f32 %f70, [%rd2];
- add.f32 %f80, %f70, 0f00000000;
- setp.lt.u32 %p41, %r11, 2;
- @%p41 bra $L__BB0_34;
-
- ld.shared.f32 %f71, [%rd4];
- add.f32 %f80, %f80, %f71;
-
-$L__BB0_34:
- bar.sync 0;
- @%p3 bra $L__BB0_36;
-
- st.shared.f32 [%rd5], %f80;
-
-$L__BB0_36:
- setp.gt.s32 %p43, %r1, 0;
- bar.sync 0;
- ld.shared.f32 %f72, [%rd5];
- bar.sync 0;
- rcp.rn.f32 %f19, %f72;
- @%p43 bra $L__BB0_38;
-
- mov.u32 %r88, %ctaid.x;
- mad.lo.s32 %r19, %r8, %r88, %r9;
- setp.lt.s32 %p44, %r19, 27454;
- @%p44 bra $L__BB0_41;
- bra.uni $L__BB0_38;
-
-$L__BB0_41:
- mul.f32 %f73, %f19, %f11;
- mov.b32 %r94, %f73;
- mul.f32 %f74, %f19, %f12;
- mov.b32 %r95, %f74;
- add.s32 %r96, %r19, %r1;
- shl.b32 %r97, %r96, 1;
- mul.wide.s32 %rd34, %r97, 4;
- add.s64 %rd33, %rd8, %rd34;
+ ld.shared.f32 %f13, [%rd4];
+ bar.sync 0;
+ @%p3 bra $L__BB0_29;
+
+ rcp.rn.f32 %f64, %f13;
+ mul.f32 %f65, %f64, %f8;
+ mov.b32 %r66, %f65;
+ mul.f32 %f66, %f64, %f9;
+ mov.b32 %r67, %f66;
+ shl.b32 %r68, %r4, 1;
+ mul.wide.s32 %rd27, %r68, 4;
+ add.s64 %rd26, %rd6, %rd27;
- st.global.cs.v2.s32 [%rd33], {%r94,%r95};
+ st.global.cs.v2.s32 [%rd26], {%r66,%r67};
- bra.uni $L__BB0_42;
-
-$L__BB0_38:
- mul.f32 %f20, %f19, %f11;
- mul.f32 %f21, %f19, %f12;
- @%p43 bra $L__BB0_42;
-
- mov.u32 %r89, %ctaid.x;
- mad.lo.s32 %r20, %r8, %r89, %r9;
- setp.gt.s32 %p46, %r20, 27453;
- @%p46 bra $L__BB0_42;
-
- add.s32 %r92, %r20, %r1;
- shl.b32 %r93, %r92, 1;
- mul.wide.s32 %rd32, %r93, 4;
- add.s64 %rd31, %rd8, %rd32;
- mov.b32 %r90, %f20;
- mov.b32 %r91, %f21;
-
- st.global.cs.v2.s32 [%rd31], {%r90,%r91};
-
-
-$L__BB0_42:
+
+$L__BB0_29:
ret;
}
2: GpuViewTest.FusionReshapeSplit
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-2
+2 index type: int
registers: 14
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 3, 3> T8) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (8 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))) {
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T1[((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (4 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))) + (((ceilDiv(T0.logical_size[0LL], 2)) * T1.alloc_stride[0LL]) * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (4 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T0[((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (4 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))) + ((T0.alloc_stride[0LL] * (ceilDiv(T0.logical_size[0LL], 2))) * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (4 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))))];
Array<float, 1, 1> T2;
T2[0]
= T9[0]
+ T10[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T5;
T5[0]
= erff(T4[0]);
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
+ T5[0];
Array<float, 1, 1> T7;
T7[0]
= (float) 5.00000000000000000e-01
* T6[0];
Array<float, 1, 1> T11;
T11[0]
= T3[0]
* T7[0];
T8[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T11[0];
}
}
__global__ void nvfuser_N(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 3, 3> T8) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (8 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))) {
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T1[((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T0[((T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T0.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T2;
T2[0]
= T9[0]
+ T10[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T5;
T5[0]
= erff(T4[0]);
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
+ T5[0];
Array<float, 1, 1> T7;
T7[0]
= (float) 5.00000000000000000e-01
* T6[0];
Array<float, 1, 1> T11;
T11[0]
= T3[0]
* T7[0];
T8[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T11[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,15 +1,15 @@
__global__ void nvfuser_N(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 3, 3> T8) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (8 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))) {
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
- = T1[((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (4 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))) + (((ceilDiv(T0.logical_size[0LL], 2)) * T1.alloc_stride[0LL]) * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (4 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))))];
+ = T1[((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
- = T0[((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (4 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))) + ((T0.alloc_stride[0LL] * (ceilDiv(T0.logical_size[0LL], 2))) * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (4 * (ceilDiv((ceilDiv(T0.logical_size[0LL], 2)), 4))))))];
+ = T0[((T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T0.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T2;
T2[0]
= T9[0]
+ T10[0];
Array<float, 1, 1> T3;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_1911011nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_1911011nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_1911011nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_1911011nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2[32]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<46>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r14, %r15}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_1911011nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0+8];
ld.param.v2.u32 {%r16, %r17}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_1911011nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1+8];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_1911011nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_1911011nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_af6ae103_1911011nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0];
mov.u32 %r24, %ctaid.x;
shl.b32 %r25, %r24, 7;
mov.u32 %r26, %tid.x;
add.s32 %r1, %r25, %r26;
add.s32 %r27, %r14, 1;
shr.u32 %r28, %r27, 31;
add.s32 %r29, %r27, %r28;
shr.s32 %r2, %r29, 1;
add.s32 %r30, %r2, 3;
shr.s32 %r31, %r30, 31;
shr.u32 %r32, %r31, 30;
add.s32 %r33, %r30, %r32;
shr.s32 %r3, %r33, 2;
shl.b32 %r34, %r3, 3;
setp.ge.s32 %p1, %r1, %r34;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
shl.b32 %r35, %r3, 2;
div.s32 %r36, %r1, %r35;
mul.lo.s32 %r37, %r36, %r35;
sub.s32 %r38, %r1, %r37;
mad.lo.s32 %r39, %r36, %r2, %r38;
mul.lo.s32 %r40, %r39, %r17;
mul.wide.s32 %rd6, %r40, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r41, %r39, %r15;
mul.wide.s32 %rd8, %r41, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r42, %f28;
mov.b32 %r43, %f2;
and.b32 %r44, %r43, -2147483648;
or.b32 %r45, %r44, %r42;
mov.b32 %f32, %r45;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_1601111nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_1601111nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_1601111nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_1601111nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2[32]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<45>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r12, %r13}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_1601111nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0+8];
ld.param.v2.u32 {%r14, %r15}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_1601111nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1+8];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_1601111nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_1601111nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_208_cu_eb7d9607_1601111nvfuser_208ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0];
mov.u32 %r22, %ctaid.x;
shl.b32 %r23, %r22, 7;
mov.u32 %r24, %tid.x;
add.s32 %r1, %r23, %r24;
add.s32 %r25, %r12, 1;
shr.u32 %r26, %r25, 31;
add.s32 %r27, %r25, %r26;
shr.s32 %r28, %r27, 1;
add.s32 %r29, %r28, 3;
shr.s32 %r30, %r29, 31;
shr.u32 %r31, %r30, 30;
add.s32 %r32, %r29, %r31;
shl.b32 %r33, %r32, 1;
and.b32 %r34, %r33, -8;
setp.ge.s32 %p1, %r1, %r34;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.lo.s32 %r35, %r1, %r15;
mul.wide.s32 %rd6, %r35, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r36, %r1, %r13;
mul.wide.s32 %rd8, %r36, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r37, %f28;
mov.b32 %r38, %f2;
and.b32 %r39, %r38, -2147483648;
or.b32 %r40, %r39, %r37;
mov.b32 %f32, %r40;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,48 +20,43 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2[32]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
- .reg .b32 %r<46>;
+ .reg .b32 %r<45>;
.reg .b64 %rd<13>;
- ld.param.v2.u32 {%r14, %r15}, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0+8];
- ld.param.v2.u32 {%r16, %r17}, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1+8];
+ ld.param.v2.u32 {%r12, %r13}, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0+8];
+ ld.param.v2.u32 {%r14, %r15}, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1+8];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0];
- mov.u32 %r24, %ctaid.x;
- shl.b32 %r25, %r24, 7;
- mov.u32 %r26, %tid.x;
- add.s32 %r1, %r25, %r26;
- add.s32 %r27, %r14, 1;
- shr.u32 %r28, %r27, 31;
- add.s32 %r29, %r27, %r28;
- shr.s32 %r2, %r29, 1;
- add.s32 %r30, %r2, 3;
- shr.s32 %r31, %r30, 31;
- shr.u32 %r32, %r31, 30;
- add.s32 %r33, %r30, %r32;
- shr.s32 %r3, %r33, 2;
- shl.b32 %r34, %r3, 3;
+ mov.u32 %r22, %ctaid.x;
+ shl.b32 %r23, %r22, 7;
+ mov.u32 %r24, %tid.x;
+ add.s32 %r1, %r23, %r24;
+ add.s32 %r25, %r12, 1;
+ shr.u32 %r26, %r25, 31;
+ add.s32 %r27, %r25, %r26;
+ shr.s32 %r28, %r27, 1;
+ add.s32 %r29, %r28, 3;
+ shr.s32 %r30, %r29, 31;
+ shr.u32 %r31, %r30, 30;
+ add.s32 %r32, %r29, %r31;
+ shl.b32 %r33, %r32, 1;
+ and.b32 %r34, %r33, -8;
setp.ge.s32 %p1, %r1, %r34;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
- shl.b32 %r35, %r3, 2;
- div.s32 %r36, %r1, %r35;
- mul.lo.s32 %r37, %r36, %r35;
- sub.s32 %r38, %r1, %r37;
- mad.lo.s32 %r39, %r36, %r2, %r38;
- mul.lo.s32 %r40, %r39, %r17;
- mul.wide.s32 %rd6, %r40, 4;
+ mul.lo.s32 %r35, %r1, %r15;
+ mul.wide.s32 %rd6, %r35, 4;
add.s64 %rd7, %rd4, %rd6;
- mul.lo.s32 %r41, %r39, %r15;
- mul.wide.s32 %rd8, %r41, 4;
+ mul.lo.s32 %r36, %r1, %r13;
+ mul.wide.s32 %rd8, %r36, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
@@ -89,15 +84,15 @@
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
- mov.b32 %r42, %f28;
- mov.b32 %r43, %f2;
- and.b32 %r44, %r43, -2147483648;
- or.b32 %r45, %r44, %r42;
- mov.b32 %f32, %r45;
+ mov.b32 %r37, %f28;
+ mov.b32 %r38, %f2;
+ and.b32 %r39, %r38, -2147483648;
+ or.b32 %r40, %r39, %r37;
+ mov.b32 %f32, %r40;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
Kernel 2
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 14
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 3, 3> T8) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 80)) {
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T1[((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T0[((T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T0.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T2;
T2[0]
= T9[0]
+ T10[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T5;
T5[0]
= erff(T4[0]);
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
+ T5[0];
Array<float, 1, 1> T7;
T7[0]
= (float) 5.00000000000000000e-01
* T6[0];
Array<float, 1, 1> T11;
T11[0]
= T3[0]
* T7[0];
T8[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T11[0];
}
}
__global__ void nvfuser_N(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 3, 3> T8) {
if ((((nvfuser_index_t)threadIdx.x) < 80)) {
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T1[(T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T0[(T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x))];
Array<float, 1, 1> T2;
T2[0]
= T9[0]
+ T10[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T5;
T5[0]
= erff(T4[0]);
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
+ T5[0];
Array<float, 1, 1> T7;
T7[0]
= (float) 5.00000000000000000e-01
* T6[0];
Array<float, 1, 1> T11;
T11[0]
= T3[0]
* T7[0];
T8[((nvfuser_index_t)threadIdx.x)]
= T11[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,15 +1,15 @@
__global__ void nvfuser_N(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 3, 3> T8) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 80)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 80)) {
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
- = T1[((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
+ = T1[(T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
- = T0[((T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T0.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
+ = T0[(T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x))];
Array<float, 1, 1> T2;
T2[0]
= T9[0]
+ T10[0];
Array<float, 1, 1> T3;
@@ -32,9 +32,9 @@
* T6[0];
Array<float, 1, 1> T11;
T11[0]
= T3[0]
* T7[0];
- T8[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T8[((nvfuser_index_t)threadIdx.x)]
= T11[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_1911011nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_1911011nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_1911011nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_1911011nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2[32]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<35>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r12, %r13}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_1911011nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0+8];
ld.param.v2.u32 {%r14, %r15}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_1911011nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1+8];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_1911011nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_1911011nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_af6ae103_1911011nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0];
mov.u32 %r22, %tid.x;
mov.u32 %r23, %ctaid.x;
shl.b32 %r24, %r23, 7;
add.s32 %r1, %r24, %r22;
setp.gt.s32 %p1, %r1, 79;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.lo.s32 %r25, %r1, %r15;
mul.wide.s32 %rd6, %r25, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r26, %r1, %r13;
mul.wide.s32 %rd8, %r26, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r27, %f28;
mov.b32 %r28, %f2;
and.b32 %r29, %r28, -2147483648;
or.b32 %r30, %r29, %r27;
mov.b32 %f32, %r30;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_1601111nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_1601111nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_1601111nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_1601111nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2[32]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<29>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r12, %r13}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_1601111nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0+8];
ld.param.v2.u32 {%r14, %r15}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_1601111nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1+8];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_1601111nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_1601111nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_209_cu_eb7d9607_1601111nvfuser_209ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0];
mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 79;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.lo.s32 %r22, %r15, %r1;
mul.wide.s32 %rd6, %r22, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r23, %r13, %r1;
mul.wide.s32 %rd8, %r23, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r24, %f28;
mov.b32 %r25, %f2;
and.b32 %r26, %r25, -2147483648;
or.b32 %r27, %r26, %r24;
mov.b32 %f32, %r27;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,33 +20,30 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2[32]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
- .reg .b32 %r<35>;
+ .reg .b32 %r<29>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r12, %r13}, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0+8];
ld.param.v2.u32 {%r14, %r15}, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1+8];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi3ELi3EEE_param_0];
- mov.u32 %r22, %tid.x;
- mov.u32 %r23, %ctaid.x;
- shl.b32 %r24, %r23, 7;
- add.s32 %r1, %r24, %r22;
+ mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 79;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
- mul.lo.s32 %r25, %r1, %r15;
- mul.wide.s32 %rd6, %r25, 4;
+ mul.lo.s32 %r22, %r15, %r1;
+ mul.wide.s32 %rd6, %r22, 4;
add.s64 %rd7, %rd4, %rd6;
- mul.lo.s32 %r26, %r1, %r13;
- mul.wide.s32 %rd8, %r26, 4;
+ mul.lo.s32 %r23, %r13, %r1;
+ mul.wide.s32 %rd8, %r23, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
@@ -74,15 +71,15 @@
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
- mov.b32 %r27, %f28;
- mov.b32 %r28, %f2;
- and.b32 %r29, %r28, -2147483648;
- or.b32 %r30, %r29, %r27;
- mov.b32 %f32, %r30;
+ mov.b32 %r24, %f28;
+ mov.b32 %r25, %f2;
+ and.b32 %r26, %r25, -2147483648;
+ or.b32 %r27, %r26, %r24;
+ mov.b32 %f32, %r27;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
3: GpuViewTest.FusionReshapeBroadcast
Kernel 2
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 14
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 2, 2> T8) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 80)) {
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T1[((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T0[((T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T0.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T2;
T2[0]
= T9[0]
+ T10[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T5;
T5[0]
= erff(T4[0]);
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
+ T5[0];
Array<float, 1, 1> T7;
T7[0]
= (float) 5.00000000000000000e-01
* T6[0];
Array<float, 1, 1> T11;
T11[0]
= T3[0]
* T7[0];
T8[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T11[0];
}
}
__global__ void nvfuser_N(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 2, 2> T8) {
if ((((nvfuser_index_t)threadIdx.x) < 80)) {
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T1[(T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T0[(T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x))];
Array<float, 1, 1> T2;
T2[0]
= T9[0]
+ T10[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T5;
T5[0]
= erff(T4[0]);
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
+ T5[0];
Array<float, 1, 1> T7;
T7[0]
= (float) 5.00000000000000000e-01
* T6[0];
Array<float, 1, 1> T11;
T11[0]
= T3[0]
* T7[0];
T8[((nvfuser_index_t)threadIdx.x)]
= T11[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,15 +1,15 @@
__global__ void nvfuser_N(Tensor<float, 1, 1> T0, Tensor<float, 1, 1> T1, Tensor<float, 2, 2> T8) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 80)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 80)) {
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
- = T1[((T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
+ = T1[(T1.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
- = T0[((T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x)) + ((128 * T0.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
+ = T0[(T0.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.x))];
Array<float, 1, 1> T2;
T2[0]
= T9[0]
+ T10[0];
Array<float, 1, 1> T3;
@@ -32,9 +32,9 @@
* T6[0];
Array<float, 1, 1> T11;
T11[0]
= T3[0]
* T7[0];
- T8[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T8[((nvfuser_index_t)threadIdx.x)]
= T11[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_1911011nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_1911011nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_1911011nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_1911011nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_2[24]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<31>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r10, %r11}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_1911011nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_0+8];
ld.param.v2.u32 {%r12, %r13}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_1911011nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_1+8];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_1911011nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_1911011nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_af6ae103_1911011nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_0];
mov.u32 %r18, %tid.x;
mov.u32 %r19, %ctaid.x;
shl.b32 %r20, %r19, 7;
add.s32 %r1, %r20, %r18;
setp.gt.s32 %p1, %r1, 79;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.lo.s32 %r21, %r1, %r13;
mul.wide.s32 %rd6, %r21, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r22, %r1, %r11;
mul.wide.s32 %rd8, %r22, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r23, %f28;
mov.b32 %r24, %f2;
and.b32 %r25, %r24, -2147483648;
or.b32 %r26, %r25, %r23;
mov.b32 %f32, %r26;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_1601111nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_1601111nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_1601111nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_1601111nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_2[24]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<25>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r10, %r11}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_1601111nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_0+8];
ld.param.v2.u32 {%r12, %r13}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_1601111nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_1+8];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_1601111nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_1601111nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_211_cu_eb7d9607_1601111nvfuser_211ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_0];
mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 79;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.lo.s32 %r18, %r13, %r1;
mul.wide.s32 %rd6, %r18, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r19, %r11, %r1;
mul.wide.s32 %rd8, %r19, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r20, %f28;
mov.b32 %r21, %f2;
and.b32 %r22, %r21, -2147483648;
or.b32 %r23, %r22, %r20;
mov.b32 %f32, %r23;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,33 +20,30 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_2[24]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
- .reg .b32 %r<31>;
+ .reg .b32 %r<25>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r10, %r11}, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_0+8];
ld.param.v2.u32 {%r12, %r13}, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_1+8];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi1ELi1EEES1_NS0_IfLi2ELi2EEE_param_0];
- mov.u32 %r18, %tid.x;
- mov.u32 %r19, %ctaid.x;
- shl.b32 %r20, %r19, 7;
- add.s32 %r1, %r20, %r18;
+ mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 79;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
- mul.lo.s32 %r21, %r1, %r13;
- mul.wide.s32 %rd6, %r21, 4;
+ mul.lo.s32 %r18, %r13, %r1;
+ mul.wide.s32 %rd6, %r18, 4;
add.s64 %rd7, %rd4, %rd6;
- mul.lo.s32 %r22, %r1, %r11;
- mul.wide.s32 %rd8, %r22, 4;
+ mul.lo.s32 %r19, %r11, %r1;
+ mul.wide.s32 %rd8, %r19, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
@@ -74,15 +71,15 @@
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
- mov.b32 %r23, %f28;
- mov.b32 %r24, %f2;
- and.b32 %r25, %r24, -2147483648;
- or.b32 %r26, %r25, %r23;
- mov.b32 %f32, %r26;
+ mov.b32 %r20, %f28;
+ mov.b32 %r21, %f2;
+ and.b32 %r22, %r21, -2147483648;
+ or.b32 %r23, %r22, %r20;
+ mov.b32 %f32, %r23;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
4: GpuViewTest.FusionReshapeAllShmoo
Kernel 20
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 18
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T9) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
+ T11[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T5;
T5[0]
= T4[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T6;
T6[0]
= erff(T5[0]);
Array<float, 1, 1> T7;
T7[0]
= (float) 1.00000000000000000e+00
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= (float) 5.00000000000000000e-01
* T7[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
* T8[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T9) {
if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
+ T11[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T5;
T5[0]
= T4[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T6;
T6[0]
= erff(T5[0]);
Array<float, 1, 1> T7;
T7[0]
= (float) 1.00000000000000000e+00
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= (float) 5.00000000000000000e-01
* T7[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
* T8[0];
T9[((nvfuser_index_t)threadIdx.x)]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,15 +1,15 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T9) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
- = T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T1[(((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
- = T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
+ T11[0];
Array<float, 1, 1> T3;
@@ -35,9 +35,9 @@
* T7[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
* T8[0];
- T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T9[((nvfuser_index_t)threadIdx.x)]
= T12[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<115>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_3a580feb_1911011nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
mov.u32 %r70, %ctaid.x;
shl.b32 %r71, %r70, 7;
mov.u32 %r72, %tid.x;
add.s32 %r9, %r71, %r72;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.hi.s32 %r73, %r9, -2004318071;
add.s32 %r74, %r73, %r9;
shr.u32 %r75, %r74, 31;
shr.s32 %r76, %r74, 5;
add.s32 %r77, %r76, %r75;
mul.hi.s32 %r78, %r9, 1717986919;
shr.u32 %r79, %r78, 31;
shr.s32 %r80, %r78, 3;
add.s32 %r81, %r80, %r79;
mul.hi.s32 %r82, %r81, 1431655766;
shr.u32 %r83, %r82, 31;
add.s32 %r84, %r82, %r83;
mul.lo.s32 %r85, %r84, 3;
sub.s32 %r86, %r81, %r85;
mul.lo.s32 %r87, %r53, %r86;
mul.lo.s32 %r88, %r81, 20;
sub.s32 %r89, %r9, %r88;
mul.hi.s32 %r90, %r89, 1717986919;
shr.u32 %r91, %r90, 31;
shr.s32 %r92, %r90, 2;
add.s32 %r93, %r92, %r91;
shl.b32 %r94, %r93, 1;
mul.lo.s32 %r95, %r93, 10;
sub.s32 %r96, %r89, %r95;
mul.hi.s32 %r97, %r96, 1717986919;
shr.u32 %r98, %r97, 31;
shr.s32 %r99, %r97, 1;
add.s32 %r100, %r99, %r98;
mul.lo.s32 %r101, %r100, 5;
sub.s32 %r102, %r96, %r101;
add.s32 %r103, %r94, %r100;
mad.lo.s32 %r104, %r52, %r77, %r87;
mad.lo.s32 %r105, %r55, %r102, %r104;
mad.lo.s32 %r106, %r103, %r54, %r105;
mul.wide.s32 %rd6, %r106, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r107, %r45, %r86;
mad.lo.s32 %r108, %r44, %r77, %r107;
mad.lo.s32 %r109, %r102, %r47, %r108;
mad.lo.s32 %r110, %r103, %r46, %r109;
mul.wide.s32 %rd8, %r110, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r111, %f28;
mov.b32 %r112, %f2;
and.b32 %r113, %r112, -2147483648;
or.b32 %r114, %r113, %r111;
mov.b32 %f32, %r114;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<112>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_233_cu_7e4f78ef_1601111nvfuser_233ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
mov.u32 %r9, %tid.x;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.hi.s32 %r70, %r9, -2004318071;
add.s32 %r71, %r70, %r9;
shr.u32 %r72, %r71, 31;
shr.s32 %r73, %r71, 5;
add.s32 %r74, %r73, %r72;
mul.hi.s32 %r75, %r9, 1717986919;
shr.u32 %r76, %r75, 31;
shr.s32 %r77, %r75, 3;
add.s32 %r78, %r77, %r76;
mul.hi.s32 %r79, %r78, 1431655766;
shr.u32 %r80, %r79, 31;
add.s32 %r81, %r79, %r80;
mul.lo.s32 %r82, %r81, 3;
sub.s32 %r83, %r78, %r82;
mul.lo.s32 %r84, %r53, %r83;
mul.lo.s32 %r85, %r78, 20;
sub.s32 %r86, %r9, %r85;
mul.hi.s32 %r87, %r86, 1717986919;
shr.u32 %r88, %r87, 31;
shr.s32 %r89, %r87, 2;
add.s32 %r90, %r89, %r88;
shl.b32 %r91, %r90, 1;
mul.lo.s32 %r92, %r90, 10;
sub.s32 %r93, %r86, %r92;
mul.hi.s32 %r94, %r93, 1717986919;
shr.u32 %r95, %r94, 31;
shr.s32 %r96, %r94, 1;
add.s32 %r97, %r96, %r95;
mul.lo.s32 %r98, %r97, 5;
sub.s32 %r99, %r93, %r98;
add.s32 %r100, %r91, %r97;
mad.lo.s32 %r101, %r52, %r74, %r84;
mad.lo.s32 %r102, %r55, %r99, %r101;
mad.lo.s32 %r103, %r100, %r54, %r102;
mul.wide.s32 %rd6, %r103, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r104, %r45, %r83;
mad.lo.s32 %r105, %r44, %r74, %r104;
mad.lo.s32 %r106, %r47, %r99, %r105;
mad.lo.s32 %r107, %r100, %r46, %r106;
mul.wide.s32 %rd8, %r107, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r108, %f28;
mov.b32 %r109, %f2;
and.b32 %r110, %r109, -2147483648;
or.b32 %r111, %r110, %r108;
mov.b32 %f32, %r111;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,71 +20,68 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
- .reg .b32 %r<115>;
+ .reg .b32 %r<112>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
- mov.u32 %r70, %ctaid.x;
- shl.b32 %r71, %r70, 7;
- mov.u32 %r72, %tid.x;
- add.s32 %r9, %r71, %r72;
+ mov.u32 %r9, %tid.x;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
- mul.hi.s32 %r73, %r9, -2004318071;
- add.s32 %r74, %r73, %r9;
- shr.u32 %r75, %r74, 31;
- shr.s32 %r76, %r74, 5;
- add.s32 %r77, %r76, %r75;
- mul.hi.s32 %r78, %r9, 1717986919;
- shr.u32 %r79, %r78, 31;
- shr.s32 %r80, %r78, 3;
- add.s32 %r81, %r80, %r79;
- mul.hi.s32 %r82, %r81, 1431655766;
- shr.u32 %r83, %r82, 31;
- add.s32 %r84, %r82, %r83;
- mul.lo.s32 %r85, %r84, 3;
- sub.s32 %r86, %r81, %r85;
- mul.lo.s32 %r87, %r53, %r86;
- mul.lo.s32 %r88, %r81, 20;
- sub.s32 %r89, %r9, %r88;
- mul.hi.s32 %r90, %r89, 1717986919;
- shr.u32 %r91, %r90, 31;
- shr.s32 %r92, %r90, 2;
- add.s32 %r93, %r92, %r91;
- shl.b32 %r94, %r93, 1;
- mul.lo.s32 %r95, %r93, 10;
- sub.s32 %r96, %r89, %r95;
- mul.hi.s32 %r97, %r96, 1717986919;
- shr.u32 %r98, %r97, 31;
- shr.s32 %r99, %r97, 1;
- add.s32 %r100, %r99, %r98;
- mul.lo.s32 %r101, %r100, 5;
- sub.s32 %r102, %r96, %r101;
- add.s32 %r103, %r94, %r100;
- mad.lo.s32 %r104, %r52, %r77, %r87;
- mad.lo.s32 %r105, %r55, %r102, %r104;
- mad.lo.s32 %r106, %r103, %r54, %r105;
- mul.wide.s32 %rd6, %r106, 4;
+ mul.hi.s32 %r70, %r9, -2004318071;
+ add.s32 %r71, %r70, %r9;
+ shr.u32 %r72, %r71, 31;
+ shr.s32 %r73, %r71, 5;
+ add.s32 %r74, %r73, %r72;
+ mul.hi.s32 %r75, %r9, 1717986919;
+ shr.u32 %r76, %r75, 31;
+ shr.s32 %r77, %r75, 3;
+ add.s32 %r78, %r77, %r76;
+ mul.hi.s32 %r79, %r78, 1431655766;
+ shr.u32 %r80, %r79, 31;
+ add.s32 %r81, %r79, %r80;
+ mul.lo.s32 %r82, %r81, 3;
+ sub.s32 %r83, %r78, %r82;
+ mul.lo.s32 %r84, %r53, %r83;
+ mul.lo.s32 %r85, %r78, 20;
+ sub.s32 %r86, %r9, %r85;
+ mul.hi.s32 %r87, %r86, 1717986919;
+ shr.u32 %r88, %r87, 31;
+ shr.s32 %r89, %r87, 2;
+ add.s32 %r90, %r89, %r88;
+ shl.b32 %r91, %r90, 1;
+ mul.lo.s32 %r92, %r90, 10;
+ sub.s32 %r93, %r86, %r92;
+ mul.hi.s32 %r94, %r93, 1717986919;
+ shr.u32 %r95, %r94, 31;
+ shr.s32 %r96, %r94, 1;
+ add.s32 %r97, %r96, %r95;
+ mul.lo.s32 %r98, %r97, 5;
+ sub.s32 %r99, %r93, %r98;
+ add.s32 %r100, %r91, %r97;
+ mad.lo.s32 %r101, %r52, %r74, %r84;
+ mad.lo.s32 %r102, %r55, %r99, %r101;
+ mad.lo.s32 %r103, %r100, %r54, %r102;
+ mul.wide.s32 %rd6, %r103, 4;
add.s64 %rd7, %rd4, %rd6;
- mul.lo.s32 %r107, %r45, %r86;
- mad.lo.s32 %r108, %r44, %r77, %r107;
- mad.lo.s32 %r109, %r102, %r47, %r108;
- mad.lo.s32 %r110, %r103, %r46, %r109;
- mul.wide.s32 %rd8, %r110, 4;
+ mul.lo.s32 %r104, %r45, %r83;
+ mad.lo.s32 %r105, %r44, %r74, %r104;
+ mad.lo.s32 %r106, %r47, %r99, %r105;
+ mad.lo.s32 %r107, %r100, %r46, %r106;
+ mul.wide.s32 %rd8, %r107, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
@@ -112,15 +109,15 @@
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
- mov.b32 %r111, %f28;
- mov.b32 %r112, %f2;
- and.b32 %r113, %r112, -2147483648;
- or.b32 %r114, %r113, %r111;
- mov.b32 %f32, %r114;
+ mov.b32 %r108, %f28;
+ mov.b32 %r109, %f2;
+ and.b32 %r110, %r109, -2147483648;
+ or.b32 %r111, %r110, %r108;
+ mov.b32 %f32, %r111;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
Kernel 60
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 18
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T9) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
+ T11[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T5;
T5[0]
= T4[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T6;
T6[0]
= erff(T5[0]);
Array<float, 1, 1> T7;
T7[0]
= (float) 1.00000000000000000e+00
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= (float) 5.00000000000000000e-01
* T7[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
* T8[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T9) {
if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
+ T11[0];
Array<float, 1, 1> T3;
T3[0]
= T2[0];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T5;
T5[0]
= T4[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T6;
T6[0]
= erff(T5[0]);
Array<float, 1, 1> T7;
T7[0]
= (float) 1.00000000000000000e+00
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= (float) 5.00000000000000000e-01
* T7[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
* T8[0];
T9[((nvfuser_index_t)threadIdx.x)]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,15 +1,15 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 7, 7> T9) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
- = T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T1[(((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T1.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
- = T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
+ T11[0];
Array<float, 1, 1> T3;
@@ -35,9 +35,9 @@
* T7[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
* T8[0];
- T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T9[((nvfuser_index_t)threadIdx.x)]
= T12[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<115>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_f4f0616e_1911011nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
mov.u32 %r70, %ctaid.x;
shl.b32 %r71, %r70, 7;
mov.u32 %r72, %tid.x;
add.s32 %r9, %r71, %r72;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.hi.s32 %r73, %r9, -2004318071;
add.s32 %r74, %r73, %r9;
shr.u32 %r75, %r74, 31;
shr.s32 %r76, %r74, 5;
add.s32 %r77, %r76, %r75;
mul.hi.s32 %r78, %r9, 1717986919;
shr.u32 %r79, %r78, 31;
shr.s32 %r80, %r78, 3;
add.s32 %r81, %r80, %r79;
mul.hi.s32 %r82, %r81, 1431655766;
shr.u32 %r83, %r82, 31;
add.s32 %r84, %r82, %r83;
mul.lo.s32 %r85, %r84, 3;
sub.s32 %r86, %r81, %r85;
mul.lo.s32 %r87, %r53, %r86;
mul.lo.s32 %r88, %r81, 20;
sub.s32 %r89, %r9, %r88;
mul.hi.s32 %r90, %r89, 1717986919;
shr.u32 %r91, %r90, 31;
shr.s32 %r92, %r90, 2;
add.s32 %r93, %r92, %r91;
shl.b32 %r94, %r93, 1;
mul.lo.s32 %r95, %r93, 10;
sub.s32 %r96, %r89, %r95;
mul.hi.s32 %r97, %r96, 1717986919;
shr.u32 %r98, %r97, 31;
shr.s32 %r99, %r97, 1;
add.s32 %r100, %r99, %r98;
mul.lo.s32 %r101, %r100, 5;
sub.s32 %r102, %r96, %r101;
add.s32 %r103, %r94, %r100;
mad.lo.s32 %r104, %r52, %r77, %r87;
mad.lo.s32 %r105, %r55, %r102, %r104;
mad.lo.s32 %r106, %r103, %r54, %r105;
mul.wide.s32 %rd6, %r106, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r107, %r45, %r86;
mad.lo.s32 %r108, %r44, %r77, %r107;
mad.lo.s32 %r109, %r102, %r47, %r108;
mad.lo.s32 %r110, %r103, %r46, %r109;
mul.wide.s32 %rd8, %r110, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r111, %f28;
mov.b32 %r112, %f2;
and.b32 %r113, %r112, -2147483648;
or.b32 %r114, %r113, %r111;
mov.b32 %f32, %r114;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
.reg .b32 %r<112>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_273_cu_1ea34ece_1601111nvfuser_273ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
mov.u32 %r9, %tid.x;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.hi.s32 %r70, %r9, -2004318071;
add.s32 %r71, %r70, %r9;
shr.u32 %r72, %r71, 31;
shr.s32 %r73, %r71, 5;
add.s32 %r74, %r73, %r72;
mul.hi.s32 %r75, %r9, 1717986919;
shr.u32 %r76, %r75, 31;
shr.s32 %r77, %r75, 3;
add.s32 %r78, %r77, %r76;
mul.hi.s32 %r79, %r78, 1431655766;
shr.u32 %r80, %r79, 31;
add.s32 %r81, %r79, %r80;
mul.lo.s32 %r82, %r81, 3;
sub.s32 %r83, %r78, %r82;
mul.lo.s32 %r84, %r53, %r83;
mul.lo.s32 %r85, %r78, 20;
sub.s32 %r86, %r9, %r85;
mul.hi.s32 %r87, %r86, 1717986919;
shr.u32 %r88, %r87, 31;
shr.s32 %r89, %r87, 2;
add.s32 %r90, %r89, %r88;
shl.b32 %r91, %r90, 1;
mul.lo.s32 %r92, %r90, 10;
sub.s32 %r93, %r86, %r92;
mul.hi.s32 %r94, %r93, 1717986919;
shr.u32 %r95, %r94, 31;
shr.s32 %r96, %r94, 1;
add.s32 %r97, %r96, %r95;
mul.lo.s32 %r98, %r97, 5;
sub.s32 %r99, %r93, %r98;
add.s32 %r100, %r91, %r97;
mad.lo.s32 %r101, %r52, %r74, %r84;
mad.lo.s32 %r102, %r55, %r99, %r101;
mad.lo.s32 %r103, %r100, %r54, %r102;
mul.wide.s32 %rd6, %r103, 4;
add.s64 %rd7, %rd4, %rd6;
mul.lo.s32 %r104, %r45, %r83;
mad.lo.s32 %r105, %r44, %r74, %r104;
mad.lo.s32 %r106, %r47, %r99, %r105;
mad.lo.s32 %r107, %r100, %r46, %r106;
mul.wide.s32 %rd8, %r107, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
abs.f32 %f8, %f2;
setp.ltu.f32 %p2, %f8, 0f3F8060FE;
setp.ge.f32 %p3, %f8, 0f3F8060FE;
mul.f32 %f9, %f2, %f2;
selp.f32 %f10, %f8, %f9, %p3;
selp.f32 %f11, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f12, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f13, %f11, %f10, %f12;
selp.f32 %f14, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f15, %f13, %f10, %f14;
selp.f32 %f16, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f17, %f15, %f10, %f16;
selp.f32 %f18, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f19, %f17, %f10, %f18;
selp.f32 %f20, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f21, %f19, %f10, %f20;
selp.f32 %f22, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f23, %f21, %f10, %f22;
neg.f32 %f24, %f8;
selp.f32 %f25, %f24, %f2, %p3;
fma.rn.f32 %f32, %f23, %f25, %f25;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
mov.b32 %r108, %f28;
mov.b32 %r109, %f2;
and.b32 %r110, %r109, -2147483648;
or.b32 %r111, %r110, %r108;
mov.b32 %f32, %r111;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f31;
$L__BB0_4:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,71 +20,68 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<33>;
- .reg .b32 %r<115>;
+ .reg .b32 %r<112>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r44, %r45}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+24];
ld.param.v2.u32 {%r46, %r47}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0+32];
ld.param.v2.u32 {%r52, %r53}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1+32];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi7ELi7EEE_param_0];
- mov.u32 %r70, %ctaid.x;
- shl.b32 %r71, %r70, 7;
- mov.u32 %r72, %tid.x;
- add.s32 %r9, %r71, %r72;
+ mov.u32 %r9, %tid.x;
setp.gt.s32 %p1, %r9, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
- mul.hi.s32 %r73, %r9, -2004318071;
- add.s32 %r74, %r73, %r9;
- shr.u32 %r75, %r74, 31;
- shr.s32 %r76, %r74, 5;
- add.s32 %r77, %r76, %r75;
- mul.hi.s32 %r78, %r9, 1717986919;
- shr.u32 %r79, %r78, 31;
- shr.s32 %r80, %r78, 3;
- add.s32 %r81, %r80, %r79;
- mul.hi.s32 %r82, %r81, 1431655766;
- shr.u32 %r83, %r82, 31;
- add.s32 %r84, %r82, %r83;
- mul.lo.s32 %r85, %r84, 3;
- sub.s32 %r86, %r81, %r85;
- mul.lo.s32 %r87, %r53, %r86;
- mul.lo.s32 %r88, %r81, 20;
- sub.s32 %r89, %r9, %r88;
- mul.hi.s32 %r90, %r89, 1717986919;
- shr.u32 %r91, %r90, 31;
- shr.s32 %r92, %r90, 2;
- add.s32 %r93, %r92, %r91;
- shl.b32 %r94, %r93, 1;
- mul.lo.s32 %r95, %r93, 10;
- sub.s32 %r96, %r89, %r95;
- mul.hi.s32 %r97, %r96, 1717986919;
- shr.u32 %r98, %r97, 31;
- shr.s32 %r99, %r97, 1;
- add.s32 %r100, %r99, %r98;
- mul.lo.s32 %r101, %r100, 5;
- sub.s32 %r102, %r96, %r101;
- add.s32 %r103, %r94, %r100;
- mad.lo.s32 %r104, %r52, %r77, %r87;
- mad.lo.s32 %r105, %r55, %r102, %r104;
- mad.lo.s32 %r106, %r103, %r54, %r105;
- mul.wide.s32 %rd6, %r106, 4;
+ mul.hi.s32 %r70, %r9, -2004318071;
+ add.s32 %r71, %r70, %r9;
+ shr.u32 %r72, %r71, 31;
+ shr.s32 %r73, %r71, 5;
+ add.s32 %r74, %r73, %r72;
+ mul.hi.s32 %r75, %r9, 1717986919;
+ shr.u32 %r76, %r75, 31;
+ shr.s32 %r77, %r75, 3;
+ add.s32 %r78, %r77, %r76;
+ mul.hi.s32 %r79, %r78, 1431655766;
+ shr.u32 %r80, %r79, 31;
+ add.s32 %r81, %r79, %r80;
+ mul.lo.s32 %r82, %r81, 3;
+ sub.s32 %r83, %r78, %r82;
+ mul.lo.s32 %r84, %r53, %r83;
+ mul.lo.s32 %r85, %r78, 20;
+ sub.s32 %r86, %r9, %r85;
+ mul.hi.s32 %r87, %r86, 1717986919;
+ shr.u32 %r88, %r87, 31;
+ shr.s32 %r89, %r87, 2;
+ add.s32 %r90, %r89, %r88;
+ shl.b32 %r91, %r90, 1;
+ mul.lo.s32 %r92, %r90, 10;
+ sub.s32 %r93, %r86, %r92;
+ mul.hi.s32 %r94, %r93, 1717986919;
+ shr.u32 %r95, %r94, 31;
+ shr.s32 %r96, %r94, 1;
+ add.s32 %r97, %r96, %r95;
+ mul.lo.s32 %r98, %r97, 5;
+ sub.s32 %r99, %r93, %r98;
+ add.s32 %r100, %r91, %r97;
+ mad.lo.s32 %r101, %r52, %r74, %r84;
+ mad.lo.s32 %r102, %r55, %r99, %r101;
+ mad.lo.s32 %r103, %r100, %r54, %r102;
+ mul.wide.s32 %rd6, %r103, 4;
add.s64 %rd7, %rd4, %rd6;
- mul.lo.s32 %r107, %r45, %r86;
- mad.lo.s32 %r108, %r44, %r77, %r107;
- mad.lo.s32 %r109, %r102, %r47, %r108;
- mad.lo.s32 %r110, %r103, %r46, %r109;
- mul.wide.s32 %rd8, %r110, 4;
+ mul.lo.s32 %r104, %r45, %r83;
+ mad.lo.s32 %r105, %r44, %r74, %r104;
+ mad.lo.s32 %r106, %r47, %r99, %r105;
+ mad.lo.s32 %r107, %r100, %r46, %r106;
+ mul.wide.s32 %rd8, %r107, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f6, [%rd9];
ld.global.f32 %f7, [%rd7];
add.f32 %f1, %f7, %f6;
mul.f32 %f2, %f1, 0f3F3504F3;
@@ -112,15 +109,15 @@
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f26, %f32;
mov.f32 %f27, 0f3F800000;
sub.f32 %f28, %f27, %f26;
- mov.b32 %r111, %f28;
- mov.b32 %r112, %f2;
- and.b32 %r113, %r112, -2147483648;
- or.b32 %r114, %r113, %r111;
- mov.b32 %f32, %r114;
+ mov.b32 %r108, %f28;
+ mov.b32 %r109, %f2;
+ and.b32 %r110, %r109, -2147483648;
+ or.b32 %r111, %r110, %r108;
+ mov.b32 %f32, %r111;
$L__BB0_3:
add.f32 %f29, %f32, 0f3F800000;
mul.f32 %f30, %f29, 0f3F000000;
mul.f32 %f31, %f1, %f30;
5: GpuViewTest.FusionReshapeStride
Kernel 19
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 32→ 30
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((((2 * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((((2 * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL]))) && (((T1.logical_size[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) / (T1.logical_size[6LL] * T0.logical_size[3LL]))) < T0.logical_size[2LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[4LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / 
(T0.logical_size[3LL] * T1.logical_size[6LL])))) + (T1.alloc_stride[5LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) / T1.logical_size[6LL]))) + (T1.alloc_stride[6LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) % T1.logical_size[6LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / T0.logical_size[1LL])) + (T0.alloc_stride[1LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % T0.logical_size[1LL]))) + ((T1.logical_size[4LL] * T0.alloc_stride[2LL]) * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])))) + (T0.alloc_stride[2LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) / (T1.logical_size[6LL] * T0.logical_size[3LL])))) + (T0.alloc_stride[3LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * 
T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (T1.logical_size[6LL] * T0.logical_size[3LL])) / T1.logical_size[6LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((((2 * T0.logical_size[3LL]) * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((((2 * T0.logical_size[3LL]) * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) && (((T1.logical_size[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (T0.logical_size[3LL] * T1.logical_size[6LL]))) < T0.logical_size[2LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[4LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / 
(T0.logical_size[3LL] * T1.logical_size[6LL])))) + (T1.alloc_stride[5LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) / T1.logical_size[6LL]))) + (T1.alloc_stride[6LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) % T1.logical_size[6LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / T0.logical_size[1LL])) + (T0.alloc_stride[1LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % T0.logical_size[1LL]))) + ((T1.logical_size[4LL] * T0.alloc_stride[2LL]) * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])))) + (T0.alloc_stride[2LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) / (T1.logical_size[6LL] * T0.logical_size[3LL])))) + (T0.alloc_stride[3LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * 
T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (T1.logical_size[6LL] * T0.logical_size[3LL])) / T1.logical_size[6LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,7 +1,7 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
- if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((((2 * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((((2 * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL]))) && (((T1.logical_size[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) / (T1.logical_size[6LL] * T0.logical_size[3LL]))) < T0.logical_size[2LL]))) {
+ if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((((2 * T0.logical_size[3LL]) * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((((2 * T0.logical_size[3LL]) * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) && (((T1.logical_size[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (T0.logical_size[3LL] * T1.logical_size[6LL]))) < T0.logical_size[2LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[4LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / 
(T0.logical_size[3LL] * T1.logical_size[6LL])))) + (T1.alloc_stride[5LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) / T1.logical_size[6LL]))) + (T1.alloc_stride[6LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) % T1.logical_size[6LL])))];
Array<float, 1, 1> T10;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1[64],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<146>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r65, %r66}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+8];
ld.param.v2.u32 {%r67, %r68}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+16];
ld.param.v2.u32 {%r69, %r70}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r71, %r72}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
ld.param.v2.u32 {%r73, %r74}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+8];
ld.param.v2.u32 {%r75, %r76}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+16];
ld.param.v2.u32 {%r77, %r78}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+24];
ld.param.v2.u32 {%r79, %r80}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+32];
ld.param.v2.u32 {%r81, %r82}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+40];
ld.param.v2.u32 {%r83, %r84}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r85, %r86}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_34a95f1a_1911011nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
shl.b32 %r6, %r68, 1;
mul.lo.s32 %r101, %r6, %r73;
mul.lo.s32 %r102, %r101, %r74;
mul.lo.s32 %r103, %r102, %r75;
mul.lo.s32 %r104, %r103, %r77;
mul.lo.s32 %r11, %r104, %r79;
add.s32 %r105, %r11, 127;
shr.s32 %r106, %r105, 31;
shr.u32 %r107, %r106, 25;
add.s32 %r108, %r105, %r107;
shr.s32 %r109, %r108, 7;
mov.u32 %r12, %ctaid.x;
setp.ge.s32 %p1, %r12, %r109;
@%p1 bra $L__BB0_6;
mov.u32 %r110, %tid.x;
shl.b32 %r111, %r12, 7;
add.s32 %r14, %r111, %r110;
setp.ge.s32 %p2, %r14, %r11;
@%p2 bra $L__BB0_6;
mul.lo.s32 %r112, %r6, %r74;
mul.lo.s32 %r113, %r112, %r75;
mul.lo.s32 %r114, %r113, %r77;
mul.lo.s32 %r19, %r114, %r79;
rem.s32 %r20, %r14, %r19;
mul.lo.s32 %r115, %r6, %r75;
mul.lo.s32 %r116, %r115, %r77;
mul.lo.s32 %r21, %r116, %r79;
rem.s32 %r22, %r20, %r21;
mul.lo.s32 %r117, %r6, %r77;
mul.lo.s32 %r23, %r117, %r79;
rem.s32 %r118, %r22, %r23;
mul.lo.s32 %r24, %r79, %r68;
mul.lo.s32 %r119, %r24, %r77;
div.s32 %r25, %r118, %r119;
mul.lo.s32 %r120, %r25, %r119;
sub.s32 %r26, %r118, %r120;
div.s32 %r27, %r26, %r24;
mad.lo.s32 %r28, %r25, %r77, %r27;
setp.ge.s32 %p3, %r28, %r67;
@%p3 bra $L__BB0_6;
div.s32 %r121, %r14, %r19;
mul.lo.s32 %r122, %r121, %r80;
div.s32 %r123, %r20, %r21;
mad.lo.s32 %r124, %r123, %r81, %r122;
div.s32 %r125, %r22, %r23;
mad.lo.s32 %r126, %r125, %r82, %r124;
mad.lo.s32 %r127, %r25, %r83, %r126;
mad.lo.s32 %r128, %r27, %r84, %r127;
rem.s32 %r129, %r26, %r24;
div.s32 %r130, %r129, %r79;
mad.lo.s32 %r131, %r130, %r85, %r128;
mul.lo.s32 %r132, %r130, %r79;
sub.s32 %r133, %r129, %r132;
mad.lo.s32 %r134, %r133, %r86, %r131;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r134, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
div.s32 %r135, %r123, %r66;
mul.lo.s32 %r136, %r135, %r69;
mul.lo.s32 %r137, %r135, %r66;
sub.s32 %r138, %r123, %r137;
mad.lo.s32 %r139, %r138, %r70, %r136;
mad.lo.s32 %r140, %r28, %r71, %r139;
mad.lo.s32 %r141, %r130, %r72, %r140;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r141, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r142, %f27;
mov.b32 %r143, %f3;
and.b32 %r144, %r143, -2147483648;
or.b32 %r145, %r144, %r142;
mov.b32 %f31, %r145;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r14, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1[64],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<156>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r65, %r66}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+8];
ld.param.v2.u32 {%r67, %r68}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+16];
ld.param.v2.u32 {%r69, %r70}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r71, %r72}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
ld.param.v2.u32 {%r73, %r74}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+8];
ld.param.v2.u32 {%r75, %r76}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+16];
ld.param.v2.u32 {%r77, %r78}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+24];
ld.param.v2.u32 {%r79, %r80}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+32];
ld.param.v2.u32 {%r81, %r82}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+40];
ld.param.v2.u32 {%r83, %r84}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r85, %r86}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_298_cu_ca61e1b0_1601111nvfuser_298ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
shl.b32 %r6, %r68, 1;
mul.lo.s32 %r101, %r6, %r73;
mul.lo.s32 %r102, %r101, %r74;
mul.lo.s32 %r103, %r102, %r75;
mul.lo.s32 %r104, %r103, %r77;
mul.lo.s32 %r11, %r104, %r79;
add.s32 %r105, %r11, 127;
shr.s32 %r106, %r105, 31;
shr.u32 %r107, %r106, 25;
add.s32 %r108, %r105, %r107;
shr.s32 %r109, %r108, 7;
mov.u32 %r12, %ctaid.x;
setp.ge.s32 %p1, %r12, %r109;
@%p1 bra $L__BB0_6;
mov.u32 %r110, %tid.x;
shl.b32 %r111, %r12, 7;
add.s32 %r14, %r111, %r110;
setp.ge.s32 %p2, %r14, %r11;
@%p2 bra $L__BB0_6;
mul.lo.s32 %r112, %r6, %r74;
mul.lo.s32 %r113, %r112, %r75;
mul.lo.s32 %r114, %r113, %r77;
mul.lo.s32 %r19, %r114, %r79;
rem.s32 %r20, %r14, %r19;
mul.lo.s32 %r115, %r6, %r75;
mul.lo.s32 %r116, %r115, %r77;
mul.lo.s32 %r21, %r116, %r79;
rem.s32 %r22, %r20, %r21;
mul.lo.s32 %r117, %r6, %r77;
mul.lo.s32 %r23, %r117, %r79;
rem.s32 %r24, %r22, %r23;
mul.lo.s32 %r118, %r77, %r68;
mul.lo.s32 %r119, %r118, %r79;
div.s32 %r25, %r24, %r119;
mul.lo.s32 %r120, %r25, %r119;
sub.s32 %r26, %r24, %r120;
mul.lo.s32 %r27, %r79, %r68;
div.s32 %r28, %r26, %r27;
mad.lo.s32 %r121, %r25, %r77, %r28;
setp.ge.s32 %p3, %r121, %r67;
@%p3 bra $L__BB0_6;
div.s32 %r122, %r14, %r19;
mul.lo.s32 %r123, %r122, %r80;
div.s32 %r124, %r20, %r21;
mad.lo.s32 %r125, %r124, %r81, %r123;
div.s32 %r126, %r22, %r23;
mad.lo.s32 %r127, %r126, %r82, %r125;
mad.lo.s32 %r128, %r25, %r83, %r127;
mad.lo.s32 %r129, %r28, %r84, %r128;
rem.s32 %r130, %r26, %r27;
div.s32 %r131, %r130, %r79;
mad.lo.s32 %r132, %r131, %r85, %r129;
mul.lo.s32 %r133, %r131, %r79;
sub.s32 %r134, %r130, %r133;
mad.lo.s32 %r135, %r134, %r86, %r132;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r135, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
div.s32 %r136, %r124, %r66;
mul.lo.s32 %r137, %r136, %r69;
mul.lo.s32 %r138, %r136, %r66;
sub.s32 %r139, %r124, %r138;
mul.lo.s32 %r140, %r27, %r77;
div.s32 %r141, %r24, %r140;
mul.lo.s32 %r142, %r141, %r140;
sub.s32 %r143, %r24, %r142;
div.s32 %r144, %r143, %r27;
mul.lo.s32 %r145, %r144, %r27;
sub.s32 %r146, %r143, %r145;
div.s32 %r147, %r146, %r79;
mad.lo.s32 %r148, %r141, %r77, %r144;
mad.lo.s32 %r149, %r139, %r70, %r137;
mad.lo.s32 %r150, %r148, %r71, %r149;
mad.lo.s32 %r151, %r147, %r72, %r150;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r151, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r152, %f27;
mov.b32 %r153, %f3;
and.b32 %r154, %r153, -2147483648;
or.b32 %r155, %r154, %r152;
mov.b32 %f31, %r155;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r14, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,11 +20,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
- .reg .b32 %r<146>;
+ .reg .b32 %r<156>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r65, %r66}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+8];
ld.param.v2.u32 {%r67, %r68}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+16];
@@ -70,48 +70,58 @@
mul.lo.s32 %r116, %r115, %r77;
mul.lo.s32 %r21, %r116, %r79;
rem.s32 %r22, %r20, %r21;
mul.lo.s32 %r117, %r6, %r77;
mul.lo.s32 %r23, %r117, %r79;
- rem.s32 %r118, %r22, %r23;
- mul.lo.s32 %r24, %r79, %r68;
- mul.lo.s32 %r119, %r24, %r77;
- div.s32 %r25, %r118, %r119;
+ rem.s32 %r24, %r22, %r23;
+ mul.lo.s32 %r118, %r77, %r68;
+ mul.lo.s32 %r119, %r118, %r79;
+ div.s32 %r25, %r24, %r119;
mul.lo.s32 %r120, %r25, %r119;
- sub.s32 %r26, %r118, %r120;
- div.s32 %r27, %r26, %r24;
- mad.lo.s32 %r28, %r25, %r77, %r27;
- setp.ge.s32 %p3, %r28, %r67;
+ sub.s32 %r26, %r24, %r120;
+ mul.lo.s32 %r27, %r79, %r68;
+ div.s32 %r28, %r26, %r27;
+ mad.lo.s32 %r121, %r25, %r77, %r28;
+ setp.ge.s32 %p3, %r121, %r67;
@%p3 bra $L__BB0_6;
- div.s32 %r121, %r14, %r19;
- mul.lo.s32 %r122, %r121, %r80;
- div.s32 %r123, %r20, %r21;
- mad.lo.s32 %r124, %r123, %r81, %r122;
- div.s32 %r125, %r22, %r23;
- mad.lo.s32 %r126, %r125, %r82, %r124;
- mad.lo.s32 %r127, %r25, %r83, %r126;
- mad.lo.s32 %r128, %r27, %r84, %r127;
- rem.s32 %r129, %r26, %r24;
- div.s32 %r130, %r129, %r79;
- mad.lo.s32 %r131, %r130, %r85, %r128;
- mul.lo.s32 %r132, %r130, %r79;
- sub.s32 %r133, %r129, %r132;
- mad.lo.s32 %r134, %r133, %r86, %r131;
+ div.s32 %r122, %r14, %r19;
+ mul.lo.s32 %r123, %r122, %r80;
+ div.s32 %r124, %r20, %r21;
+ mad.lo.s32 %r125, %r124, %r81, %r123;
+ div.s32 %r126, %r22, %r23;
+ mad.lo.s32 %r127, %r126, %r82, %r125;
+ mad.lo.s32 %r128, %r25, %r83, %r127;
+ mad.lo.s32 %r129, %r28, %r84, %r128;
+ rem.s32 %r130, %r26, %r27;
+ div.s32 %r131, %r130, %r79;
+ mad.lo.s32 %r132, %r131, %r85, %r129;
+ mul.lo.s32 %r133, %r131, %r79;
+ sub.s32 %r134, %r130, %r133;
+ mad.lo.s32 %r135, %r134, %r86, %r132;
cvta.to.global.u64 %rd4, %rd2;
- mul.wide.s32 %rd5, %r134, 4;
+ mul.wide.s32 %rd5, %r135, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
- div.s32 %r135, %r123, %r66;
- mul.lo.s32 %r136, %r135, %r69;
- mul.lo.s32 %r137, %r135, %r66;
- sub.s32 %r138, %r123, %r137;
- mad.lo.s32 %r139, %r138, %r70, %r136;
- mad.lo.s32 %r140, %r28, %r71, %r139;
- mad.lo.s32 %r141, %r130, %r72, %r140;
+ div.s32 %r136, %r124, %r66;
+ mul.lo.s32 %r137, %r136, %r69;
+ mul.lo.s32 %r138, %r136, %r66;
+ sub.s32 %r139, %r124, %r138;
+ mul.lo.s32 %r140, %r27, %r77;
+ div.s32 %r141, %r24, %r140;
+ mul.lo.s32 %r142, %r141, %r140;
+ sub.s32 %r143, %r24, %r142;
+ div.s32 %r144, %r143, %r27;
+ mul.lo.s32 %r145, %r144, %r27;
+ sub.s32 %r146, %r143, %r145;
+ div.s32 %r147, %r146, %r79;
+ mad.lo.s32 %r148, %r141, %r77, %r144;
+ mad.lo.s32 %r149, %r139, %r70, %r137;
+ mad.lo.s32 %r150, %r148, %r71, %r149;
+ mad.lo.s32 %r151, %r147, %r72, %r150;
cvta.to.global.u64 %rd7, %rd1;
- mul.wide.s32 %rd8, %r141, 4;
+ mul.wide.s32 %rd8, %r151, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
@@ -137,15 +147,15 @@
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
- mov.b32 %r142, %f27;
- mov.b32 %r143, %f3;
- and.b32 %r144, %r143, -2147483648;
- or.b32 %r145, %r144, %r142;
- mov.b32 %f31, %r145;
+ mov.b32 %r152, %f27;
+ mov.b32 %r153, %f3;
+ and.b32 %r154, %r153, -2147483648;
+ or.b32 %r155, %r154, %r152;
+ mov.b32 %f31, %r155;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
Kernel 20
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 18
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20)) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[4LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[5LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 20)) + (T1.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[4LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[5LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[((nvfuser_index_t)threadIdx.x)]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,15 +1,15 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
- = T1[((((T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20)) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[4LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[5LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T1[((((T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 20)) + (T1.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[4LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[5LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
- = T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
@@ -35,9 +35,9 @@
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
- T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T9[((nvfuser_index_t)threadIdx.x)]
= T12[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1[64],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<32>;
.reg .b32 %r<123>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
ld.param.v2.u32 {%r58, %r59}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+40];
ld.param.v2.u32 {%r60, %r61}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r62, %r63}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_34a95f1a_1911011nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
mov.u32 %r78, %ctaid.x;
shl.b32 %r79, %r78, 7;
mov.u32 %r80, %tid.x;
add.s32 %r5, %r79, %r80;
setp.gt.s32 %p1, %r5, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.hi.s32 %r81, %r5, 1717986919;
shr.u32 %r82, %r81, 31;
shr.s32 %r83, %r81, 3;
add.s32 %r84, %r83, %r82;
mul.lo.s32 %r85, %r84, 20;
sub.s32 %r86, %r5, %r85;
mul.hi.s32 %r87, %r86, 1717986919;
shr.u32 %r88, %r87, 31;
shr.s32 %r89, %r87, 2;
add.s32 %r90, %r89, %r88;
mul.lo.s32 %r91, %r60, %r90;
mad.lo.s32 %r92, %r58, %r84, %r91;
mul.lo.s32 %r93, %r90, 10;
sub.s32 %r94, %r86, %r93;
mul.hi.s32 %r95, %r94, 1717986919;
shr.u32 %r96, %r95, 31;
shr.s32 %r97, %r95, 1;
add.s32 %r98, %r97, %r96;
mad.lo.s32 %r99, %r61, %r98, %r92;
mul.lo.s32 %r100, %r98, 5;
sub.s32 %r101, %r94, %r100;
mad.lo.s32 %r102, %r62, %r101, %r99;
mul.wide.s32 %rd6, %r102, 4;
add.s64 %rd7, %rd4, %rd6;
ld.global.f32 %f1, [%rd7];
mul.hi.s32 %r103, %r5, -2004318071;
add.s32 %r104, %r103, %r5;
shr.u32 %r105, %r104, 31;
shr.s32 %r106, %r104, 5;
add.s32 %r107, %r106, %r105;
mul.hi.s32 %r108, %r84, 1431655766;
shr.u32 %r109, %r108, 31;
add.s32 %r110, %r108, %r109;
mul.lo.s32 %r111, %r110, 3;
sub.s32 %r112, %r84, %r111;
mul.lo.s32 %r113, %r47, %r112;
shl.b32 %r114, %r90, 1;
add.s32 %r115, %r114, %r98;
mad.lo.s32 %r116, %r46, %r107, %r113;
mad.lo.s32 %r117, %r101, %r49, %r116;
mad.lo.s32 %r118, %r115, %r48, %r117;
mul.wide.s32 %rd8, %r118, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p2, %f7, 0f3F8060FE;
setp.ge.f32 %p3, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p3;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p3;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r119, %f27;
mov.b32 %r120, %f3;
and.b32 %r121, %r120, -2147483648;
or.b32 %r122, %r121, %r119;
mov.b32 %f31, %r122;
$L__BB0_3:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r5, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_4:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1[64],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<32>;
.reg .b32 %r<120>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
ld.param.v2.u32 {%r58, %r59}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+40];
ld.param.v2.u32 {%r60, %r61}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r62, %r63}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_299_cu_ca61e1b0_1601111nvfuser_299ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
mov.u32 %r5, %tid.x;
setp.gt.s32 %p1, %r5, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.hi.s32 %r78, %r5, 1717986919;
shr.u32 %r79, %r78, 31;
shr.s32 %r80, %r78, 3;
add.s32 %r81, %r80, %r79;
mul.lo.s32 %r82, %r81, 20;
sub.s32 %r83, %r5, %r82;
mul.hi.s32 %r84, %r83, 1717986919;
shr.u32 %r85, %r84, 31;
shr.s32 %r86, %r84, 2;
add.s32 %r87, %r86, %r85;
mul.lo.s32 %r88, %r60, %r87;
mad.lo.s32 %r89, %r58, %r81, %r88;
mul.lo.s32 %r90, %r87, 10;
sub.s32 %r91, %r83, %r90;
mul.hi.s32 %r92, %r91, 1717986919;
shr.u32 %r93, %r92, 31;
shr.s32 %r94, %r92, 1;
add.s32 %r95, %r94, %r93;
mad.lo.s32 %r96, %r61, %r95, %r89;
mul.lo.s32 %r97, %r95, 5;
sub.s32 %r98, %r91, %r97;
mad.lo.s32 %r99, %r62, %r98, %r96;
mul.wide.s32 %rd6, %r99, 4;
add.s64 %rd7, %rd4, %rd6;
ld.global.f32 %f1, [%rd7];
mul.hi.s32 %r100, %r5, -2004318071;
add.s32 %r101, %r100, %r5;
shr.u32 %r102, %r101, 31;
shr.s32 %r103, %r101, 5;
add.s32 %r104, %r103, %r102;
mul.hi.s32 %r105, %r81, 1431655766;
shr.u32 %r106, %r105, 31;
add.s32 %r107, %r105, %r106;
mul.lo.s32 %r108, %r107, 3;
sub.s32 %r109, %r81, %r108;
mul.lo.s32 %r110, %r47, %r109;
shl.b32 %r111, %r87, 1;
add.s32 %r112, %r111, %r95;
mad.lo.s32 %r113, %r46, %r104, %r110;
mad.lo.s32 %r114, %r49, %r98, %r113;
mad.lo.s32 %r115, %r112, %r48, %r114;
mul.wide.s32 %rd8, %r115, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p2, %f7, 0f3F8060FE;
setp.ge.f32 %p3, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p3;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p3;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r116, %f27;
mov.b32 %r117, %f3;
and.b32 %r118, %r117, -2147483648;
or.b32 %r119, %r118, %r116;
mov.b32 %f31, %r119;
$L__BB0_3:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r5, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_4:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,11 +20,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<32>;
- .reg .b32 %r<123>;
+ .reg .b32 %r<120>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r46, %r47}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
@@ -32,61 +32,58 @@
ld.param.v2.u32 {%r60, %r61}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r62, %r63}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
- mov.u32 %r78, %ctaid.x;
- shl.b32 %r79, %r78, 7;
- mov.u32 %r80, %tid.x;
- add.s32 %r5, %r79, %r80;
+ mov.u32 %r5, %tid.x;
setp.gt.s32 %p1, %r5, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
- mul.hi.s32 %r81, %r5, 1717986919;
- shr.u32 %r82, %r81, 31;
- shr.s32 %r83, %r81, 3;
- add.s32 %r84, %r83, %r82;
- mul.lo.s32 %r85, %r84, 20;
- sub.s32 %r86, %r5, %r85;
- mul.hi.s32 %r87, %r86, 1717986919;
- shr.u32 %r88, %r87, 31;
- shr.s32 %r89, %r87, 2;
- add.s32 %r90, %r89, %r88;
- mul.lo.s32 %r91, %r60, %r90;
- mad.lo.s32 %r92, %r58, %r84, %r91;
- mul.lo.s32 %r93, %r90, 10;
- sub.s32 %r94, %r86, %r93;
- mul.hi.s32 %r95, %r94, 1717986919;
- shr.u32 %r96, %r95, 31;
- shr.s32 %r97, %r95, 1;
- add.s32 %r98, %r97, %r96;
- mad.lo.s32 %r99, %r61, %r98, %r92;
- mul.lo.s32 %r100, %r98, 5;
- sub.s32 %r101, %r94, %r100;
- mad.lo.s32 %r102, %r62, %r101, %r99;
- mul.wide.s32 %rd6, %r102, 4;
+ mul.hi.s32 %r78, %r5, 1717986919;
+ shr.u32 %r79, %r78, 31;
+ shr.s32 %r80, %r78, 3;
+ add.s32 %r81, %r80, %r79;
+ mul.lo.s32 %r82, %r81, 20;
+ sub.s32 %r83, %r5, %r82;
+ mul.hi.s32 %r84, %r83, 1717986919;
+ shr.u32 %r85, %r84, 31;
+ shr.s32 %r86, %r84, 2;
+ add.s32 %r87, %r86, %r85;
+ mul.lo.s32 %r88, %r60, %r87;
+ mad.lo.s32 %r89, %r58, %r81, %r88;
+ mul.lo.s32 %r90, %r87, 10;
+ sub.s32 %r91, %r83, %r90;
+ mul.hi.s32 %r92, %r91, 1717986919;
+ shr.u32 %r93, %r92, 31;
+ shr.s32 %r94, %r92, 1;
+ add.s32 %r95, %r94, %r93;
+ mad.lo.s32 %r96, %r61, %r95, %r89;
+ mul.lo.s32 %r97, %r95, 5;
+ sub.s32 %r98, %r91, %r97;
+ mad.lo.s32 %r99, %r62, %r98, %r96;
+ mul.wide.s32 %rd6, %r99, 4;
add.s64 %rd7, %rd4, %rd6;
ld.global.f32 %f1, [%rd7];
- mul.hi.s32 %r103, %r5, -2004318071;
- add.s32 %r104, %r103, %r5;
- shr.u32 %r105, %r104, 31;
- shr.s32 %r106, %r104, 5;
- add.s32 %r107, %r106, %r105;
- mul.hi.s32 %r108, %r84, 1431655766;
- shr.u32 %r109, %r108, 31;
- add.s32 %r110, %r108, %r109;
- mul.lo.s32 %r111, %r110, 3;
- sub.s32 %r112, %r84, %r111;
- mul.lo.s32 %r113, %r47, %r112;
- shl.b32 %r114, %r90, 1;
- add.s32 %r115, %r114, %r98;
- mad.lo.s32 %r116, %r46, %r107, %r113;
- mad.lo.s32 %r117, %r101, %r49, %r116;
- mad.lo.s32 %r118, %r115, %r48, %r117;
- mul.wide.s32 %rd8, %r118, 4;
+ mul.hi.s32 %r100, %r5, -2004318071;
+ add.s32 %r101, %r100, %r5;
+ shr.u32 %r102, %r101, 31;
+ shr.s32 %r103, %r101, 5;
+ add.s32 %r104, %r103, %r102;
+ mul.hi.s32 %r105, %r81, 1431655766;
+ shr.u32 %r106, %r105, 31;
+ add.s32 %r107, %r105, %r106;
+ mul.lo.s32 %r108, %r107, 3;
+ sub.s32 %r109, %r81, %r108;
+ mul.lo.s32 %r110, %r47, %r109;
+ shl.b32 %r111, %r87, 1;
+ add.s32 %r112, %r111, %r95;
+ mad.lo.s32 %r113, %r46, %r104, %r110;
+ mad.lo.s32 %r114, %r49, %r98, %r113;
+ mad.lo.s32 %r115, %r112, %r48, %r114;
+ mul.wide.s32 %rd8, %r115, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p2, %f7, 0f3F8060FE;
@@ -112,15 +109,15 @@
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
- mov.b32 %r119, %f27;
- mov.b32 %r120, %f3;
- and.b32 %r121, %r120, -2147483648;
- or.b32 %r122, %r121, %r119;
- mov.b32 %f31, %r122;
+ mov.b32 %r116, %f27;
+ mov.b32 %r117, %f3;
+ and.b32 %r118, %r117, -2147483648;
+ or.b32 %r119, %r118, %r116;
+ mov.b32 %f31, %r119;
$L__BB0_3:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
Kernel 21
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 28
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]) * T0.logical_size[0LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]) * T0.logical_size[0LL]))) && ((((ceilDiv(T0.logical_size[1LL], 11)) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL])) < T0.logical_size[1LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) / (T1.logical_size[3LL] * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((T0.alloc_stride[1LL] * (ceilDiv(T0.logical_size[1LL], 11))) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T0.alloc_stride[1LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL]))) + (T0.alloc_stride[2LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) % T0.logical_size[2LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((11 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((11 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) && ((((ceilDiv(T0.logical_size[1LL], 11)) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL])) < T0.logical_size[1LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) / (T1.logical_size[3LL] * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((T0.alloc_stride[1LL] * (ceilDiv(T0.logical_size[1LL], 11))) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T0.alloc_stride[1LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL]))) + (T0.alloc_stride[2LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) % T0.logical_size[2LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,7 +1,7 @@
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T9) {
- if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]) * T0.logical_size[0LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]) * T0.logical_size[0LL]))) && ((((ceilDiv(T0.logical_size[1LL], 11)) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL])) < T0.logical_size[1LL]))) {
+ if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((11 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((11 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) && ((((ceilDiv(T0.logical_size[1LL], 11)) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL])) < T0.logical_size[1LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) / (T1.logical_size[3LL] * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
Array<float, 1, 1> T10;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_2[48]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<109>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r47, %r48}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+8];
ld.param.v2.u32 {%r49, %r50}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+16];
ld.param.v2.u32 {%r51, %r52}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+24];
ld.param.v2.u32 {%r55, %r56}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+16];
ld.param.v2.u32 {%r57, %r58}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+24];
ld.param.v2.u32 {%r59, %r60}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+32];
ld.param.v2.u32 {%r61, %r62}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+40];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_bbf17009_1911011nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0];
mul.lo.s32 %r73, %r47, %r55;
mul.lo.s32 %r74, %r73, %r56;
mul.lo.s32 %r75, %r74, %r57;
mul.lo.s32 %r7, %r75, 11;
add.s32 %r76, %r7, 127;
shr.s32 %r77, %r76, 31;
shr.u32 %r78, %r77, 25;
add.s32 %r79, %r76, %r78;
shr.s32 %r80, %r79, 7;
mov.u32 %r8, %ctaid.x;
setp.ge.s32 %p1, %r8, %r80;
@%p1 bra $L__BB0_6;
mov.u32 %r81, %tid.x;
shl.b32 %r82, %r8, 7;
add.s32 %r10, %r82, %r81;
setp.ge.s32 %p2, %r10, %r7;
@%p2 bra $L__BB0_6;
add.s32 %r83, %r48, 10;
mul.hi.s32 %r84, %r83, 780903145;
shr.u32 %r85, %r84, 31;
shr.s32 %r86, %r84, 1;
add.s32 %r87, %r86, %r85;
mul.lo.s32 %r88, %r55, %r56;
mul.lo.s32 %r89, %r88, %r57;
mul.lo.s32 %r14, %r89, 11;
rem.s32 %r90, %r10, %r14;
div.s32 %r15, %r90, %r89;
mul.lo.s32 %r91, %r15, %r89;
sub.s32 %r16, %r90, %r91;
mul.lo.s32 %r17, %r56, %r57;
rem.s32 %r18, %r16, %r17;
rem.s32 %r19, %r18, %r57;
div.s32 %r92, %r19, %r49;
mad.lo.s32 %r20, %r15, %r87, %r92;
setp.ge.s32 %p3, %r20, %r48;
@%p3 bra $L__BB0_6;
div.s32 %r93, %r10, %r14;
mul.lo.s32 %r94, %r93, %r58;
mad.lo.s32 %r95, %r15, %r59, %r94;
div.s32 %r96, %r16, %r17;
mad.lo.s32 %r97, %r96, %r60, %r95;
div.s32 %r98, %r18, %r57;
mad.lo.s32 %r99, %r98, %r61, %r97;
mad.lo.s32 %r100, %r19, %r62, %r99;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r100, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
mul.lo.s32 %r101, %r93, %r50;
rem.s32 %r102, %r19, %r49;
mad.lo.s32 %r103, %r20, %r51, %r101;
mad.lo.s32 %r104, %r102, %r52, %r103;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r104, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r105, %f27;
mov.b32 %r106, %f3;
and.b32 %r107, %r106, -2147483648;
or.b32 %r108, %r107, %r105;
mov.b32 %f31, %r108;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r10, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_2[48]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<109>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r47, %r48}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+8];
ld.param.v2.u32 {%r49, %r50}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+16];
ld.param.v2.u32 {%r51, %r52}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+24];
ld.param.v2.u32 {%r55, %r56}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+16];
ld.param.v2.u32 {%r57, %r58}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+24];
ld.param.v2.u32 {%r59, %r60}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+32];
ld.param.v2.u32 {%r61, %r62}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+40];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_300_cu_ca61e1b0_1601111nvfuser_300ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0];
mul.lo.s32 %r73, %r47, %r55;
mul.lo.s32 %r74, %r73, %r56;
mul.lo.s32 %r75, %r74, %r57;
mul.lo.s32 %r7, %r75, 11;
add.s32 %r76, %r7, 127;
shr.s32 %r77, %r76, 31;
shr.u32 %r78, %r77, 25;
add.s32 %r79, %r76, %r78;
shr.s32 %r80, %r79, 7;
mov.u32 %r8, %ctaid.x;
setp.ge.s32 %p1, %r8, %r80;
@%p1 bra $L__BB0_6;
mov.u32 %r81, %tid.x;
shl.b32 %r82, %r8, 7;
add.s32 %r10, %r82, %r81;
setp.ge.s32 %p2, %r10, %r7;
@%p2 bra $L__BB0_6;
add.s32 %r83, %r48, 10;
mul.hi.s32 %r84, %r83, 780903145;
shr.u32 %r85, %r84, 31;
shr.s32 %r86, %r84, 1;
add.s32 %r87, %r86, %r85;
mul.lo.s32 %r88, %r55, %r56;
mul.lo.s32 %r89, %r88, %r57;
mul.lo.s32 %r14, %r89, 11;
rem.s32 %r90, %r10, %r14;
div.s32 %r15, %r90, %r89;
mul.lo.s32 %r91, %r15, %r89;
sub.s32 %r16, %r90, %r91;
mul.lo.s32 %r17, %r56, %r57;
rem.s32 %r18, %r16, %r17;
rem.s32 %r19, %r18, %r57;
div.s32 %r92, %r19, %r49;
mad.lo.s32 %r20, %r15, %r87, %r92;
setp.ge.s32 %p3, %r20, %r48;
@%p3 bra $L__BB0_6;
div.s32 %r93, %r10, %r14;
mul.lo.s32 %r94, %r93, %r58;
mad.lo.s32 %r95, %r15, %r59, %r94;
div.s32 %r96, %r16, %r17;
mad.lo.s32 %r97, %r96, %r60, %r95;
div.s32 %r98, %r18, %r57;
mad.lo.s32 %r99, %r98, %r61, %r97;
mad.lo.s32 %r100, %r19, %r62, %r99;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r100, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
mul.lo.s32 %r101, %r93, %r50;
rem.s32 %r102, %r19, %r49;
mad.lo.s32 %r103, %r20, %r51, %r101;
mad.lo.s32 %r104, %r102, %r52, %r103;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r104, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r105, %f27;
mov.b32 %r106, %f3;
and.b32 %r107, %r106, -2147483648;
or.b32 %r108, %r107, %r105;
mov.b32 %f31, %r108;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r10, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
Kernel 25
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 28
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]) * T0.logical_size[0LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]) * T0.logical_size[0LL]))) && ((((ceilDiv(T0.logical_size[1LL], 11)) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL])) < T0.logical_size[1LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) / (T1.logical_size[3LL] * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((T0.alloc_stride[1LL] * (ceilDiv(T0.logical_size[1LL], 11))) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T0.alloc_stride[1LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL]))) + (T0.alloc_stride[2LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) % T0.logical_size[2LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((11 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((11 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) && ((((ceilDiv(T0.logical_size[1LL], 11)) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL])) < T0.logical_size[1LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) / (T1.logical_size[3LL] * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((T0.alloc_stride[1LL] * (ceilDiv(T0.logical_size[1LL], 11))) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T0.alloc_stride[1LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL]))) + (T0.alloc_stride[2LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) % T0.logical_size[2LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,7 +1,7 @@
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 5, 5> T1, Tensor<float, 5, 5> T9) {
- if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]) * T0.logical_size[0LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]) * T0.logical_size[0LL]))) && ((((ceilDiv(T0.logical_size[1LL], 11)) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL])) < T0.logical_size[1LL]))) {
+ if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((11 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((11 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) && ((((ceilDiv(T0.logical_size[1LL], 11)) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL]) / T0.logical_size[2LL])) < T0.logical_size[1LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) / ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) / (T1.logical_size[3LL] * T1.logical_size[4LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) / T1.logical_size[4LL]))) + (T1.alloc_stride[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((11 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T1.logical_size[4LL])) % ((T1.logical_size[2LL] * T1.logical_size[3LL]) * T1.logical_size[4LL])) % (T1.logical_size[3LL] * T1.logical_size[4LL])) % T1.logical_size[4LL])))];
Array<float, 1, 1> T10;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_2[48]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<109>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r47, %r48}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+8];
ld.param.v2.u32 {%r49, %r50}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+16];
ld.param.v2.u32 {%r51, %r52}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+24];
ld.param.v2.u32 {%r55, %r56}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+16];
ld.param.v2.u32 {%r57, %r58}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+24];
ld.param.v2.u32 {%r59, %r60}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+32];
ld.param.v2.u32 {%r61, %r62}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+40];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_bbf17009_1911011nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0];
mul.lo.s32 %r73, %r47, %r55;
mul.lo.s32 %r74, %r73, %r56;
mul.lo.s32 %r75, %r74, %r57;
mul.lo.s32 %r7, %r75, 11;
add.s32 %r76, %r7, 127;
shr.s32 %r77, %r76, 31;
shr.u32 %r78, %r77, 25;
add.s32 %r79, %r76, %r78;
shr.s32 %r80, %r79, 7;
mov.u32 %r8, %ctaid.x;
setp.ge.s32 %p1, %r8, %r80;
@%p1 bra $L__BB0_6;
mov.u32 %r81, %tid.x;
shl.b32 %r82, %r8, 7;
add.s32 %r10, %r82, %r81;
setp.ge.s32 %p2, %r10, %r7;
@%p2 bra $L__BB0_6;
add.s32 %r83, %r48, 10;
mul.hi.s32 %r84, %r83, 780903145;
shr.u32 %r85, %r84, 31;
shr.s32 %r86, %r84, 1;
add.s32 %r87, %r86, %r85;
mul.lo.s32 %r88, %r55, %r56;
mul.lo.s32 %r89, %r88, %r57;
mul.lo.s32 %r14, %r89, 11;
rem.s32 %r90, %r10, %r14;
div.s32 %r15, %r90, %r89;
mul.lo.s32 %r91, %r15, %r89;
sub.s32 %r16, %r90, %r91;
mul.lo.s32 %r17, %r56, %r57;
rem.s32 %r18, %r16, %r17;
rem.s32 %r19, %r18, %r57;
div.s32 %r92, %r19, %r49;
mad.lo.s32 %r20, %r15, %r87, %r92;
setp.ge.s32 %p3, %r20, %r48;
@%p3 bra $L__BB0_6;
div.s32 %r93, %r10, %r14;
mul.lo.s32 %r94, %r93, %r58;
mad.lo.s32 %r95, %r15, %r59, %r94;
div.s32 %r96, %r16, %r17;
mad.lo.s32 %r97, %r96, %r60, %r95;
div.s32 %r98, %r18, %r57;
mad.lo.s32 %r99, %r98, %r61, %r97;
mad.lo.s32 %r100, %r19, %r62, %r99;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r100, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
mul.lo.s32 %r101, %r93, %r50;
rem.s32 %r102, %r19, %r49;
mad.lo.s32 %r103, %r20, %r51, %r101;
mad.lo.s32 %r104, %r102, %r52, %r103;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r104, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r105, %f27;
mov.b32 %r106, %f3;
and.b32 %r107, %r106, -2147483648;
or.b32 %r108, %r107, %r105;
mov.b32 %f31, %r108;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r10, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_2[48]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<109>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r47, %r48}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+8];
ld.param.v2.u32 {%r49, %r50}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+16];
ld.param.v2.u32 {%r51, %r52}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0+24];
ld.param.v2.u32 {%r55, %r56}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+16];
ld.param.v2.u32 {%r57, %r58}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+24];
ld.param.v2.u32 {%r59, %r60}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+32];
ld.param.v2.u32 {%r61, %r62}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1+40];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_304_cu_ca61e1b0_1601111nvfuser_304ENS_6TensorIfLi3ELi3EEENS0_IfLi5ELi5EEES2__param_0];
mul.lo.s32 %r73, %r47, %r55;
mul.lo.s32 %r74, %r73, %r56;
mul.lo.s32 %r75, %r74, %r57;
mul.lo.s32 %r7, %r75, 11;
add.s32 %r76, %r7, 127;
shr.s32 %r77, %r76, 31;
shr.u32 %r78, %r77, 25;
add.s32 %r79, %r76, %r78;
shr.s32 %r80, %r79, 7;
mov.u32 %r8, %ctaid.x;
setp.ge.s32 %p1, %r8, %r80;
@%p1 bra $L__BB0_6;
mov.u32 %r81, %tid.x;
shl.b32 %r82, %r8, 7;
add.s32 %r10, %r82, %r81;
setp.ge.s32 %p2, %r10, %r7;
@%p2 bra $L__BB0_6;
add.s32 %r83, %r48, 10;
mul.hi.s32 %r84, %r83, 780903145;
shr.u32 %r85, %r84, 31;
shr.s32 %r86, %r84, 1;
add.s32 %r87, %r86, %r85;
mul.lo.s32 %r88, %r55, %r56;
mul.lo.s32 %r89, %r88, %r57;
mul.lo.s32 %r14, %r89, 11;
rem.s32 %r90, %r10, %r14;
div.s32 %r15, %r90, %r89;
mul.lo.s32 %r91, %r15, %r89;
sub.s32 %r16, %r90, %r91;
mul.lo.s32 %r17, %r56, %r57;
rem.s32 %r18, %r16, %r17;
rem.s32 %r19, %r18, %r57;
div.s32 %r92, %r19, %r49;
mad.lo.s32 %r20, %r15, %r87, %r92;
setp.ge.s32 %p3, %r20, %r48;
@%p3 bra $L__BB0_6;
div.s32 %r93, %r10, %r14;
mul.lo.s32 %r94, %r93, %r58;
mad.lo.s32 %r95, %r15, %r59, %r94;
div.s32 %r96, %r16, %r17;
mad.lo.s32 %r97, %r96, %r60, %r95;
div.s32 %r98, %r18, %r57;
mad.lo.s32 %r99, %r98, %r61, %r97;
mad.lo.s32 %r100, %r19, %r62, %r99;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r100, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
mul.lo.s32 %r101, %r93, %r50;
rem.s32 %r102, %r19, %r49;
mad.lo.s32 %r103, %r20, %r51, %r101;
mad.lo.s32 %r104, %r102, %r52, %r103;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r104, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r105, %f27;
mov.b32 %r106, %f3;
and.b32 %r107, %r106, -2147483648;
or.b32 %r108, %r107, %r105;
mov.b32 %f31, %r108;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r10, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
Kernel 39
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 23
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T0.logical_size[0LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T0.logical_size[0LL]))) && (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) < (T0.logical_size[1LL] * T0.logical_size[2LL])))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL]))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) % T1.logical_size[3LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T0.alloc_stride[1LL] * (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) / T0.logical_size[2LL]))) + (T0.alloc_stride[2LL] * (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) % T0.logical_size[2LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((((303 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (((303 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]))) && (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) < (T0.logical_size[1LL] * T0.logical_size[2LL])))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL]))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) % T1.logical_size[3LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T0.alloc_stride[1LL] * (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) / T0.logical_size[2LL]))) + (T0.alloc_stride[2LL] * (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) % T0.logical_size[2LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,7 +1,7 @@
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T9) {
- if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T0.logical_size[0LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T0.logical_size[0LL]))) && (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) < (T0.logical_size[1LL] * T0.logical_size[2LL])))) {
+ if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((((303 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (((303 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]))) && (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) < (T0.logical_size[1LL] * T0.logical_size[2LL])))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL]))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) % T1.logical_size[3LL])))];
Array<float, 1, 1> T10;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_2[40]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<91>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r40, %r41}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+8];
ld.param.v2.u32 {%r42, %r43}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+16];
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+16];
ld.param.v2.u32 {%r50, %r51}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+24];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_eb02c686_1911011nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0];
mul.lo.s32 %r62, %r40, %r48;
mul.lo.s32 %r63, %r62, %r49;
mul.lo.s32 %r6, %r63, 303;
add.s32 %r64, %r6, 127;
shr.s32 %r65, %r64, 31;
shr.u32 %r66, %r65, 25;
add.s32 %r67, %r64, %r66;
shr.s32 %r68, %r67, 7;
mov.u32 %r7, %ctaid.x;
setp.ge.s32 %p1, %r7, %r68;
@%p1 bra $L__BB0_6;
mov.u32 %r69, %tid.x;
shl.b32 %r70, %r7, 7;
add.s32 %r9, %r70, %r69;
setp.ge.s32 %p2, %r9, %r6;
@%p2 bra $L__BB0_6;
mul.lo.s32 %r71, %r48, %r49;
mul.lo.s32 %r13, %r71, 303;
rem.s32 %r72, %r9, %r13;
div.s32 %r14, %r72, %r71;
mul.lo.s32 %r73, %r14, %r71;
sub.s32 %r15, %r72, %r73;
div.s32 %r16, %r15, %r49;
mad.lo.s32 %r17, %r14, %r48, %r16;
mul.lo.s32 %r74, %r41, %r42;
setp.ge.s32 %p3, %r17, %r74;
@%p3 bra $L__BB0_6;
div.s32 %r75, %r9, %r13;
mul.lo.s32 %r76, %r75, %r50;
mad.lo.s32 %r77, %r14, %r51, %r76;
mad.lo.s32 %r78, %r16, %r52, %r77;
rem.s32 %r79, %r15, %r49;
mad.lo.s32 %r80, %r79, %r53, %r78;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r80, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
mul.lo.s32 %r81, %r75, %r43;
div.s32 %r82, %r17, %r42;
mad.lo.s32 %r83, %r82, %r44, %r81;
mul.lo.s32 %r84, %r82, %r42;
sub.s32 %r85, %r17, %r84;
mad.lo.s32 %r86, %r85, %r45, %r83;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r86, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r87, %f27;
mov.b32 %r88, %f3;
and.b32 %r89, %r88, -2147483648;
or.b32 %r90, %r89, %r87;
mov.b32 %f31, %r90;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_2[40]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<91>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r40, %r41}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+8];
ld.param.v2.u32 {%r42, %r43}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+16];
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+16];
ld.param.v2.u32 {%r50, %r51}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+24];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_318_cu_9a92573f_1601111nvfuser_318ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0];
mul.lo.s32 %r62, %r40, %r48;
mul.lo.s32 %r63, %r62, %r49;
mul.lo.s32 %r6, %r63, 303;
add.s32 %r64, %r6, 127;
shr.s32 %r65, %r64, 31;
shr.u32 %r66, %r65, 25;
add.s32 %r67, %r64, %r66;
shr.s32 %r68, %r67, 7;
mov.u32 %r7, %ctaid.x;
setp.ge.s32 %p1, %r7, %r68;
@%p1 bra $L__BB0_6;
mov.u32 %r69, %tid.x;
shl.b32 %r70, %r7, 7;
add.s32 %r9, %r70, %r69;
setp.ge.s32 %p2, %r9, %r6;
@%p2 bra $L__BB0_6;
mul.lo.s32 %r71, %r48, %r49;
mul.lo.s32 %r13, %r71, 303;
rem.s32 %r72, %r9, %r13;
div.s32 %r14, %r72, %r71;
mul.lo.s32 %r73, %r14, %r71;
sub.s32 %r15, %r72, %r73;
div.s32 %r16, %r15, %r49;
mad.lo.s32 %r17, %r14, %r48, %r16;
mul.lo.s32 %r74, %r41, %r42;
setp.ge.s32 %p3, %r17, %r74;
@%p3 bra $L__BB0_6;
div.s32 %r75, %r9, %r13;
mul.lo.s32 %r76, %r75, %r50;
mad.lo.s32 %r77, %r14, %r51, %r76;
mad.lo.s32 %r78, %r16, %r52, %r77;
rem.s32 %r79, %r15, %r49;
mad.lo.s32 %r80, %r79, %r53, %r78;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r80, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
mul.lo.s32 %r81, %r75, %r43;
div.s32 %r82, %r17, %r42;
mad.lo.s32 %r83, %r82, %r44, %r81;
mul.lo.s32 %r84, %r82, %r42;
sub.s32 %r85, %r17, %r84;
mad.lo.s32 %r86, %r85, %r45, %r83;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r86, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r87, %f27;
mov.b32 %r88, %f3;
and.b32 %r89, %r88, -2147483648;
or.b32 %r90, %r89, %r87;
mov.b32 %f31, %r90;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
Kernel 41
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 23
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T0.logical_size[0LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T0.logical_size[0LL]))) && (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) < (T0.logical_size[1LL] * T0.logical_size[2LL])))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL]))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) % T1.logical_size[3LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T0.alloc_stride[1LL] * (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) / T0.logical_size[2LL]))) + (T0.alloc_stride[2LL] * (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) % T0.logical_size[2LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((((303 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (((303 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]))) && (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) < (T0.logical_size[1LL] * T0.logical_size[2LL])))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL]))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) % T1.logical_size[3LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T0.alloc_stride[1LL] * (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) / T0.logical_size[2LL]))) + (T0.alloc_stride[2LL] * (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) % T0.logical_size[2LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,7 +1,7 @@
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T9) {
- if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T0.logical_size[0LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]) * T0.logical_size[0LL]))) && (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) < (T0.logical_size[1LL] * T0.logical_size[2LL])))) {
+ if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((((303 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (((303 * T0.logical_size[0LL]) * T1.logical_size[2LL]) * T1.logical_size[3LL]))) && (((T1.logical_size[2LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL]))) + ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL])) < (T0.logical_size[1LL] * T0.logical_size[2LL])))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) / (T1.logical_size[2LL] * T1.logical_size[3LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) / T1.logical_size[3LL]))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % ((303 * T1.logical_size[2LL]) * T1.logical_size[3LL])) % (T1.logical_size[2LL] * T1.logical_size[3LL])) % T1.logical_size[3LL])))];
Array<float, 1, 1> T10;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_2[40]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<91>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r40, %r41}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+8];
ld.param.v2.u32 {%r42, %r43}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+16];
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+16];
ld.param.v2.u32 {%r50, %r51}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+24];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_eb02c686_1911011nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0];
mul.lo.s32 %r62, %r40, %r48;
mul.lo.s32 %r63, %r62, %r49;
mul.lo.s32 %r6, %r63, 303;
add.s32 %r64, %r6, 127;
shr.s32 %r65, %r64, 31;
shr.u32 %r66, %r65, 25;
add.s32 %r67, %r64, %r66;
shr.s32 %r68, %r67, 7;
mov.u32 %r7, %ctaid.x;
setp.ge.s32 %p1, %r7, %r68;
@%p1 bra $L__BB0_6;
mov.u32 %r69, %tid.x;
shl.b32 %r70, %r7, 7;
add.s32 %r9, %r70, %r69;
setp.ge.s32 %p2, %r9, %r6;
@%p2 bra $L__BB0_6;
mul.lo.s32 %r71, %r48, %r49;
mul.lo.s32 %r13, %r71, 303;
rem.s32 %r72, %r9, %r13;
div.s32 %r14, %r72, %r71;
mul.lo.s32 %r73, %r14, %r71;
sub.s32 %r15, %r72, %r73;
div.s32 %r16, %r15, %r49;
mad.lo.s32 %r17, %r14, %r48, %r16;
mul.lo.s32 %r74, %r41, %r42;
setp.ge.s32 %p3, %r17, %r74;
@%p3 bra $L__BB0_6;
div.s32 %r75, %r9, %r13;
mul.lo.s32 %r76, %r75, %r50;
mad.lo.s32 %r77, %r14, %r51, %r76;
mad.lo.s32 %r78, %r16, %r52, %r77;
rem.s32 %r79, %r15, %r49;
mad.lo.s32 %r80, %r79, %r53, %r78;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r80, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
mul.lo.s32 %r81, %r75, %r43;
div.s32 %r82, %r17, %r42;
mad.lo.s32 %r83, %r82, %r44, %r81;
mul.lo.s32 %r84, %r82, %r42;
sub.s32 %r85, %r17, %r84;
mad.lo.s32 %r86, %r85, %r45, %r83;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r86, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r87, %f27;
mov.b32 %r88, %f3;
and.b32 %r89, %r88, -2147483648;
or.b32 %r90, %r89, %r87;
mov.b32 %f31, %r90;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_2[40]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<91>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r40, %r41}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+8];
ld.param.v2.u32 {%r42, %r43}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+16];
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+16];
ld.param.v2.u32 {%r50, %r51}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+24];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_320_cu_9a92573f_1601111nvfuser_320ENS_6TensorIfLi3ELi3EEENS0_IfLi4ELi4EEES2__param_0];
mul.lo.s32 %r62, %r40, %r48;
mul.lo.s32 %r63, %r62, %r49;
mul.lo.s32 %r6, %r63, 303;
add.s32 %r64, %r6, 127;
shr.s32 %r65, %r64, 31;
shr.u32 %r66, %r65, 25;
add.s32 %r67, %r64, %r66;
shr.s32 %r68, %r67, 7;
mov.u32 %r7, %ctaid.x;
setp.ge.s32 %p1, %r7, %r68;
@%p1 bra $L__BB0_6;
mov.u32 %r69, %tid.x;
shl.b32 %r70, %r7, 7;
add.s32 %r9, %r70, %r69;
setp.ge.s32 %p2, %r9, %r6;
@%p2 bra $L__BB0_6;
mul.lo.s32 %r71, %r48, %r49;
mul.lo.s32 %r13, %r71, 303;
rem.s32 %r72, %r9, %r13;
div.s32 %r14, %r72, %r71;
mul.lo.s32 %r73, %r14, %r71;
sub.s32 %r15, %r72, %r73;
div.s32 %r16, %r15, %r49;
mad.lo.s32 %r17, %r14, %r48, %r16;
mul.lo.s32 %r74, %r41, %r42;
setp.ge.s32 %p3, %r17, %r74;
@%p3 bra $L__BB0_6;
div.s32 %r75, %r9, %r13;
mul.lo.s32 %r76, %r75, %r50;
mad.lo.s32 %r77, %r14, %r51, %r76;
mad.lo.s32 %r78, %r16, %r52, %r77;
rem.s32 %r79, %r15, %r49;
mad.lo.s32 %r80, %r79, %r53, %r78;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r80, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
mul.lo.s32 %r81, %r75, %r43;
div.s32 %r82, %r17, %r42;
mad.lo.s32 %r83, %r82, %r44, %r81;
mul.lo.s32 %r84, %r82, %r42;
sub.s32 %r85, %r17, %r84;
mad.lo.s32 %r86, %r85, %r45, %r83;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r86, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r87, %f27;
mov.b32 %r88, %f3;
and.b32 %r89, %r88, -2147483648;
or.b32 %r90, %r89, %r87;
mov.b32 %f31, %r90;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r9, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
Kernel 59
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 32→ 30
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((((2 * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((((2 * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL]))) && (((T1.logical_size[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) / (T1.logical_size[6LL] * T0.logical_size[3LL]))) < T0.logical_size[2LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[4LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / 
(T0.logical_size[3LL] * T1.logical_size[6LL])))) + (T1.alloc_stride[5LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) / T1.logical_size[6LL]))) + (T1.alloc_stride[6LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) % T1.logical_size[6LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / T0.logical_size[1LL])) + (T0.alloc_stride[1LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % T0.logical_size[1LL]))) + ((T1.logical_size[4LL] * T0.alloc_stride[2LL]) * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])))) + (T0.alloc_stride[2LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) / (T1.logical_size[6LL] * T0.logical_size[3LL])))) + (T0.alloc_stride[3LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * 
T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (T1.logical_size[6LL] * T0.logical_size[3LL])) / T1.logical_size[6LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((((2 * T0.logical_size[3LL]) * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((((2 * T0.logical_size[3LL]) * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) && (((T1.logical_size[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (T0.logical_size[3LL] * T1.logical_size[6LL]))) < T0.logical_size[2LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[4LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / 
(T0.logical_size[3LL] * T1.logical_size[6LL])))) + (T1.alloc_stride[5LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) / T1.logical_size[6LL]))) + (T1.alloc_stride[6LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) % T1.logical_size[6LL])))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / T0.logical_size[1LL])) + (T0.alloc_stride[1LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % T0.logical_size[1LL]))) + ((T1.logical_size[4LL] * T0.alloc_stride[2LL]) * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])))) + (T0.alloc_stride[2LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) / (T1.logical_size[6LL] * T0.logical_size[3LL])))) + (T0.alloc_stride[3LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * 
T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (T1.logical_size[6LL] * T0.logical_size[3LL])) / T1.logical_size[6LL])))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,7 +1,7 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
- if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((((2 * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((((2 * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL]))) && (((T1.logical_size[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) / ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((((2 * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % (((2 * T1.logical_size[4LL]) * T1.logical_size[6LL]) * T0.logical_size[3LL])) % ((T1.logical_size[4LL] * T1.logical_size[6LL]) * T0.logical_size[3LL])) / (T1.logical_size[6LL] * T0.logical_size[3LL]))) < T0.logical_size[2LL]))) {
+ if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv(((((((2 * T0.logical_size[3LL]) * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]), 128))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < ((((((2 * T0.logical_size[3LL]) * T1.logical_size[0LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) && (((T1.logical_size[4LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (T0.logical_size[3LL] * T1.logical_size[6LL]))) < T0.logical_size[2LL]))) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[(((((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL]))) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[3LL] * (((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) / ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])))) + (T1.alloc_stride[4LL] * ((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) / 
(T0.logical_size[3LL] * T1.logical_size[6LL])))) + (T1.alloc_stride[5LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) / T1.logical_size[6LL]))) + (T1.alloc_stride[6LL] * (((((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (((((2 * T0.logical_size[3LL]) * T1.logical_size[1LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((((2 * T0.logical_size[3LL]) * T1.logical_size[2LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (((2 * T0.logical_size[3LL]) * T1.logical_size[4LL]) * T1.logical_size[6LL])) % ((T0.logical_size[3LL] * T1.logical_size[4LL]) * T1.logical_size[6LL])) % (T0.logical_size[3LL] * T1.logical_size[6LL])) % T1.logical_size[6LL])))];
Array<float, 1, 1> T10;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1[64],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<146>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r65, %r66}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+8];
ld.param.v2.u32 {%r67, %r68}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+16];
ld.param.v2.u32 {%r69, %r70}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r71, %r72}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
ld.param.v2.u32 {%r73, %r74}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+8];
ld.param.v2.u32 {%r75, %r76}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+16];
ld.param.v2.u32 {%r77, %r78}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+24];
ld.param.v2.u32 {%r79, %r80}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+32];
ld.param.v2.u32 {%r81, %r82}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+40];
ld.param.v2.u32 {%r83, %r84}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r85, %r86}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_954e3204_1911011nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
shl.b32 %r6, %r68, 1;
mul.lo.s32 %r101, %r6, %r73;
mul.lo.s32 %r102, %r101, %r74;
mul.lo.s32 %r103, %r102, %r75;
mul.lo.s32 %r104, %r103, %r77;
mul.lo.s32 %r11, %r104, %r79;
add.s32 %r105, %r11, 127;
shr.s32 %r106, %r105, 31;
shr.u32 %r107, %r106, 25;
add.s32 %r108, %r105, %r107;
shr.s32 %r109, %r108, 7;
mov.u32 %r12, %ctaid.x;
setp.ge.s32 %p1, %r12, %r109;
@%p1 bra $L__BB0_6;
mov.u32 %r110, %tid.x;
shl.b32 %r111, %r12, 7;
add.s32 %r14, %r111, %r110;
setp.ge.s32 %p2, %r14, %r11;
@%p2 bra $L__BB0_6;
mul.lo.s32 %r112, %r6, %r74;
mul.lo.s32 %r113, %r112, %r75;
mul.lo.s32 %r114, %r113, %r77;
mul.lo.s32 %r19, %r114, %r79;
rem.s32 %r20, %r14, %r19;
mul.lo.s32 %r115, %r6, %r75;
mul.lo.s32 %r116, %r115, %r77;
mul.lo.s32 %r21, %r116, %r79;
rem.s32 %r22, %r20, %r21;
mul.lo.s32 %r117, %r6, %r77;
mul.lo.s32 %r23, %r117, %r79;
rem.s32 %r118, %r22, %r23;
mul.lo.s32 %r24, %r79, %r68;
mul.lo.s32 %r119, %r24, %r77;
div.s32 %r25, %r118, %r119;
mul.lo.s32 %r120, %r25, %r119;
sub.s32 %r26, %r118, %r120;
div.s32 %r27, %r26, %r24;
mad.lo.s32 %r28, %r25, %r77, %r27;
setp.ge.s32 %p3, %r28, %r67;
@%p3 bra $L__BB0_6;
div.s32 %r121, %r14, %r19;
mul.lo.s32 %r122, %r121, %r80;
div.s32 %r123, %r20, %r21;
mad.lo.s32 %r124, %r123, %r81, %r122;
div.s32 %r125, %r22, %r23;
mad.lo.s32 %r126, %r125, %r82, %r124;
mad.lo.s32 %r127, %r25, %r83, %r126;
mad.lo.s32 %r128, %r27, %r84, %r127;
rem.s32 %r129, %r26, %r24;
div.s32 %r130, %r129, %r79;
mad.lo.s32 %r131, %r130, %r85, %r128;
mul.lo.s32 %r132, %r130, %r79;
sub.s32 %r133, %r129, %r132;
mad.lo.s32 %r134, %r133, %r86, %r131;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r134, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
div.s32 %r135, %r123, %r66;
mul.lo.s32 %r136, %r135, %r69;
mul.lo.s32 %r137, %r135, %r66;
sub.s32 %r138, %r123, %r137;
mad.lo.s32 %r139, %r138, %r70, %r136;
mad.lo.s32 %r140, %r28, %r71, %r139;
mad.lo.s32 %r141, %r130, %r72, %r140;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r141, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r142, %f27;
mov.b32 %r143, %f3;
and.b32 %r144, %r143, -2147483648;
or.b32 %r145, %r144, %r142;
mov.b32 %f31, %r145;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r14, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1[64],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
.reg .b32 %r<156>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r65, %r66}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+8];
ld.param.v2.u32 {%r67, %r68}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+16];
ld.param.v2.u32 {%r69, %r70}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r71, %r72}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
ld.param.v2.u32 {%r73, %r74}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+8];
ld.param.v2.u32 {%r75, %r76}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+16];
ld.param.v2.u32 {%r77, %r78}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+24];
ld.param.v2.u32 {%r79, %r80}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+32];
ld.param.v2.u32 {%r81, %r82}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+40];
ld.param.v2.u32 {%r83, %r84}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r85, %r86}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_338_cu_d00b204b_1601111nvfuser_338ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
shl.b32 %r6, %r68, 1;
mul.lo.s32 %r101, %r6, %r73;
mul.lo.s32 %r102, %r101, %r74;
mul.lo.s32 %r103, %r102, %r75;
mul.lo.s32 %r104, %r103, %r77;
mul.lo.s32 %r11, %r104, %r79;
add.s32 %r105, %r11, 127;
shr.s32 %r106, %r105, 31;
shr.u32 %r107, %r106, 25;
add.s32 %r108, %r105, %r107;
shr.s32 %r109, %r108, 7;
mov.u32 %r12, %ctaid.x;
setp.ge.s32 %p1, %r12, %r109;
@%p1 bra $L__BB0_6;
mov.u32 %r110, %tid.x;
shl.b32 %r111, %r12, 7;
add.s32 %r14, %r111, %r110;
setp.ge.s32 %p2, %r14, %r11;
@%p2 bra $L__BB0_6;
mul.lo.s32 %r112, %r6, %r74;
mul.lo.s32 %r113, %r112, %r75;
mul.lo.s32 %r114, %r113, %r77;
mul.lo.s32 %r19, %r114, %r79;
rem.s32 %r20, %r14, %r19;
mul.lo.s32 %r115, %r6, %r75;
mul.lo.s32 %r116, %r115, %r77;
mul.lo.s32 %r21, %r116, %r79;
rem.s32 %r22, %r20, %r21;
mul.lo.s32 %r117, %r6, %r77;
mul.lo.s32 %r23, %r117, %r79;
rem.s32 %r24, %r22, %r23;
mul.lo.s32 %r118, %r77, %r68;
mul.lo.s32 %r119, %r118, %r79;
div.s32 %r25, %r24, %r119;
mul.lo.s32 %r120, %r25, %r119;
sub.s32 %r26, %r24, %r120;
mul.lo.s32 %r27, %r79, %r68;
div.s32 %r28, %r26, %r27;
mad.lo.s32 %r121, %r25, %r77, %r28;
setp.ge.s32 %p3, %r121, %r67;
@%p3 bra $L__BB0_6;
div.s32 %r122, %r14, %r19;
mul.lo.s32 %r123, %r122, %r80;
div.s32 %r124, %r20, %r21;
mad.lo.s32 %r125, %r124, %r81, %r123;
div.s32 %r126, %r22, %r23;
mad.lo.s32 %r127, %r126, %r82, %r125;
mad.lo.s32 %r128, %r25, %r83, %r127;
mad.lo.s32 %r129, %r28, %r84, %r128;
rem.s32 %r130, %r26, %r27;
div.s32 %r131, %r130, %r79;
mad.lo.s32 %r132, %r131, %r85, %r129;
mul.lo.s32 %r133, %r131, %r79;
sub.s32 %r134, %r130, %r133;
mad.lo.s32 %r135, %r134, %r86, %r132;
cvta.to.global.u64 %rd4, %rd2;
mul.wide.s32 %rd5, %r135, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
div.s32 %r136, %r124, %r66;
mul.lo.s32 %r137, %r136, %r69;
mul.lo.s32 %r138, %r136, %r66;
sub.s32 %r139, %r124, %r138;
mul.lo.s32 %r140, %r27, %r77;
div.s32 %r141, %r24, %r140;
mul.lo.s32 %r142, %r141, %r140;
sub.s32 %r143, %r24, %r142;
div.s32 %r144, %r143, %r27;
mul.lo.s32 %r145, %r144, %r27;
sub.s32 %r146, %r143, %r145;
div.s32 %r147, %r146, %r79;
mad.lo.s32 %r148, %r141, %r77, %r144;
mad.lo.s32 %r149, %r139, %r70, %r137;
mad.lo.s32 %r150, %r148, %r71, %r149;
mad.lo.s32 %r151, %r147, %r72, %r150;
cvta.to.global.u64 %rd7, %rd1;
mul.wide.s32 %rd8, %r151, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
setp.ge.f32 %p5, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p5;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p5;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p5;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p5;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p5;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p5;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p5;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p5;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p5;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r152, %f27;
mov.b32 %r153, %f3;
and.b32 %r154, %r153, -2147483648;
or.b32 %r155, %r154, %r152;
mov.b32 %f31, %r155;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r14, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_6:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,11 +20,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<6>;
.reg .f32 %f<32>;
- .reg .b32 %r<146>;
+ .reg .b32 %r<156>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r65, %r66}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+8];
ld.param.v2.u32 {%r67, %r68}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+16];
@@ -70,48 +70,58 @@
mul.lo.s32 %r116, %r115, %r77;
mul.lo.s32 %r21, %r116, %r79;
rem.s32 %r22, %r20, %r21;
mul.lo.s32 %r117, %r6, %r77;
mul.lo.s32 %r23, %r117, %r79;
- rem.s32 %r118, %r22, %r23;
- mul.lo.s32 %r24, %r79, %r68;
- mul.lo.s32 %r119, %r24, %r77;
- div.s32 %r25, %r118, %r119;
+ rem.s32 %r24, %r22, %r23;
+ mul.lo.s32 %r118, %r77, %r68;
+ mul.lo.s32 %r119, %r118, %r79;
+ div.s32 %r25, %r24, %r119;
mul.lo.s32 %r120, %r25, %r119;
- sub.s32 %r26, %r118, %r120;
- div.s32 %r27, %r26, %r24;
- mad.lo.s32 %r28, %r25, %r77, %r27;
- setp.ge.s32 %p3, %r28, %r67;
+ sub.s32 %r26, %r24, %r120;
+ mul.lo.s32 %r27, %r79, %r68;
+ div.s32 %r28, %r26, %r27;
+ mad.lo.s32 %r121, %r25, %r77, %r28;
+ setp.ge.s32 %p3, %r121, %r67;
@%p3 bra $L__BB0_6;
- div.s32 %r121, %r14, %r19;
- mul.lo.s32 %r122, %r121, %r80;
- div.s32 %r123, %r20, %r21;
- mad.lo.s32 %r124, %r123, %r81, %r122;
- div.s32 %r125, %r22, %r23;
- mad.lo.s32 %r126, %r125, %r82, %r124;
- mad.lo.s32 %r127, %r25, %r83, %r126;
- mad.lo.s32 %r128, %r27, %r84, %r127;
- rem.s32 %r129, %r26, %r24;
- div.s32 %r130, %r129, %r79;
- mad.lo.s32 %r131, %r130, %r85, %r128;
- mul.lo.s32 %r132, %r130, %r79;
- sub.s32 %r133, %r129, %r132;
- mad.lo.s32 %r134, %r133, %r86, %r131;
+ div.s32 %r122, %r14, %r19;
+ mul.lo.s32 %r123, %r122, %r80;
+ div.s32 %r124, %r20, %r21;
+ mad.lo.s32 %r125, %r124, %r81, %r123;
+ div.s32 %r126, %r22, %r23;
+ mad.lo.s32 %r127, %r126, %r82, %r125;
+ mad.lo.s32 %r128, %r25, %r83, %r127;
+ mad.lo.s32 %r129, %r28, %r84, %r128;
+ rem.s32 %r130, %r26, %r27;
+ div.s32 %r131, %r130, %r79;
+ mad.lo.s32 %r132, %r131, %r85, %r129;
+ mul.lo.s32 %r133, %r131, %r79;
+ sub.s32 %r134, %r130, %r133;
+ mad.lo.s32 %r135, %r134, %r86, %r132;
cvta.to.global.u64 %rd4, %rd2;
- mul.wide.s32 %rd5, %r134, 4;
+ mul.wide.s32 %rd5, %r135, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
- div.s32 %r135, %r123, %r66;
- mul.lo.s32 %r136, %r135, %r69;
- mul.lo.s32 %r137, %r135, %r66;
- sub.s32 %r138, %r123, %r137;
- mad.lo.s32 %r139, %r138, %r70, %r136;
- mad.lo.s32 %r140, %r28, %r71, %r139;
- mad.lo.s32 %r141, %r130, %r72, %r140;
+ div.s32 %r136, %r124, %r66;
+ mul.lo.s32 %r137, %r136, %r69;
+ mul.lo.s32 %r138, %r136, %r66;
+ sub.s32 %r139, %r124, %r138;
+ mul.lo.s32 %r140, %r27, %r77;
+ div.s32 %r141, %r24, %r140;
+ mul.lo.s32 %r142, %r141, %r140;
+ sub.s32 %r143, %r24, %r142;
+ div.s32 %r144, %r143, %r27;
+ mul.lo.s32 %r145, %r144, %r27;
+ sub.s32 %r146, %r143, %r145;
+ div.s32 %r147, %r146, %r79;
+ mad.lo.s32 %r148, %r141, %r77, %r144;
+ mad.lo.s32 %r149, %r139, %r70, %r137;
+ mad.lo.s32 %r150, %r148, %r71, %r149;
+ mad.lo.s32 %r151, %r147, %r72, %r150;
cvta.to.global.u64 %rd7, %rd1;
- mul.wide.s32 %rd8, %r141, 4;
+ mul.wide.s32 %rd8, %r151, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p4, %f7, 0f3F8060FE;
@@ -137,15 +147,15 @@
@%p4 bra $L__BB0_5;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
- mov.b32 %r142, %f27;
- mov.b32 %r143, %f3;
- and.b32 %r144, %r143, -2147483648;
- or.b32 %r145, %r144, %r142;
- mov.b32 %f31, %r145;
+ mov.b32 %r152, %f27;
+ mov.b32 %r153, %f3;
+ and.b32 %r154, %r153, -2147483648;
+ or.b32 %r155, %r154, %r152;
+ mov.b32 %f31, %r155;
$L__BB0_5:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
Kernel 60
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 18
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20)) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[4LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[5LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T1[((((T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 20)) + (T1.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[4LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[5LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
= T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
T3[0]
= erff(T2[0]);
Array<float, 1, 1> T4;
T4[0]
= (float) 1.00000000000000000e+00
+ T3[0];
Array<float, 1, 1> T5;
T5[0]
= (float) 5.00000000000000000e-01
* T4[0];
Array<float, 1, 1> T6;
T6[0]
= T10[0]
* T5[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
T9[((nvfuser_index_t)threadIdx.x)]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,15 +1,15 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 7, 7> T1, Tensor<float, 7, 7> T9) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
- = T1[((((T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20)) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T1.alloc_stride[4LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T1.alloc_stride[5LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T1[((((T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 20)) + (T1.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T1.alloc_stride[4LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T1.alloc_stride[5LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T10;
T10[0] = 0;
T10[0]
- = T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) / 10))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 20) % 10) % 5)))];
+ = T0[(((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) / 20) % 3))) + ((2 * T0.alloc_stride[2LL]) * ((((nvfuser_index_t)threadIdx.x) % 20) / 10))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 20) % 10) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T10[0]
* (float) 7.07106781186547573e-01;
Array<float, 1, 1> T3;
@@ -35,9 +35,9 @@
= T7[0];
Array<float, 1, 1> T12;
T12[0]
= T8[0]
+ T11[0];
- T9[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T9[((nvfuser_index_t)threadIdx.x)]
= T12[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1[64],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<32>;
.reg .b32 %r<123>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
ld.param.v2.u32 {%r58, %r59}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+40];
ld.param.v2.u32 {%r60, %r61}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r62, %r63}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_954e3204_1911011nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
mov.u32 %r78, %ctaid.x;
shl.b32 %r79, %r78, 7;
mov.u32 %r80, %tid.x;
add.s32 %r5, %r79, %r80;
setp.gt.s32 %p1, %r5, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.hi.s32 %r81, %r5, 1717986919;
shr.u32 %r82, %r81, 31;
shr.s32 %r83, %r81, 3;
add.s32 %r84, %r83, %r82;
mul.lo.s32 %r85, %r84, 20;
sub.s32 %r86, %r5, %r85;
mul.hi.s32 %r87, %r86, 1717986919;
shr.u32 %r88, %r87, 31;
shr.s32 %r89, %r87, 2;
add.s32 %r90, %r89, %r88;
mul.lo.s32 %r91, %r60, %r90;
mad.lo.s32 %r92, %r58, %r84, %r91;
mul.lo.s32 %r93, %r90, 10;
sub.s32 %r94, %r86, %r93;
mul.hi.s32 %r95, %r94, 1717986919;
shr.u32 %r96, %r95, 31;
shr.s32 %r97, %r95, 1;
add.s32 %r98, %r97, %r96;
mad.lo.s32 %r99, %r61, %r98, %r92;
mul.lo.s32 %r100, %r98, 5;
sub.s32 %r101, %r94, %r100;
mad.lo.s32 %r102, %r62, %r101, %r99;
mul.wide.s32 %rd6, %r102, 4;
add.s64 %rd7, %rd4, %rd6;
ld.global.f32 %f1, [%rd7];
mul.hi.s32 %r103, %r5, -2004318071;
add.s32 %r104, %r103, %r5;
shr.u32 %r105, %r104, 31;
shr.s32 %r106, %r104, 5;
add.s32 %r107, %r106, %r105;
mul.hi.s32 %r108, %r84, 1431655766;
shr.u32 %r109, %r108, 31;
add.s32 %r110, %r108, %r109;
mul.lo.s32 %r111, %r110, 3;
sub.s32 %r112, %r84, %r111;
mul.lo.s32 %r113, %r47, %r112;
shl.b32 %r114, %r90, 1;
add.s32 %r115, %r114, %r98;
mad.lo.s32 %r116, %r46, %r107, %r113;
mad.lo.s32 %r117, %r101, %r49, %r116;
mad.lo.s32 %r118, %r115, %r48, %r117;
mul.wide.s32 %rd8, %r118, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p2, %f7, 0f3F8060FE;
setp.ge.f32 %p3, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p3;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p3;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r119, %f27;
mov.b32 %r120, %f3;
and.b32 %r121, %r120, -2147483648;
or.b32 %r122, %r121, %r119;
mov.b32 %f31, %r122;
$L__BB0_3:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r5, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_4:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1[64],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<32>;
.reg .b32 %r<120>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
ld.param.v2.u32 {%r58, %r59}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+40];
ld.param.v2.u32 {%r60, %r61}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r62, %r63}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_339_cu_d00b204b_1601111nvfuser_339ENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
mov.u32 %r5, %tid.x;
setp.gt.s32 %p1, %r5, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
mul.hi.s32 %r78, %r5, 1717986919;
shr.u32 %r79, %r78, 31;
shr.s32 %r80, %r78, 3;
add.s32 %r81, %r80, %r79;
mul.lo.s32 %r82, %r81, 20;
sub.s32 %r83, %r5, %r82;
mul.hi.s32 %r84, %r83, 1717986919;
shr.u32 %r85, %r84, 31;
shr.s32 %r86, %r84, 2;
add.s32 %r87, %r86, %r85;
mul.lo.s32 %r88, %r60, %r87;
mad.lo.s32 %r89, %r58, %r81, %r88;
mul.lo.s32 %r90, %r87, 10;
sub.s32 %r91, %r83, %r90;
mul.hi.s32 %r92, %r91, 1717986919;
shr.u32 %r93, %r92, 31;
shr.s32 %r94, %r92, 1;
add.s32 %r95, %r94, %r93;
mad.lo.s32 %r96, %r61, %r95, %r89;
mul.lo.s32 %r97, %r95, 5;
sub.s32 %r98, %r91, %r97;
mad.lo.s32 %r99, %r62, %r98, %r96;
mul.wide.s32 %rd6, %r99, 4;
add.s64 %rd7, %rd4, %rd6;
ld.global.f32 %f1, [%rd7];
mul.hi.s32 %r100, %r5, -2004318071;
add.s32 %r101, %r100, %r5;
shr.u32 %r102, %r101, 31;
shr.s32 %r103, %r101, 5;
add.s32 %r104, %r103, %r102;
mul.hi.s32 %r105, %r81, 1431655766;
shr.u32 %r106, %r105, 31;
add.s32 %r107, %r105, %r106;
mul.lo.s32 %r108, %r107, 3;
sub.s32 %r109, %r81, %r108;
mul.lo.s32 %r110, %r47, %r109;
shl.b32 %r111, %r87, 1;
add.s32 %r112, %r111, %r95;
mad.lo.s32 %r113, %r46, %r104, %r110;
mad.lo.s32 %r114, %r49, %r98, %r113;
mad.lo.s32 %r115, %r112, %r48, %r114;
mul.wide.s32 %rd8, %r115, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p2, %f7, 0f3F8060FE;
setp.ge.f32 %p3, %f7, 0f3F8060FE;
mul.f32 %f8, %f3, %f3;
selp.f32 %f9, %f7, %f8, %p3;
selp.f32 %f10, 0f38EB4C3A, 0f38B1E96A, %p3;
selp.f32 %f11, 0fBAAE005B, 0fBA574D20, %p3;
fma.rn.f32 %f12, %f10, %f9, %f11;
selp.f32 %f13, 0f3C09919F, 0f3BAAD5EA, %p3;
fma.rn.f32 %f14, %f12, %f9, %f13;
selp.f32 %f15, 0fBD24D99A, 0fBCDC1BE7, %p3;
fma.rn.f32 %f16, %f14, %f9, %f15;
selp.f32 %f17, 0f3E235519, 0f3DE718AF, %p3;
fma.rn.f32 %f18, %f16, %f9, %f17;
selp.f32 %f19, 0f3F69B4F9, 0fBEC093AC, %p3;
fma.rn.f32 %f20, %f18, %f9, %f19;
selp.f32 %f21, 0f3F210A14, 0f3E0375D3, %p3;
fma.rn.f32 %f22, %f20, %f9, %f21;
neg.f32 %f23, %f7;
selp.f32 %f24, %f23, %f3, %p3;
fma.rn.f32 %f31, %f22, %f24, %f24;
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
mov.b32 %r116, %f27;
mov.b32 %r117, %f3;
and.b32 %r118, %r117, -2147483648;
or.b32 %r119, %r118, %r116;
mov.b32 %f31, %r119;
$L__BB0_3:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
cvta.to.global.u64 %rd10, %rd3;
mul.wide.s32 %rd11, %r5, 4;
add.s64 %rd12, %rd10, %rd11;
st.global.f32 [%rd12], %f30;
$L__BB0_4:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,11 +20,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2[64]
)
{
.reg .pred %p<4>;
.reg .f32 %f<32>;
- .reg .b32 %r<123>;
+ .reg .b32 %r<120>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r46, %r47}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+24];
ld.param.v2.u32 {%r48, %r49}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0+32];
@@ -32,61 +32,58 @@
ld.param.v2.u32 {%r60, %r61}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+48];
ld.param.v2.u32 {%r62, %r63}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1+56];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi7ELi7EEES2__param_0];
- mov.u32 %r78, %ctaid.x;
- shl.b32 %r79, %r78, 7;
- mov.u32 %r80, %tid.x;
- add.s32 %r5, %r79, %r80;
+ mov.u32 %r5, %tid.x;
setp.gt.s32 %p1, %r5, 119;
@%p1 bra $L__BB0_4;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
- mul.hi.s32 %r81, %r5, 1717986919;
- shr.u32 %r82, %r81, 31;
- shr.s32 %r83, %r81, 3;
- add.s32 %r84, %r83, %r82;
- mul.lo.s32 %r85, %r84, 20;
- sub.s32 %r86, %r5, %r85;
- mul.hi.s32 %r87, %r86, 1717986919;
- shr.u32 %r88, %r87, 31;
- shr.s32 %r89, %r87, 2;
- add.s32 %r90, %r89, %r88;
- mul.lo.s32 %r91, %r60, %r90;
- mad.lo.s32 %r92, %r58, %r84, %r91;
- mul.lo.s32 %r93, %r90, 10;
- sub.s32 %r94, %r86, %r93;
- mul.hi.s32 %r95, %r94, 1717986919;
- shr.u32 %r96, %r95, 31;
- shr.s32 %r97, %r95, 1;
- add.s32 %r98, %r97, %r96;
- mad.lo.s32 %r99, %r61, %r98, %r92;
- mul.lo.s32 %r100, %r98, 5;
- sub.s32 %r101, %r94, %r100;
- mad.lo.s32 %r102, %r62, %r101, %r99;
- mul.wide.s32 %rd6, %r102, 4;
+ mul.hi.s32 %r78, %r5, 1717986919;
+ shr.u32 %r79, %r78, 31;
+ shr.s32 %r80, %r78, 3;
+ add.s32 %r81, %r80, %r79;
+ mul.lo.s32 %r82, %r81, 20;
+ sub.s32 %r83, %r5, %r82;
+ mul.hi.s32 %r84, %r83, 1717986919;
+ shr.u32 %r85, %r84, 31;
+ shr.s32 %r86, %r84, 2;
+ add.s32 %r87, %r86, %r85;
+ mul.lo.s32 %r88, %r60, %r87;
+ mad.lo.s32 %r89, %r58, %r81, %r88;
+ mul.lo.s32 %r90, %r87, 10;
+ sub.s32 %r91, %r83, %r90;
+ mul.hi.s32 %r92, %r91, 1717986919;
+ shr.u32 %r93, %r92, 31;
+ shr.s32 %r94, %r92, 1;
+ add.s32 %r95, %r94, %r93;
+ mad.lo.s32 %r96, %r61, %r95, %r89;
+ mul.lo.s32 %r97, %r95, 5;
+ sub.s32 %r98, %r91, %r97;
+ mad.lo.s32 %r99, %r62, %r98, %r96;
+ mul.wide.s32 %rd6, %r99, 4;
add.s64 %rd7, %rd4, %rd6;
ld.global.f32 %f1, [%rd7];
- mul.hi.s32 %r103, %r5, -2004318071;
- add.s32 %r104, %r103, %r5;
- shr.u32 %r105, %r104, 31;
- shr.s32 %r106, %r104, 5;
- add.s32 %r107, %r106, %r105;
- mul.hi.s32 %r108, %r84, 1431655766;
- shr.u32 %r109, %r108, 31;
- add.s32 %r110, %r108, %r109;
- mul.lo.s32 %r111, %r110, 3;
- sub.s32 %r112, %r84, %r111;
- mul.lo.s32 %r113, %r47, %r112;
- shl.b32 %r114, %r90, 1;
- add.s32 %r115, %r114, %r98;
- mad.lo.s32 %r116, %r46, %r107, %r113;
- mad.lo.s32 %r117, %r101, %r49, %r116;
- mad.lo.s32 %r118, %r115, %r48, %r117;
- mul.wide.s32 %rd8, %r118, 4;
+ mul.hi.s32 %r100, %r5, -2004318071;
+ add.s32 %r101, %r100, %r5;
+ shr.u32 %r102, %r101, 31;
+ shr.s32 %r103, %r101, 5;
+ add.s32 %r104, %r103, %r102;
+ mul.hi.s32 %r105, %r81, 1431655766;
+ shr.u32 %r106, %r105, 31;
+ add.s32 %r107, %r105, %r106;
+ mul.lo.s32 %r108, %r107, 3;
+ sub.s32 %r109, %r81, %r108;
+ mul.lo.s32 %r110, %r47, %r109;
+ shl.b32 %r111, %r87, 1;
+ add.s32 %r112, %r111, %r95;
+ mad.lo.s32 %r113, %r46, %r104, %r110;
+ mad.lo.s32 %r114, %r49, %r98, %r113;
+ mad.lo.s32 %r115, %r112, %r48, %r114;
+ mul.wide.s32 %rd8, %r115, 4;
add.s64 %rd9, %rd5, %rd8;
ld.global.f32 %f2, [%rd9];
mul.f32 %f3, %f2, 0f3F3504F3;
abs.f32 %f7, %f3;
setp.ltu.f32 %p2, %f7, 0f3F8060FE;
@@ -112,15 +109,15 @@
@%p2 bra $L__BB0_3;
ex2.approx.ftz.f32 %f25, %f31;
mov.f32 %f26, 0f3F800000;
sub.f32 %f27, %f26, %f25;
- mov.b32 %r119, %f27;
- mov.b32 %r120, %f3;
- and.b32 %r121, %r120, -2147483648;
- or.b32 %r122, %r121, %r119;
- mov.b32 %f31, %r122;
+ mov.b32 %r116, %f27;
+ mov.b32 %r117, %f3;
+ and.b32 %r118, %r117, -2147483648;
+ or.b32 %r119, %r118, %r116;
+ mov.b32 %f31, %r119;
$L__BB0_3:
add.f32 %f28, %f31, 0f3F800000;
mul.f32 %f29, %f28, 0f3F000000;
fma.rn.f32 %f30, %f2, %f29, %f1;
6: GpuViewTest.FusionReshapeConcreteDomain3
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-2
+2 index type: int
registers: 32
gmem: 3
static smem: 4
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 3, 3> T2, Tensor<float, 3, 3> T6) {
NVFUSER_DEFINE_MAGIC_ZERO;
if (((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL]))) && (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL]))) + (((3 + (4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL])))) / T0.logical_size[3LL]) / 8)) < 12)) && ((3 + (4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])))) < (T2.logical_size[1LL] * T2.logical_size[2LL])))) {
Array<float, 4, 1> T9;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T9[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T9[i0]
= T2[(((T2.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + (T2.alloc_stride[1LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) / T2.logical_size[2LL]))) + (T2.alloc_stride[2LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) % T2.logical_size[2LL])))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T8;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[i1]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + ((4 * T1.alloc_stride[1LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL])))) + (T1.alloc_stride[1LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) / 8))) + (T1.alloc_stride[2LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) % 8)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T7;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T7[i2] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T7[i2]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + ((4 * T0.alloc_stride[1LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL])))) + (T0.alloc_stride[1LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) / T0.logical_size[3LL]) / 8))) + (T0.alloc_stride[2LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) / T0.logical_size[3LL]) % 8))) + (T0.alloc_stride[3LL] * (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) % T0.logical_size[3LL])))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
Array<float, 1, 1> T3;
T3[0]
= T7[i3]
+ T8[i3];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T5;
T5[0]
= T9[i3];
T10[i3]
= T4[0]
+ T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T6[((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x)))], &T10[0]);
} else {
Array<float, 4, 1> T9;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T9[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL]))) && ((3 + (4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])))) < (T2.logical_size[1LL] * T2.logical_size[2LL])))) {
T9[i0]
= T2[(((T2.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + (T2.alloc_stride[1LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) / T2.logical_size[2LL]))) + (T2.alloc_stride[2LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) % T2.logical_size[2LL])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T8;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
if ((((((-(1344 * T0.logical_size[3LL])) + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (-(i1 + nvfuser_zero))) && (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL]))) + ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) / 8)) < 12))) {
T8[i1]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + ((4 * T1.alloc_stride[1LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL])))) + (T1.alloc_stride[1LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) / 8))) + (T1.alloc_stride[2LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) % 8)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T7;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T7[i2] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
if (((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL])))) {
T7[i2]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + ((4 * T0.alloc_stride[1LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL])))) + (T0.alloc_stride[1LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) / T0.logical_size[3LL]) / 8))) + (T0.alloc_stride[2LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) / T0.logical_size[3LL]) % 8))) + (T0.alloc_stride[3LL] * (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) % T0.logical_size[3LL])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
Array<float, 1, 1> T3;
T3[0]
= T7[i3]
+ T8[i3];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T5;
T5[0]
= T9[i3];
T10[i3]
= T4[0]
+ T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL])))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T6[((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x)))], &T10[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 3, 3> T2, Tensor<float, 3, 3> T6) {
NVFUSER_DEFINE_MAGIC_ZERO;
if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL]))) && (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL]))) + (((3 + (4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL])))) / T0.logical_size[3LL]) / 8)) < 12))) {
Array<float, 4, 1> T9;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T9[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T9[i0]
= T2[(((T2.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + (T2.alloc_stride[1LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) / T2.logical_size[2LL]))) + (T2.alloc_stride[2LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) % T2.logical_size[2LL])))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T8;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[i1]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + ((4 * T1.alloc_stride[1LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL])))) + (T1.alloc_stride[1LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) / 8))) + (T1.alloc_stride[2LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) % 8)))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T7;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T7[i2] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T7[i2]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + ((4 * T0.alloc_stride[1LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL])))) + (T0.alloc_stride[1LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) / T0.logical_size[3LL]) / 8))) + (T0.alloc_stride[2LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) / T0.logical_size[3LL]) % 8))) + (T0.alloc_stride[3LL] * (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) % T0.logical_size[3LL])))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
Array<float, 1, 1> T3;
T3[0]
= T7[i3]
+ T8[i3];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T5;
T5[0]
= T9[i3];
T10[i3]
= T4[0]
+ T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T6[((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x)))], &T10[0]);
} else {
Array<float, 4, 1> T9;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T9[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
if (((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL])))) {
T9[i0]
= T2[(((T2.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + (T2.alloc_stride[1LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) / T2.logical_size[2LL]))) + (T2.alloc_stride[2LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) % T2.logical_size[2LL])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T8;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[i1] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
if ((((((-(1344 * T0.logical_size[3LL])) + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (-(i1 + nvfuser_zero))) && (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL]))) + ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) / 8)) < 12))) {
T8[i1]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + ((4 * T1.alloc_stride[1LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL])))) + (T1.alloc_stride[1LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) / 8))) + (T1.alloc_stride[2LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i1 + nvfuser_zero)) / T0.logical_size[3LL]) % 8)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 1> T7;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T7[i2] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
if (((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL])))) {
T7[i2]
= T0[(((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + ((4 * T0.alloc_stride[1LL]) * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL])))) + (T0.alloc_stride[1LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) / T0.logical_size[3LL]) / 8))) + (T0.alloc_stride[2LL] * ((((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) / T0.logical_size[3LL]) % 8))) + (T0.alloc_stride[3LL] * (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL]))) + (i2 + nvfuser_zero)) % T0.logical_size[3LL])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 4, 4> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
Array<float, 1, 1> T3;
T3[0]
= T7[i3]
+ T8[i3];
Array<float, 1, 1> T4;
T4[0]
= T3[0];
Array<float, 1, 1> T5;
T5[0]
= T9[i3];
T10[i3]
= T4[0]
+ T5[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL])))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T6[((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x)))], &T10[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,8 +1,8 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 3, 3> T2, Tensor<float, 3, 3> T6) {
NVFUSER_DEFINE_MAGIC_ZERO;
- if (((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL]))) && (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL]))) + (((3 + (4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL])))) / T0.logical_size[3LL]) / 8)) < 12)) && ((3 + (4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])))) < (T2.logical_size[1LL] * T2.logical_size[2LL])))) {
+ if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL]))) && (((4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) / (8 * T0.logical_size[3LL]))) + (((3 + (4 * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])) % (8 * T0.logical_size[3LL])))) / T0.logical_size[3LL]) / 8)) < 12))) {
Array<float, 4, 1> T9;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T9[i0] = 0;
}
@@ -63,11 +63,11 @@
T9[i0] = 0;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
- if ((((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL]))) && ((3 + (4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL])))) < (T2.logical_size[1LL] * T2.logical_size[2LL])))) {
+ if (((((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv((1344 * T0.logical_size[3LL]), 4)), 128))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + (512 * ((nvfuser_index_t)blockIdx.x))) < (1344 * T0.logical_size[3LL])))) {
T9[i0]
= T2[(((T2.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / (24 * T0.logical_size[3LL]))) + (T2.alloc_stride[1LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) / T2.logical_size[2LL]))) + (T2.alloc_stride[2LL] * (((4 * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % (24 * T0.logical_size[3LL]))) + (i0 + nvfuser_zero)) % T2.logical_size[2LL])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_3[32]
)
{
.reg .pred %p<29>;
.reg .f32 %f<102>;
.reg .b32 %r<474>;
.reg .b64 %rd<62>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s;
ld.param.v2.u32 {%r78, %r79}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+16];
ld.param.v2.u32 {%r80, %r81}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+24];
ld.param.v2.u32 {%r82, %r83}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+32];
ld.param.v2.u32 {%r88, %r89}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1+24];
ld.param.v2.u32 {%r90, %r91}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1+32];
ld.param.v2.u32 {%r92, %r93}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+8];
ld.param.v2.u32 {%r94, %r95}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+16];
ld.param.v2.u32 {%r96, %r97}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+24];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_3];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd6;
cvta.to.global.u64 %rd3, %rd7;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p1, %r1, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r104, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s], %r104;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd9, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s;
atom.shared.min.s32 %r105, [%rd9], %r1;
ld.shared.u32 %r15, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_b09846eb_1911011nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s];
mul.lo.s32 %r16, %r79, 1344;
or.b32 %r106, %r16, 3;
shr.s32 %r107, %r106, 31;
shr.u32 %r108, %r107, 30;
add.s32 %r109, %r106, %r108;
shr.s32 %r110, %r109, 2;
add.s32 %r111, %r110, 127;
shr.s32 %r112, %r111, 31;
shr.u32 %r113, %r112, 25;
add.s32 %r114, %r111, %r113;
shr.s32 %r17, %r114, 7;
mov.u32 %r18, %ctaid.x;
setp.lt.s32 %p2, %r18, %r17;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_6;
$L__BB0_3:
shl.b32 %r19, %r1, 2;
shl.b32 %r20, %r18, 9;
add.s32 %r115, %r19, %r20;
or.b32 %r116, %r115, 3;
setp.ge.s32 %p3, %r116, %r16;
@%p3 bra $L__BB0_6;
shl.b32 %r117, %r18, 7;
add.s32 %r21, %r117, %r1;
mul.lo.s32 %r22, %r79, 24;
rem.s32 %r23, %r21, %r22;
shl.b32 %r118, %r79, 3;
div.s32 %r119, %r23, %r118;
shl.b32 %r24, %r119, 2;
mul.lo.s32 %r120, %r119, %r118;
sub.s32 %r121, %r23, %r120;
shl.b32 %r25, %r121, 2;
or.b32 %r122, %r25, 3;
div.s32 %r123, %r122, %r79;
shr.s32 %r124, %r123, 31;
shr.u32 %r125, %r124, 29;
add.s32 %r126, %r123, %r125;
shr.s32 %r127, %r126, 3;
add.s32 %r128, %r127, %r24;
setp.gt.s32 %p4, %r128, 11;
@%p4 bra $L__BB0_6;
shl.b32 %r26, %r23, 2;
or.b32 %r129, %r26, 3;
mul.lo.s32 %r130, %r93, %r94;
setp.lt.s32 %p5, %r129, %r130;
@%p5 bra $L__BB0_35;
bra.uni $L__BB0_6;
$L__BB0_35:
shl.b32 %r342, %r15, 1;
div.s32 %r343, %r21, %r22;
mul.lo.s32 %r344, %r343, %r95;
add.s32 %r345, %r26, %r342;
div.s32 %r346, %r345, %r94;
mad.lo.s32 %r347, %r346, %r96, %r344;
mul.lo.s32 %r348, %r346, %r94;
sub.s32 %r349, %r345, %r348;
mad.lo.s32 %r350, %r349, %r97, %r347;
mul.wide.s32 %rd37, %r350, 4;
add.s64 %rd38, %rd3, %rd37;
add.s32 %r351, %r345, 1;
div.s32 %r352, %r351, %r94;
mad.lo.s32 %r353, %r352, %r96, %r344;
mul.lo.s32 %r354, %r352, %r94;
sub.s32 %r355, %r351, %r354;
mad.lo.s32 %r356, %r355, %r97, %r353;
mul.wide.s32 %rd39, %r356, 4;
add.s64 %rd40, %rd3, %rd39;
add.s32 %r357, %r345, 2;
div.s32 %r358, %r357, %r94;
mad.lo.s32 %r359, %r358, %r96, %r344;
mul.lo.s32 %r360, %r358, %r94;
sub.s32 %r361, %r357, %r360;
mad.lo.s32 %r362, %r361, %r97, %r359;
mul.wide.s32 %rd41, %r362, 4;
add.s64 %rd42, %rd3, %rd41;
add.s32 %r363, %r345, 3;
div.s32 %r364, %r363, %r94;
mad.lo.s32 %r365, %r364, %r96, %r344;
mul.lo.s32 %r366, %r364, %r94;
sub.s32 %r367, %r363, %r366;
mad.lo.s32 %r368, %r367, %r97, %r365;
mul.wide.s32 %rd43, %r368, 4;
add.s64 %rd44, %rd3, %rd43;
mul.lo.s32 %r369, %r343, %r88;
shl.b32 %r370, %r15, 3;
add.s32 %r371, %r25, %r370;
div.s32 %r372, %r371, %r79;
shr.s32 %r373, %r372, 31;
shr.u32 %r374, %r373, 29;
add.s32 %r375, %r372, %r374;
shr.s32 %r376, %r375, 3;
and.b32 %r377, %r375, -8;
sub.s32 %r378, %r372, %r377;
add.s32 %r379, %r376, %r24;
mad.lo.s32 %r380, %r379, %r89, %r369;
mad.lo.s32 %r381, %r378, %r90, %r380;
mul.wide.s32 %rd45, %r381, 4;
add.s64 %rd46, %rd2, %rd45;
add.s32 %r382, %r371, 1;
div.s32 %r383, %r382, %r79;
shr.s32 %r384, %r383, 31;
shr.u32 %r385, %r384, 29;
add.s32 %r386, %r383, %r385;
shr.s32 %r387, %r386, 3;
and.b32 %r388, %r386, -8;
sub.s32 %r389, %r383, %r388;
add.s32 %r390, %r387, %r24;
mad.lo.s32 %r391, %r390, %r89, %r369;
mad.lo.s32 %r392, %r389, %r90, %r391;
mul.wide.s32 %rd47, %r392, 4;
add.s64 %rd48, %rd2, %rd47;
add.s32 %r393, %r371, 2;
div.s32 %r394, %r393, %r79;
shr.s32 %r395, %r394, 31;
shr.u32 %r396, %r395, 29;
add.s32 %r397, %r394, %r396;
shr.s32 %r398, %r397, 3;
and.b32 %r399, %r397, -8;
sub.s32 %r400, %r394, %r399;
add.s32 %r401, %r398, %r24;
mad.lo.s32 %r402, %r401, %r89, %r369;
mad.lo.s32 %r403, %r400, %r90, %r402;
mul.wide.s32 %rd49, %r403, 4;
add.s64 %rd50, %rd2, %rd49;
add.s32 %r404, %r371, 3;
div.s32 %r405, %r404, %r79;
shr.s32 %r406, %r405, 31;
shr.u32 %r407, %r406, 29;
add.s32 %r408, %r405, %r407;
shr.s32 %r409, %r408, 3;
and.b32 %r410, %r408, -8;
sub.s32 %r411, %r405, %r410;
add.s32 %r412, %r409, %r24;
mad.lo.s32 %r413, %r412, %r89, %r369;
mad.lo.s32 %r414, %r411, %r90, %r413;
mul.wide.s32 %rd51, %r414, 4;
add.s64 %rd52, %rd2, %rd51;
mul.lo.s32 %r415, %r343, %r80;
shl.b32 %r416, %r15, 5;
add.s32 %r417, %r25, %r416;
div.s32 %r418, %r417, %r79;
shr.s32 %r419, %r418, 31;
shr.u32 %r420, %r419, 29;
add.s32 %r421, %r418, %r420;
shr.s32 %r422, %r421, 3;
and.b32 %r423, %r421, -8;
sub.s32 %r424, %r418, %r423;
mul.lo.s32 %r425, %r418, %r79;
sub.s32 %r426, %r417, %r425;
add.s32 %r427, %r422, %r24;
mad.lo.s32 %r428, %r427, %r81, %r415;
mad.lo.s32 %r429, %r424, %r82, %r428;
mad.lo.s32 %r430, %r426, %r83, %r429;
mul.wide.s32 %rd53, %r430, 4;
add.s64 %rd54, %rd1, %rd53;
add.s32 %r431, %r417, 1;
div.s32 %r432, %r431, %r79;
shr.s32 %r433, %r432, 31;
shr.u32 %r434, %r433, 29;
add.s32 %r435, %r432, %r434;
shr.s32 %r436, %r435, 3;
and.b32 %r437, %r435, -8;
sub.s32 %r438, %r432, %r437;
mul.lo.s32 %r439, %r432, %r79;
sub.s32 %r440, %r431, %r439;
add.s32 %r441, %r436, %r24;
mad.lo.s32 %r442, %r441, %r81, %r415;
mad.lo.s32 %r443, %r438, %r82, %r442;
mad.lo.s32 %r444, %r440, %r83, %r443;
mul.wide.s32 %rd55, %r444, 4;
add.s64 %rd56, %rd1, %rd55;
add.s32 %r445, %r417, 2;
div.s32 %r446, %r445, %r79;
shr.s32 %r447, %r446, 31;
shr.u32 %r448, %r447, 29;
add.s32 %r449, %r446, %r448;
shr.s32 %r450, %r449, 3;
and.b32 %r451, %r449, -8;
sub.s32 %r452, %r446, %r451;
mul.lo.s32 %r453, %r446, %r79;
sub.s32 %r454, %r445, %r453;
add.s32 %r455, %r450, %r24;
mad.lo.s32 %r456, %r455, %r81, %r415;
mad.lo.s32 %r457, %r452, %r82, %r456;
mad.lo.s32 %r458, %r454, %r83, %r457;
mul.wide.s32 %rd57, %r458, 4;
add.s64 %rd58, %rd1, %rd57;
add.s32 %r459, %r417, 3;
div.s32 %r460, %r459, %r79;
shr.s32 %r461, %r460, 31;
shr.u32 %r462, %r461, 29;
add.s32 %r463, %r460, %r462;
shr.s32 %r464, %r463, 3;
and.b32 %r465, %r463, -8;
sub.s32 %r466, %r460, %r465;
mul.lo.s32 %r467, %r460, %r79;
sub.s32 %r468, %r459, %r467;
add.s32 %r469, %r464, %r24;
mad.lo.s32 %r470, %r469, %r81, %r415;
mad.lo.s32 %r471, %r466, %r82, %r470;
mad.lo.s32 %r472, %r468, %r83, %r471;
mul.wide.s32 %rd59, %r472, 4;
add.s64 %rd60, %rd1, %rd59;
ld.global.f32 %f60, [%rd54];
ld.global.f32 %f61, [%rd46];
add.f32 %f62, %f60, %f61;
ld.global.f32 %f63, [%rd38];
add.f32 %f64, %f62, %f63;
mov.b32 %r338, %f64;
ld.global.f32 %f65, [%rd56];
ld.global.f32 %f66, [%rd48];
add.f32 %f67, %f65, %f66;
ld.global.f32 %f68, [%rd40];
add.f32 %f69, %f67, %f68;
mov.b32 %r339, %f69;
ld.global.f32 %f70, [%rd58];
ld.global.f32 %f71, [%rd50];
add.f32 %f72, %f70, %f71;
ld.global.f32 %f73, [%rd42];
add.f32 %f74, %f72, %f73;
mov.b32 %r340, %f74;
ld.global.f32 %f75, [%rd60];
ld.global.f32 %f76, [%rd52];
add.f32 %f77, %f75, %f76;
ld.global.f32 %f78, [%rd44];
add.f32 %f79, %f77, %f78;
mov.b32 %r341, %f79;
mul.wide.s32 %rd61, %r115, 4;
add.s64 %rd36, %rd8, %rd61;
// begin inline asm
st.global.cs.v4.s32 [%rd36], {%r338,%r339,%r340,%r341};
// end inline asm
bra.uni $L__BB0_36;
$L__BB0_6:
setp.ge.s32 %p6, %r18, %r17;
shl.b32 %r131, %r1, 2;
shl.b32 %r132, %r18, 9;
add.s32 %r133, %r131, %r132;
or.b32 %r27, %r133, 3;
shl.b32 %r28, %r15, 1;
shl.b32 %r134, %r18, 7;
add.s32 %r29, %r134, %r1;
mul.lo.s32 %r30, %r79, 24;
mul.lo.s32 %r31, %r93, %r94;
mov.f32 %f90, 0f00000000;
mov.f32 %f80, 0f00000000;
mov.f32 %f91, %f90;
mov.f32 %f92, %f90;
mov.f32 %f93, %f90;
@%p6 bra $L__BB0_19;
setp.ge.s32 %p7, %r27, %r16;
mov.f32 %f90, %f80;
@%p7 bra $L__BB0_10;
rem.s32 %r135, %r29, %r30;
shl.b32 %r32, %r135, 2;
or.b32 %r136, %r32, 3;
setp.ge.s32 %p8, %r136, %r31;
mov.f32 %f90, %f80;
@%p8 bra $L__BB0_10;
div.s32 %r137, %r29, %r30;
mul.lo.s32 %r138, %r137, %r95;
add.s32 %r139, %r32, %r28;
div.s32 %r140, %r139, %r94;
mad.lo.s32 %r141, %r140, %r96, %r138;
mul.lo.s32 %r142, %r140, %r94;
sub.s32 %r143, %r139, %r142;
mad.lo.s32 %r144, %r143, %r97, %r141;
mul.wide.s32 %rd10, %r144, 4;
add.s64 %rd11, %rd3, %rd10;
ld.global.f32 %f90, [%rd11];
$L__BB0_10:
mov.f32 %f92, 0f00000000;
mov.f32 %f91, %f92;
@%p7 bra $L__BB0_13;
rem.s32 %r145, %r29, %r30;
shl.b32 %r33, %r145, 2;
or.b32 %r146, %r33, 3;
setp.ge.s32 %p10, %r146, %r31;
@%p10 bra $L__BB0_13;
div.s32 %r147, %r29, %r30;
mul.lo.s32 %r148, %r147, %r95;
add.s32 %r149, %r33, %r28;
add.s32 %r150, %r149, 1;
div.s32 %r151, %r150, %r94;
mad.lo.s32 %r152, %r151, %r96, %r148;
mul.lo.s32 %r153, %r151, %r94;
sub.s32 %r154, %r150, %r153;
mad.lo.s32 %r155, %r154, %r97, %r152;
mul.wide.s32 %rd12, %r155, 4;
add.s64 %rd13, %rd3, %rd12;
ld.global.f32 %f91, [%rd13];
$L__BB0_13:
@%p7 bra $L__BB0_16;
rem.s32 %r156, %r29, %r30;
shl.b32 %r34, %r156, 2;
or.b32 %r157, %r34, 3;
setp.ge.s32 %p12, %r157, %r31;
@%p12 bra $L__BB0_16;
div.s32 %r158, %r29, %r30;
mul.lo.s32 %r159, %r158, %r95;
add.s32 %r160, %r34, %r28;
add.s32 %r161, %r160, 2;
div.s32 %r162, %r161, %r94;
mad.lo.s32 %r163, %r162, %r96, %r159;
mul.lo.s32 %r164, %r162, %r94;
sub.s32 %r165, %r161, %r164;
mad.lo.s32 %r166, %r165, %r97, %r163;
mul.wide.s32 %rd14, %r166, 4;
add.s64 %rd15, %rd3, %rd14;
ld.global.f32 %f92, [%rd15];
$L__BB0_16:
mov.f32 %f93, 0f00000000;
@%p7 bra $L__BB0_19;
rem.s32 %r167, %r29, %r30;
shl.b32 %r168, %r167, 2;
or.b32 %r35, %r168, 3;
setp.ge.s32 %p14, %r35, %r31;
@%p14 bra $L__BB0_19;
div.s32 %r169, %r29, %r30;
mul.lo.s32 %r170, %r169, %r95;
add.s32 %r171, %r35, %r28;
div.s32 %r172, %r171, %r94;
mad.lo.s32 %r173, %r172, %r96, %r170;
mul.lo.s32 %r174, %r172, %r94;
sub.s32 %r175, %r171, %r174;
mad.lo.s32 %r176, %r175, %r97, %r173;
mul.wide.s32 %rd16, %r176, 4;
add.s64 %rd17, %rd3, %rd16;
ld.global.f32 %f93, [%rd17];
$L__BB0_19:
add.s32 %r36, %r132, %r131;
mad.lo.s32 %r37, %r79, -1344, %r36;
shl.b32 %r38, %r79, 3;
shl.b32 %r39, %r15, 3;
neg.s32 %r179, %r39;
setp.ge.s32 %p15, %r37, %r179;
mov.f32 %f95, 0f00000000;
mov.f32 %f94, %f95;
@%p15 bra $L__BB0_22;
rem.s32 %r180, %r29, %r30;
div.s32 %r181, %r180, %r38;
shl.b32 %r182, %r181, 2;
mul.lo.s32 %r183, %r181, %r38;
sub.s32 %r184, %r180, %r183;
shl.b32 %r185, %r184, 2;
add.s32 %r186, %r185, %r39;
div.s32 %r40, %r186, %r79;
shr.s32 %r187, %r40, 31;
shr.u32 %r188, %r187, 29;
add.s32 %r189, %r40, %r188;
shr.s32 %r190, %r189, 3;
add.s32 %r41, %r190, %r182;
setp.gt.s32 %p16, %r41, 11;
@%p16 bra $L__BB0_22;
div.s32 %r191, %r29, %r30;
mul.lo.s32 %r192, %r191, %r88;
and.b32 %r196, %r189, -8;
sub.s32 %r197, %r40, %r196;
mad.lo.s32 %r198, %r41, %r89, %r192;
mad.lo.s32 %r199, %r197, %r90, %r198;
mul.wide.s32 %rd18, %r199, 4;
add.s64 %rd19, %rd2, %rd18;
ld.global.f32 %f94, [%rd19];
$L__BB0_22:
not.b32 %r200, %r39;
setp.ge.s32 %p17, %r37, %r200;
@%p17 bra $L__BB0_25;
rem.s32 %r201, %r29, %r30;
div.s32 %r202, %r201, %r38;
mul.lo.s32 %r203, %r202, %r38;
sub.s32 %r204, %r201, %r203;
shl.b32 %r205, %r204, 2;
add.s32 %r206, %r205, %r39;
shl.b32 %r207, %r202, 2;
add.s32 %r208, %r206, 1;
div.s32 %r42, %r208, %r79;
shr.s32 %r209, %r42, 31;
shr.u32 %r210, %r209, 29;
add.s32 %r211, %r42, %r210;
shr.s32 %r212, %r211, 3;
add.s32 %r43, %r212, %r207;
setp.gt.s32 %p18, %r43, 11;
@%p18 bra $L__BB0_25;
div.s32 %r213, %r29, %r30;
mul.lo.s32 %r214, %r213, %r88;
and.b32 %r218, %r211, -8;
sub.s32 %r219, %r42, %r218;
mad.lo.s32 %r220, %r43, %r89, %r214;
mad.lo.s32 %r221, %r219, %r90, %r220;
mul.wide.s32 %rd20, %r221, 4;
add.s64 %rd21, %rd2, %rd20;
ld.global.f32 %f95, [%rd21];
$L__BB0_25:
mov.u32 %r222, -2;
sub.s32 %r223, %r222, %r39;
setp.ge.s32 %p19, %r37, %r223;
mov.f32 %f97, 0f00000000;
mov.f32 %f96, %f97;
@%p19 bra $L__BB0_28;
rem.s32 %r224, %r29, %r30;
div.s32 %r225, %r224, %r38;
mul.lo.s32 %r226, %r225, %r38;
sub.s32 %r227, %r224, %r226;
shl.b32 %r228, %r227, 2;
add.s32 %r229, %r228, %r39;
shl.b32 %r230, %r225, 2;
add.s32 %r231, %r229, 2;
div.s32 %r44, %r231, %r79;
shr.s32 %r232, %r44, 31;
shr.u32 %r233, %r232, 29;
add.s32 %r234, %r44, %r233;
shr.s32 %r235, %r234, 3;
add.s32 %r45, %r235, %r230;
setp.gt.s32 %p20, %r45, 11;
@%p20 bra $L__BB0_28;
div.s32 %r236, %r29, %r30;
mul.lo.s32 %r237, %r236, %r88;
and.b32 %r241, %r234, -8;
sub.s32 %r242, %r44, %r241;
mad.lo.s32 %r243, %r45, %r89, %r237;
mad.lo.s32 %r244, %r242, %r90, %r243;
mul.wide.s32 %rd22, %r244, 4;
add.s64 %rd23, %rd2, %rd22;
ld.global.f32 %f96, [%rd23];
$L__BB0_28:
mov.u32 %r245, -3;
sub.s32 %r246, %r245, %r39;
setp.ge.s32 %p21, %r37, %r246;
@%p21 bra $L__BB0_31;
rem.s32 %r247, %r29, %r30;
div.s32 %r248, %r247, %r38;
mul.lo.s32 %r249, %r248, %r38;
sub.s32 %r250, %r247, %r249;
shl.b32 %r251, %r250, 2;
add.s32 %r252, %r251, %r39;
shl.b32 %r253, %r248, 2;
add.s32 %r254, %r252, 3;
div.s32 %r46, %r254, %r79;
shr.s32 %r255, %r46, 31;
shr.u32 %r256, %r255, 29;
add.s32 %r257, %r46, %r256;
shr.s32 %r258, %r257, 3;
add.s32 %r47, %r258, %r253;
setp.gt.s32 %p22, %r47, 11;
@%p22 bra $L__BB0_31;
div.s32 %r259, %r29, %r30;
mul.lo.s32 %r260, %r259, %r88;
and.b32 %r264, %r257, -8;
sub.s32 %r265, %r46, %r264;
mad.lo.s32 %r266, %r47, %r89, %r260;
mad.lo.s32 %r267, %r265, %r90, %r266;
mul.wide.s32 %rd24, %r267, 4;
add.s64 %rd25, %rd2, %rd24;
ld.global.f32 %f97, [%rd25];
$L__BB0_31:
setp.ge.s32 %p23, %r27, %r16;
or.pred %p25, %p6, %p23;
mov.f32 %f98, 0f00000000;
mov.f32 %f99, %f98;
mov.f32 %f100, %f98;
mov.f32 %f101, %f98;
@%p25 bra $L__BB0_33;
div.s32 %r268, %r29, %r30;
mul.lo.s32 %r269, %r268, %r80;
mul.lo.s32 %r270, %r268, %r30;
sub.s32 %r271, %r29, %r270;
div.s32 %r272, %r271, %r38;
shl.b32 %r273, %r272, 2;
mul.lo.s32 %r274, %r272, %r38;
sub.s32 %r275, %r271, %r274;
shl.b32 %r276, %r275, 2;
shl.b32 %r277, %r15, 5;
add.s32 %r278, %r276, %r277;
div.s32 %r279, %r278, %r79;
shr.s32 %r280, %r279, 31;
shr.u32 %r281, %r280, 29;
add.s32 %r282, %r279, %r281;
shr.s32 %r283, %r282, 3;
and.b32 %r284, %r282, -8;
sub.s32 %r285, %r279, %r284;
mul.lo.s32 %r286, %r279, %r79;
sub.s32 %r287, %r278, %r286;
add.s32 %r288, %r283, %r273;
mad.lo.s32 %r289, %r288, %r81, %r269;
mad.lo.s32 %r290, %r285, %r82, %r289;
mad.lo.s32 %r291, %r287, %r83, %r290;
mul.wide.s32 %rd26, %r291, 4;
add.s64 %rd27, %rd1, %rd26;
ld.global.f32 %f101, [%rd27];
or.b32 %r292, %r278, 1;
div.s32 %r293, %r292, %r79;
shr.s32 %r294, %r293, 31;
shr.u32 %r295, %r294, 29;
add.s32 %r296, %r293, %r295;
shr.s32 %r297, %r296, 3;
and.b32 %r298, %r296, -8;
sub.s32 %r299, %r293, %r298;
mul.lo.s32 %r300, %r293, %r79;
sub.s32 %r301, %r292, %r300;
add.s32 %r302, %r297, %r273;
mad.lo.s32 %r303, %r302, %r81, %r269;
mad.lo.s32 %r304, %r299, %r82, %r303;
mad.lo.s32 %r305, %r301, %r83, %r304;
mul.wide.s32 %rd28, %r305, 4;
add.s64 %rd29, %rd1, %rd28;
ld.global.f32 %f100, [%rd29];
or.b32 %r306, %r278, 2;
div.s32 %r307, %r306, %r79;
shr.s32 %r308, %r307, 31;
shr.u32 %r309, %r308, 29;
add.s32 %r310, %r307, %r309;
shr.s32 %r311, %r310, 3;
and.b32 %r312, %r310, -8;
sub.s32 %r313, %r307, %r312;
mul.lo.s32 %r314, %r307, %r79;
sub.s32 %r315, %r306, %r314;
add.s32 %r316, %r311, %r273;
mad.lo.s32 %r317, %r316, %r81, %r269;
mad.lo.s32 %r318, %r313, %r82, %r317;
mad.lo.s32 %r319, %r315, %r83, %r318;
mul.wide.s32 %rd30, %r319, 4;
add.s64 %rd31, %rd1, %rd30;
ld.global.f32 %f99, [%rd31];
or.b32 %r320, %r278, 3;
div.s32 %r321, %r320, %r79;
shr.s32 %r322, %r321, 31;
shr.u32 %r323, %r322, 29;
add.s32 %r324, %r321, %r323;
shr.s32 %r325, %r324, 3;
and.b32 %r326, %r324, -8;
sub.s32 %r327, %r321, %r326;
mul.lo.s32 %r328, %r321, %r79;
sub.s32 %r329, %r320, %r328;
add.s32 %r330, %r325, %r273;
mad.lo.s32 %r331, %r330, %r81, %r269;
mad.lo.s32 %r332, %r327, %r82, %r331;
mad.lo.s32 %r333, %r329, %r83, %r332;
mul.wide.s32 %rd32, %r333, 4;
add.s64 %rd33, %rd1, %rd32;
ld.global.f32 %f98, [%rd33];
$L__BB0_33:
add.f32 %f56, %f101, %f94;
add.f32 %f28, %f56, %f90;
add.f32 %f57, %f100, %f95;
add.f32 %f29, %f57, %f91;
add.f32 %f58, %f99, %f96;
add.f32 %f30, %f58, %f92;
add.f32 %f59, %f98, %f97;
add.f32 %f31, %f59, %f93;
@%p25 bra $L__BB0_36;
mov.b32 %r337, %f31;
mul.wide.s32 %rd35, %r36, 4;
add.s64 %rd34, %rd8, %rd35;
mov.b32 %r334, %f28;
mov.b32 %r335, %f29;
mov.b32 %r336, %f30;
// begin inline asm
st.global.cs.v4.s32 [%rd34], {%r334,%r335,%r336,%r337};
// end inline asm
$L__BB0_36:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_3[32]
)
{
.reg .pred %p<19>;
.reg .f32 %f<85>;
.reg .b32 %r<453>;
.reg .b64 %rd<62>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s;
ld.param.v2.u32 {%r70, %r71}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+16];
ld.param.v2.u32 {%r72, %r73}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+24];
ld.param.v2.u32 {%r74, %r75}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+32];
ld.param.v2.u32 {%r80, %r81}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1+24];
ld.param.v2.u32 {%r82, %r83}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1+32];
ld.param.v2.u32 {%r86, %r87}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+16];
ld.param.v2.u32 {%r88, %r89}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+24];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_3];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd6;
cvta.to.global.u64 %rd3, %rd7;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p2, %r1, 0;
@%p2 bra $L__BB0_2;
mov.u32 %r96, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s], %r96;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd9, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s;
atom.shared.min.s32 %r97, [%rd9], %r1;
ld.shared.u32 %r14, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_351_cu_e4dea3bd_1601111nvfuser_351ENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s];
mul.lo.s32 %r15, %r71, 1344;
or.b32 %r98, %r15, 3;
shr.s32 %r99, %r98, 31;
shr.u32 %r100, %r99, 30;
add.s32 %r101, %r98, %r100;
shr.s32 %r102, %r101, 2;
add.s32 %r103, %r102, 127;
shr.s32 %r104, %r103, 31;
shr.u32 %r105, %r104, 25;
add.s32 %r106, %r103, %r105;
shr.s32 %r16, %r106, 7;
mov.u32 %r17, %ctaid.x;
setp.lt.s32 %p3, %r17, %r16;
@%p3 bra $L__BB0_3;
bra.uni $L__BB0_5;
$L__BB0_3:
shl.b32 %r18, %r1, 2;
shl.b32 %r19, %r17, 9;
add.s32 %r107, %r18, %r19;
or.b32 %r108, %r107, 3;
setp.ge.s32 %p4, %r108, %r15;
@%p4 bra $L__BB0_5;
shl.b32 %r109, %r17, 7;
add.s32 %r20, %r109, %r1;
mul.lo.s32 %r21, %r71, 24;
rem.s32 %r22, %r20, %r21;
shl.b32 %r110, %r71, 3;
div.s32 %r111, %r22, %r110;
shl.b32 %r23, %r111, 2;
mul.lo.s32 %r112, %r111, %r110;
sub.s32 %r113, %r22, %r112;
shl.b32 %r24, %r113, 2;
or.b32 %r114, %r24, 3;
div.s32 %r115, %r114, %r71;
shr.s32 %r116, %r115, 31;
shr.u32 %r117, %r116, 29;
add.s32 %r118, %r115, %r117;
shr.s32 %r119, %r118, 3;
add.s32 %r120, %r119, %r23;
setp.lt.s32 %p5, %r120, 12;
@%p5 bra $L__BB0_23;
bra.uni $L__BB0_5;
$L__BB0_23:
shl.b32 %r320, %r14, 1;
div.s32 %r321, %r20, %r21;
mul.lo.s32 %r322, %r321, %r87;
shl.b32 %r323, %r22, 2;
add.s32 %r324, %r323, %r320;
div.s32 %r325, %r324, %r86;
mad.lo.s32 %r326, %r325, %r88, %r322;
mul.lo.s32 %r327, %r325, %r86;
sub.s32 %r328, %r324, %r327;
mad.lo.s32 %r329, %r328, %r89, %r326;
mul.wide.s32 %rd37, %r329, 4;
add.s64 %rd38, %rd3, %rd37;
or.b32 %r330, %r324, 1;
div.s32 %r331, %r330, %r86;
mad.lo.s32 %r332, %r331, %r88, %r322;
mul.lo.s32 %r333, %r331, %r86;
sub.s32 %r334, %r330, %r333;
mad.lo.s32 %r335, %r334, %r89, %r332;
mul.wide.s32 %rd39, %r335, 4;
add.s64 %rd40, %rd3, %rd39;
add.s32 %r336, %r324, 2;
div.s32 %r337, %r336, %r86;
mad.lo.s32 %r338, %r337, %r88, %r322;
mul.lo.s32 %r339, %r337, %r86;
sub.s32 %r340, %r336, %r339;
mad.lo.s32 %r341, %r340, %r89, %r338;
mul.wide.s32 %rd41, %r341, 4;
add.s64 %rd42, %rd3, %rd41;
add.s32 %r342, %r324, 3;
div.s32 %r343, %r342, %r86;
mad.lo.s32 %r344, %r343, %r88, %r322;
mul.lo.s32 %r345, %r343, %r86;
sub.s32 %r346, %r342, %r345;
mad.lo.s32 %r347, %r346, %r89, %r344;
mul.wide.s32 %rd43, %r347, 4;
add.s64 %rd44, %rd3, %rd43;
mul.lo.s32 %r348, %r321, %r80;
shl.b32 %r349, %r14, 3;
add.s32 %r350, %r24, %r349;
div.s32 %r351, %r350, %r71;
shr.s32 %r352, %r351, 31;
shr.u32 %r353, %r352, 29;
add.s32 %r354, %r351, %r353;
shr.s32 %r355, %r354, 3;
and.b32 %r356, %r354, -8;
sub.s32 %r357, %r351, %r356;
add.s32 %r358, %r355, %r23;
mad.lo.s32 %r359, %r358, %r81, %r348;
mad.lo.s32 %r360, %r357, %r82, %r359;
mul.wide.s32 %rd45, %r360, 4;
add.s64 %rd46, %rd2, %rd45;
add.s32 %r361, %r350, 1;
div.s32 %r362, %r361, %r71;
shr.s32 %r363, %r362, 31;
shr.u32 %r364, %r363, 29;
add.s32 %r365, %r362, %r364;
shr.s32 %r366, %r365, 3;
and.b32 %r367, %r365, -8;
sub.s32 %r368, %r362, %r367;
add.s32 %r369, %r366, %r23;
mad.lo.s32 %r370, %r369, %r81, %r348;
mad.lo.s32 %r371, %r368, %r82, %r370;
mul.wide.s32 %rd47, %r371, 4;
add.s64 %rd48, %rd2, %rd47;
add.s32 %r372, %r350, 2;
div.s32 %r373, %r372, %r71;
shr.s32 %r374, %r373, 31;
shr.u32 %r375, %r374, 29;
add.s32 %r376, %r373, %r375;
shr.s32 %r377, %r376, 3;
and.b32 %r378, %r376, -8;
sub.s32 %r379, %r373, %r378;
add.s32 %r380, %r377, %r23;
mad.lo.s32 %r381, %r380, %r81, %r348;
mad.lo.s32 %r382, %r379, %r82, %r381;
mul.wide.s32 %rd49, %r382, 4;
add.s64 %rd50, %rd2, %rd49;
add.s32 %r383, %r350, 3;
div.s32 %r384, %r383, %r71;
shr.s32 %r385, %r384, 31;
shr.u32 %r386, %r385, 29;
add.s32 %r387, %r384, %r386;
shr.s32 %r388, %r387, 3;
and.b32 %r389, %r387, -8;
sub.s32 %r390, %r384, %r389;
add.s32 %r391, %r388, %r23;
mad.lo.s32 %r392, %r391, %r81, %r348;
mad.lo.s32 %r393, %r390, %r82, %r392;
mul.wide.s32 %rd51, %r393, 4;
add.s64 %rd52, %rd2, %rd51;
mul.lo.s32 %r394, %r321, %r72;
shl.b32 %r395, %r14, 5;
add.s32 %r396, %r24, %r395;
div.s32 %r397, %r396, %r71;
shr.s32 %r398, %r397, 31;
shr.u32 %r399, %r398, 29;
add.s32 %r400, %r397, %r399;
shr.s32 %r401, %r400, 3;
and.b32 %r402, %r400, -8;
sub.s32 %r403, %r397, %r402;
mul.lo.s32 %r404, %r397, %r71;
sub.s32 %r405, %r396, %r404;
add.s32 %r406, %r401, %r23;
mad.lo.s32 %r407, %r406, %r73, %r394;
mad.lo.s32 %r408, %r403, %r74, %r407;
mad.lo.s32 %r409, %r405, %r75, %r408;
mul.wide.s32 %rd53, %r409, 4;
add.s64 %rd54, %rd1, %rd53;
add.s32 %r410, %r396, 1;
div.s32 %r411, %r410, %r71;
shr.s32 %r412, %r411, 31;
shr.u32 %r413, %r412, 29;
add.s32 %r414, %r411, %r413;
shr.s32 %r415, %r414, 3;
and.b32 %r416, %r414, -8;
sub.s32 %r417, %r411, %r416;
mul.lo.s32 %r418, %r411, %r71;
sub.s32 %r419, %r410, %r418;
add.s32 %r420, %r415, %r23;
mad.lo.s32 %r421, %r420, %r73, %r394;
mad.lo.s32 %r422, %r417, %r74, %r421;
mad.lo.s32 %r423, %r419, %r75, %r422;
mul.wide.s32 %rd55, %r423, 4;
add.s64 %rd56, %rd1, %rd55;
add.s32 %r424, %r396, 2;
div.s32 %r425, %r424, %r71;
shr.s32 %r426, %r425, 31;
shr.u32 %r427, %r426, 29;
add.s32 %r428, %r425, %r427;
shr.s32 %r429, %r428, 3;
and.b32 %r430, %r428, -8;
sub.s32 %r431, %r425, %r430;
mul.lo.s32 %r432, %r425, %r71;
sub.s32 %r433, %r424, %r432;
add.s32 %r434, %r429, %r23;
mad.lo.s32 %r435, %r434, %r73, %r394;
mad.lo.s32 %r436, %r431, %r74, %r435;
mad.lo.s32 %r437, %r433, %r75, %r436;
mul.wide.s32 %rd57, %r437, 4;
add.s64 %rd58, %rd1, %rd57;
add.s32 %r438, %r396, 3;
div.s32 %r439, %r438, %r71;
shr.s32 %r440, %r439, 31;
shr.u32 %r441, %r440, 29;
add.s32 %r442, %r439, %r441;
shr.s32 %r443, %r442, 3;
and.b32 %r444, %r442, -8;
sub.s32 %r445, %r439, %r444;
mul.lo.s32 %r446, %r439, %r71;
sub.s32 %r447, %r438, %r446;
add.s32 %r448, %r443, %r23;
mad.lo.s32 %r449, %r448, %r73, %r394;
mad.lo.s32 %r450, %r445, %r74, %r449;
mad.lo.s32 %r451, %r447, %r75, %r450;
mul.wide.s32 %rd59, %r451, 4;
add.s64 %rd60, %rd1, %rd59;
ld.global.f32 %f49, [%rd54];
ld.global.f32 %f50, [%rd46];
add.f32 %f51, %f49, %f50;
ld.global.f32 %f52, [%rd38];
add.f32 %f53, %f51, %f52;
mov.b32 %r316, %f53;
ld.global.f32 %f54, [%rd56];
ld.global.f32 %f55, [%rd48];
add.f32 %f56, %f54, %f55;
ld.global.f32 %f57, [%rd40];
add.f32 %f58, %f56, %f57;
mov.b32 %r317, %f58;
ld.global.f32 %f59, [%rd58];
ld.global.f32 %f60, [%rd50];
add.f32 %f61, %f59, %f60;
ld.global.f32 %f62, [%rd42];
add.f32 %f63, %f61, %f62;
mov.b32 %r318, %f63;
ld.global.f32 %f64, [%rd60];
ld.global.f32 %f65, [%rd52];
add.f32 %f66, %f64, %f65;
ld.global.f32 %f67, [%rd44];
add.f32 %f68, %f66, %f67;
mov.b32 %r319, %f68;
mul.wide.s32 %rd61, %r107, 4;
add.s64 %rd36, %rd8, %rd61;
// begin inline asm
st.global.cs.v4.s32 [%rd36], {%r316,%r317,%r318,%r319};
// end inline asm
bra.uni $L__BB0_24;
$L__BB0_5:
shl.b32 %r121, %r1, 2;
shl.b32 %r122, %r17, 9;
add.s32 %r123, %r121, %r122;
or.b32 %r25, %r123, 3;
setp.ge.s32 %p6, %r25, %r15;
shl.b32 %r124, %r17, 7;
add.s32 %r26, %r124, %r1;
mul.lo.s32 %r27, %r71, 24;
setp.ge.s32 %p7, %r17, %r16;
or.pred %p1, %p7, %p6;
mov.f32 %f77, 0f00000000;
mov.f32 %f73, %f77;
mov.f32 %f74, %f77;
mov.f32 %f75, %f77;
mov.f32 %f76, %f77;
@%p1 bra $L__BB0_7;
div.s32 %r125, %r26, %r27;
mul.lo.s32 %r126, %r125, %r87;
mul.lo.s32 %r127, %r125, %r27;
sub.s32 %r128, %r26, %r127;
shl.b32 %r129, %r128, 2;
shl.b32 %r130, %r14, 1;
add.s32 %r131, %r129, %r130;
div.s32 %r132, %r131, %r86;
mad.lo.s32 %r133, %r132, %r88, %r126;
mul.lo.s32 %r134, %r132, %r86;
sub.s32 %r135, %r131, %r134;
mad.lo.s32 %r136, %r135, %r89, %r133;
mul.wide.s32 %rd10, %r136, 4;
add.s64 %rd11, %rd3, %rd10;
ld.global.f32 %f73, [%rd11];
or.b32 %r137, %r131, 1;
div.s32 %r138, %r137, %r86;
mad.lo.s32 %r139, %r138, %r88, %r126;
mul.lo.s32 %r140, %r138, %r86;
sub.s32 %r141, %r137, %r140;
mad.lo.s32 %r142, %r141, %r89, %r139;
mul.wide.s32 %rd12, %r142, 4;
add.s64 %rd13, %rd3, %rd12;
ld.global.f32 %f74, [%rd13];
add.s32 %r143, %r131, 2;
div.s32 %r144, %r143, %r86;
mad.lo.s32 %r145, %r144, %r88, %r126;
mul.lo.s32 %r146, %r144, %r86;
sub.s32 %r147, %r143, %r146;
mad.lo.s32 %r148, %r147, %r89, %r145;
mul.wide.s32 %rd14, %r148, 4;
add.s64 %rd15, %rd3, %rd14;
ld.global.f32 %f75, [%rd15];
add.s32 %r149, %r131, 3;
div.s32 %r150, %r149, %r86;
mad.lo.s32 %r151, %r150, %r88, %r126;
mul.lo.s32 %r152, %r150, %r86;
sub.s32 %r153, %r149, %r152;
mad.lo.s32 %r154, %r153, %r89, %r151;
mul.wide.s32 %rd16, %r154, 4;
add.s64 %rd17, %rd3, %rd16;
ld.global.f32 %f76, [%rd17];
$L__BB0_7:
add.s32 %r28, %r122, %r121;
mad.lo.s32 %r29, %r71, -1344, %r28;
shl.b32 %r30, %r71, 3;
shl.b32 %r31, %r14, 3;
neg.s32 %r157, %r31;
setp.ge.s32 %p8, %r29, %r157;
@%p8 bra $L__BB0_10;
rem.s32 %r158, %r26, %r27;
div.s32 %r159, %r158, %r30;
shl.b32 %r160, %r159, 2;
mul.lo.s32 %r161, %r159, %r30;
sub.s32 %r162, %r158, %r161;
shl.b32 %r163, %r162, 2;
add.s32 %r164, %r163, %r31;
div.s32 %r32, %r164, %r71;
shr.s32 %r165, %r32, 31;
shr.u32 %r166, %r165, 29;
add.s32 %r167, %r32, %r166;
shr.s32 %r168, %r167, 3;
add.s32 %r33, %r168, %r160;
setp.gt.s32 %p9, %r33, 11;
@%p9 bra $L__BB0_10;
div.s32 %r169, %r26, %r27;
mul.lo.s32 %r170, %r169, %r80;
and.b32 %r174, %r167, -8;
sub.s32 %r175, %r32, %r174;
mad.lo.s32 %r176, %r33, %r81, %r170;
mad.lo.s32 %r177, %r175, %r82, %r176;
mul.wide.s32 %rd18, %r177, 4;
add.s64 %rd19, %rd2, %rd18;
ld.global.f32 %f77, [%rd19];
$L__BB0_10:
not.b32 %r178, %r31;
setp.ge.s32 %p10, %r29, %r178;
mov.f32 %f79, 0f00000000;
mov.f32 %f78, %f79;
@%p10 bra $L__BB0_13;
rem.s32 %r179, %r26, %r27;
div.s32 %r180, %r179, %r30;
mul.lo.s32 %r181, %r180, %r30;
sub.s32 %r182, %r179, %r181;
shl.b32 %r183, %r182, 2;
add.s32 %r184, %r183, %r31;
shl.b32 %r185, %r180, 2;
add.s32 %r186, %r184, 1;
div.s32 %r34, %r186, %r71;
shr.s32 %r187, %r34, 31;
shr.u32 %r188, %r187, 29;
add.s32 %r189, %r34, %r188;
shr.s32 %r190, %r189, 3;
add.s32 %r35, %r190, %r185;
setp.gt.s32 %p11, %r35, 11;
@%p11 bra $L__BB0_13;
div.s32 %r191, %r26, %r27;
mul.lo.s32 %r192, %r191, %r80;
and.b32 %r196, %r189, -8;
sub.s32 %r197, %r34, %r196;
mad.lo.s32 %r198, %r35, %r81, %r192;
mad.lo.s32 %r199, %r197, %r82, %r198;
mul.wide.s32 %rd20, %r199, 4;
add.s64 %rd21, %rd2, %rd20;
ld.global.f32 %f78, [%rd21];
$L__BB0_13:
mov.u32 %r200, -2;
sub.s32 %r201, %r200, %r31;
setp.ge.s32 %p12, %r29, %r201;
@%p12 bra $L__BB0_16;
rem.s32 %r202, %r26, %r27;
div.s32 %r203, %r202, %r30;
mul.lo.s32 %r204, %r203, %r30;
sub.s32 %r205, %r202, %r204;
shl.b32 %r206, %r205, 2;
add.s32 %r207, %r206, %r31;
shl.b32 %r208, %r203, 2;
add.s32 %r209, %r207, 2;
div.s32 %r36, %r209, %r71;
shr.s32 %r210, %r36, 31;
shr.u32 %r211, %r210, 29;
add.s32 %r212, %r36, %r211;
shr.s32 %r213, %r212, 3;
add.s32 %r37, %r213, %r208;
setp.gt.s32 %p13, %r37, 11;
@%p13 bra $L__BB0_16;
div.s32 %r214, %r26, %r27;
mul.lo.s32 %r215, %r214, %r80;
and.b32 %r219, %r212, -8;
sub.s32 %r220, %r36, %r219;
mad.lo.s32 %r221, %r37, %r81, %r215;
mad.lo.s32 %r222, %r220, %r82, %r221;
mul.wide.s32 %rd22, %r222, 4;
add.s64 %rd23, %rd2, %rd22;
ld.global.f32 %f79, [%rd23];
$L__BB0_16:
mov.u32 %r223, -3;
sub.s32 %r224, %r223, %r31;
setp.ge.s32 %p14, %r29, %r224;
mov.f32 %f81, 0f00000000;
mov.f32 %f80, %f81;
@%p14 bra $L__BB0_19;
rem.s32 %r225, %r26, %r27;
div.s32 %r226, %r225, %r30;
mul.lo.s32 %r227, %r226, %r30;
sub.s32 %r228, %r225, %r227;
shl.b32 %r229, %r228, 2;
add.s32 %r230, %r229, %r31;
shl.b32 %r231, %r226, 2;
add.s32 %r232, %r230, 3;
div.s32 %r38, %r232, %r71;
shr.s32 %r233, %r38, 31;
shr.u32 %r234, %r233, 29;
add.s32 %r235, %r38, %r234;
shr.s32 %r236, %r235, 3;
add.s32 %r39, %r236, %r231;
setp.gt.s32 %p15, %r39, 11;
@%p15 bra $L__BB0_19;
div.s32 %r237, %r26, %r27;
mul.lo.s32 %r238, %r237, %r80;
and.b32 %r242, %r235, -8;
sub.s32 %r243, %r38, %r242;
mad.lo.s32 %r244, %r39, %r81, %r238;
mad.lo.s32 %r245, %r243, %r82, %r244;
mul.wide.s32 %rd24, %r245, 4;
add.s64 %rd25, %rd2, %rd24;
ld.global.f32 %f80, [%rd25];
$L__BB0_19:
mov.f32 %f82, %f81;
mov.f32 %f83, %f81;
mov.f32 %f84, %f81;
@%p1 bra $L__BB0_21;
div.s32 %r246, %r26, %r27;
mul.lo.s32 %r247, %r246, %r72;
mul.lo.s32 %r248, %r246, %r27;
sub.s32 %r249, %r26, %r248;
div.s32 %r250, %r249, %r30;
shl.b32 %r251, %r250, 2;
mul.lo.s32 %r252, %r250, %r30;
sub.s32 %r253, %r249, %r252;
shl.b32 %r254, %r253, 2;
shl.b32 %r255, %r14, 5;
add.s32 %r256, %r254, %r255;
div.s32 %r257, %r256, %r71;
shr.s32 %r258, %r257, 31;
shr.u32 %r259, %r258, 29;
add.s32 %r260, %r257, %r259;
shr.s32 %r261, %r260, 3;
and.b32 %r262, %r260, -8;
sub.s32 %r263, %r257, %r262;
mul.lo.s32 %r264, %r257, %r71;
sub.s32 %r265, %r256, %r264;
add.s32 %r266, %r261, %r251;
mad.lo.s32 %r267, %r266, %r73, %r247;
mad.lo.s32 %r268, %r263, %r74, %r267;
mad.lo.s32 %r269, %r265, %r75, %r268;
mul.wide.s32 %rd26, %r269, 4;
add.s64 %rd27, %rd1, %rd26;
ld.global.f32 %f84, [%rd27];
or.b32 %r270, %r256, 1;
div.s32 %r271, %r270, %r71;
shr.s32 %r272, %r271, 31;
shr.u32 %r273, %r272, 29;
add.s32 %r274, %r271, %r273;
shr.s32 %r275, %r274, 3;
and.b32 %r276, %r274, -8;
sub.s32 %r277, %r271, %r276;
mul.lo.s32 %r278, %r271, %r71;
sub.s32 %r279, %r270, %r278;
add.s32 %r280, %r275, %r251;
mad.lo.s32 %r281, %r280, %r73, %r247;
mad.lo.s32 %r282, %r277, %r74, %r281;
mad.lo.s32 %r283, %r279, %r75, %r282;
mul.wide.s32 %rd28, %r283, 4;
add.s64 %rd29, %rd1, %rd28;
ld.global.f32 %f83, [%rd29];
or.b32 %r284, %r256, 2;
div.s32 %r285, %r284, %r71;
shr.s32 %r286, %r285, 31;
shr.u32 %r287, %r286, 29;
add.s32 %r288, %r285, %r287;
shr.s32 %r289, %r288, 3;
and.b32 %r290, %r288, -8;
sub.s32 %r291, %r285, %r290;
mul.lo.s32 %r292, %r285, %r71;
sub.s32 %r293, %r284, %r292;
add.s32 %r294, %r289, %r251;
mad.lo.s32 %r295, %r294, %r73, %r247;
mad.lo.s32 %r296, %r291, %r74, %r295;
mad.lo.s32 %r297, %r293, %r75, %r296;
mul.wide.s32 %rd30, %r297, 4;
add.s64 %rd31, %rd1, %rd30;
ld.global.f32 %f82, [%rd31];
or.b32 %r298, %r256, 3;
div.s32 %r299, %r298, %r71;
shr.s32 %r300, %r299, 31;
shr.u32 %r301, %r300, 29;
add.s32 %r302, %r299, %r301;
shr.s32 %r303, %r302, 3;
and.b32 %r304, %r302, -8;
sub.s32 %r305, %r299, %r304;
mul.lo.s32 %r306, %r299, %r71;
sub.s32 %r307, %r298, %r306;
add.s32 %r308, %r303, %r251;
mad.lo.s32 %r309, %r308, %r73, %r247;
mad.lo.s32 %r310, %r305, %r74, %r309;
mad.lo.s32 %r311, %r307, %r75, %r310;
mul.wide.s32 %rd32, %r311, 4;
add.s64 %rd33, %rd1, %rd32;
ld.global.f32 %f81, [%rd33];
$L__BB0_21:
add.f32 %f45, %f84, %f77;
add.f32 %f25, %f45, %f73;
add.f32 %f46, %f83, %f78;
add.f32 %f26, %f46, %f74;
add.f32 %f47, %f82, %f79;
add.f32 %f27, %f47, %f75;
add.f32 %f48, %f81, %f80;
add.f32 %f28, %f48, %f76;
@%p1 bra $L__BB0_24;
mov.b32 %r315, %f28;
mul.wide.s32 %rd35, %r28, 4;
add.s64 %rd34, %rd8, %rd35;
mov.b32 %r312, %f25;
mov.b32 %r313, %f26;
mov.b32 %r314, %f27;
// begin inline asm
st.global.cs.v4.s32 [%rd34], {%r312,%r313,%r314,%r315};
// end inline asm
$L__BB0_24:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,638 +20,581 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1[40],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_3[32]
)
{
- .reg .pred %p<29>;
- .reg .f32 %f<102>;
- .reg .b32 %r<474>;
+ .reg .pred %p<19>;
+ .reg .f32 %f<85>;
+ .reg .b32 %r<453>;
.reg .b64 %rd<62>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s;
- ld.param.v2.u32 {%r78, %r79}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+16];
- ld.param.v2.u32 {%r80, %r81}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+24];
- ld.param.v2.u32 {%r82, %r83}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+32];
- ld.param.v2.u32 {%r88, %r89}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1+24];
- ld.param.v2.u32 {%r90, %r91}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1+32];
- ld.param.v2.u32 {%r92, %r93}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+8];
- ld.param.v2.u32 {%r94, %r95}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+16];
- ld.param.v2.u32 {%r96, %r97}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+24];
+ ld.param.v2.u32 {%r70, %r71}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+16];
+ ld.param.v2.u32 {%r72, %r73}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+24];
+ ld.param.v2.u32 {%r74, %r75}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0+32];
+ ld.param.v2.u32 {%r80, %r81}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1+24];
+ ld.param.v2.u32 {%r82, %r83}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1+32];
+ ld.param.v2.u32 {%r86, %r87}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+16];
+ ld.param.v2.u32 {%r88, %r89}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2+24];
ld.param.u64 %rd8, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_3];
ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_2];
ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_1];
ld.param.u64 %rd5, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2__param_0];
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd6;
cvta.to.global.u64 %rd3, %rd7;
mov.u32 %r1, %tid.x;
- setp.ne.s32 %p1, %r1, 0;
- @%p1 bra $L__BB0_2;
-
- mov.u32 %r104, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s], %r104;
+ setp.ne.s32 %p2, %r1, 0;
+ @%p2 bra $L__BB0_2;
+
+ mov.u32 %r96, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s], %r96;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd9, _ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s;
- atom.shared.min.s32 %r105, [%rd9], %r1;
- ld.shared.u32 %r15, [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s];
- mul.lo.s32 %r16, %r79, 1344;
- or.b32 %r106, %r16, 3;
- shr.s32 %r107, %r106, 31;
- shr.u32 %r108, %r107, 30;
- add.s32 %r109, %r106, %r108;
- shr.s32 %r110, %r109, 2;
- add.s32 %r111, %r110, 127;
- shr.s32 %r112, %r111, 31;
- shr.u32 %r113, %r112, 25;
- add.s32 %r114, %r111, %r113;
- shr.s32 %r17, %r114, 7;
- mov.u32 %r18, %ctaid.x;
- setp.lt.s32 %p2, %r18, %r17;
- @%p2 bra $L__BB0_3;
- bra.uni $L__BB0_6;
+ atom.shared.min.s32 %r97, [%rd9], %r1;
+ ld.shared.u32 %r14, [_ZZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_NS0_IfLi3ELi3EEES2_E14nvfuser_zero_s];
+ mul.lo.s32 %r15, %r71, 1344;
+ or.b32 %r98, %r15, 3;
+ shr.s32 %r99, %r98, 31;
+ shr.u32 %r100, %r99, 30;
+ add.s32 %r101, %r98, %r100;
+ shr.s32 %r102, %r101, 2;
+ add.s32 %r103, %r102, 127;
+ shr.s32 %r104, %r103, 31;
+ shr.u32 %r105, %r104, 25;
+ add.s32 %r106, %r103, %r105;
+ shr.s32 %r16, %r106, 7;
+ mov.u32 %r17, %ctaid.x;
+ setp.lt.s32 %p3, %r17, %r16;
+ @%p3 bra $L__BB0_3;
+ bra.uni $L__BB0_5;
$L__BB0_3:
- shl.b32 %r19, %r1, 2;
- shl.b32 %r20, %r18, 9;
- add.s32 %r115, %r19, %r20;
- or.b32 %r116, %r115, 3;
- setp.ge.s32 %p3, %r116, %r16;
- @%p3 bra $L__BB0_6;
-
- shl.b32 %r117, %r18, 7;
- add.s32 %r21, %r117, %r1;
- mul.lo.s32 %r22, %r79, 24;
- rem.s32 %r23, %r21, %r22;
- shl.b32 %r118, %r79, 3;
- div.s32 %r119, %r23, %r118;
- shl.b32 %r24, %r119, 2;
- mul.lo.s32 %r120, %r119, %r118;
- sub.s32 %r121, %r23, %r120;
- shl.b32 %r25, %r121, 2;
- or.b32 %r122, %r25, 3;
- div.s32 %r123, %r122, %r79;
- shr.s32 %r124, %r123, 31;
- shr.u32 %r125, %r124, 29;
- add.s32 %r126, %r123, %r125;
- shr.s32 %r127, %r126, 3;
- add.s32 %r128, %r127, %r24;
- setp.gt.s32 %p4, %r128, 11;
- @%p4 bra $L__BB0_6;
-
- shl.b32 %r26, %r23, 2;
- or.b32 %r129, %r26, 3;
- mul.lo.s32 %r130, %r93, %r94;
- setp.lt.s32 %p5, %r129, %r130;
- @%p5 bra $L__BB0_35;
- bra.uni $L__BB0_6;
-
-$L__BB0_35:
- shl.b32 %r342, %r15, 1;
- div.s32 %r343, %r21, %r22;
- mul.lo.s32 %r344, %r343, %r95;
- add.s32 %r345, %r26, %r342;
- div.s32 %r346, %r345, %r94;
- mad.lo.s32 %r347, %r346, %r96, %r344;
- mul.lo.s32 %r348, %r346, %r94;
- sub.s32 %r349, %r345, %r348;
- mad.lo.s32 %r350, %r349, %r97, %r347;
- mul.wide.s32 %rd37, %r350, 4;
+ shl.b32 %r18, %r1, 2;
+ shl.b32 %r19, %r17, 9;
+ add.s32 %r107, %r18, %r19;
+ or.b32 %r108, %r107, 3;
+ setp.ge.s32 %p4, %r108, %r15;
+ @%p4 bra $L__BB0_5;
+
+ shl.b32 %r109, %r17, 7;
+ add.s32 %r20, %r109, %r1;
+ mul.lo.s32 %r21, %r71, 24;
+ rem.s32 %r22, %r20, %r21;
+ shl.b32 %r110, %r71, 3;
+ div.s32 %r111, %r22, %r110;
+ shl.b32 %r23, %r111, 2;
+ mul.lo.s32 %r112, %r111, %r110;
+ sub.s32 %r113, %r22, %r112;
+ shl.b32 %r24, %r113, 2;
+ or.b32 %r114, %r24, 3;
+ div.s32 %r115, %r114, %r71;
+ shr.s32 %r116, %r115, 31;
+ shr.u32 %r117, %r116, 29;
+ add.s32 %r118, %r115, %r117;
+ shr.s32 %r119, %r118, 3;
+ add.s32 %r120, %r119, %r23;
+ setp.lt.s32 %p5, %r120, 12;
+ @%p5 bra $L__BB0_23;
+ bra.uni $L__BB0_5;
+
+$L__BB0_23:
+ shl.b32 %r320, %r14, 1;
+ div.s32 %r321, %r20, %r21;
+ mul.lo.s32 %r322, %r321, %r87;
+ shl.b32 %r323, %r22, 2;
+ add.s32 %r324, %r323, %r320;
+ div.s32 %r325, %r324, %r86;
+ mad.lo.s32 %r326, %r325, %r88, %r322;
+ mul.lo.s32 %r327, %r325, %r86;
+ sub.s32 %r328, %r324, %r327;
+ mad.lo.s32 %r329, %r328, %r89, %r326;
+ mul.wide.s32 %rd37, %r329, 4;
add.s64 %rd38, %rd3, %rd37;
- add.s32 %r351, %r345, 1;
- div.s32 %r352, %r351, %r94;
- mad.lo.s32 %r353, %r352, %r96, %r344;
- mul.lo.s32 %r354, %r352, %r94;
- sub.s32 %r355, %r351, %r354;
- mad.lo.s32 %r356, %r355, %r97, %r353;
- mul.wide.s32 %rd39, %r356, 4;
+ or.b32 %r330, %r324, 1;
+ div.s32 %r331, %r330, %r86;
+ mad.lo.s32 %r332, %r331, %r88, %r322;
+ mul.lo.s32 %r333, %r331, %r86;
+ sub.s32 %r334, %r330, %r333;
+ mad.lo.s32 %r335, %r334, %r89, %r332;
+ mul.wide.s32 %rd39, %r335, 4;
add.s64 %rd40, %rd3, %rd39;
- add.s32 %r357, %r345, 2;
- div.s32 %r358, %r357, %r94;
- mad.lo.s32 %r359, %r358, %r96, %r344;
- mul.lo.s32 %r360, %r358, %r94;
- sub.s32 %r361, %r357, %r360;
- mad.lo.s32 %r362, %r361, %r97, %r359;
- mul.wide.s32 %rd41, %r362, 4;
+ add.s32 %r336, %r324, 2;
+ div.s32 %r337, %r336, %r86;
+ mad.lo.s32 %r338, %r337, %r88, %r322;
+ mul.lo.s32 %r339, %r337, %r86;
+ sub.s32 %r340, %r336, %r339;
+ mad.lo.s32 %r341, %r340, %r89, %r338;
+ mul.wide.s32 %rd41, %r341, 4;
add.s64 %rd42, %rd3, %rd41;
- add.s32 %r363, %r345, 3;
- div.s32 %r364, %r363, %r94;
- mad.lo.s32 %r365, %r364, %r96, %r344;
- mul.lo.s32 %r366, %r364, %r94;
- sub.s32 %r367, %r363, %r366;
- mad.lo.s32 %r368, %r367, %r97, %r365;
- mul.wide.s32 %rd43, %r368, 4;
+ add.s32 %r342, %r324, 3;
+ div.s32 %r343, %r342, %r86;
+ mad.lo.s32 %r344, %r343, %r88, %r322;
+ mul.lo.s32 %r345, %r343, %r86;
+ sub.s32 %r346, %r342, %r345;
+ mad.lo.s32 %r347, %r346, %r89, %r344;
+ mul.wide.s32 %rd43, %r347, 4;
add.s64 %rd44, %rd3, %rd43;
- mul.lo.s32 %r369, %r343, %r88;
- shl.b32 %r370, %r15, 3;
- add.s32 %r371, %r25, %r370;
- div.s32 %r372, %r371, %r79;
- shr.s32 %r373, %r372, 31;
- shr.u32 %r374, %r373, 29;
- add.s32 %r375, %r372, %r374;
- shr.s32 %r376, %r375, 3;
- and.b32 %r377, %r375, -8;
- sub.s32 %r378, %r372, %r377;
- add.s32 %r379, %r376, %r24;
- mad.lo.s32 %r380, %r379, %r89, %r369;
- mad.lo.s32 %r381, %r378, %r90, %r380;
- mul.wide.s32 %rd45, %r381, 4;
+ mul.lo.s32 %r348, %r321, %r80;
+ shl.b32 %r349, %r14, 3;
+ add.s32 %r350, %r24, %r349;
+ div.s32 %r351, %r350, %r71;
+ shr.s32 %r352, %r351, 31;
+ shr.u32 %r353, %r352, 29;
+ add.s32 %r354, %r351, %r353;
+ shr.s32 %r355, %r354, 3;
+ and.b32 %r356, %r354, -8;
+ sub.s32 %r357, %r351, %r356;
+ add.s32 %r358, %r355, %r23;
+ mad.lo.s32 %r359, %r358, %r81, %r348;
+ mad.lo.s32 %r360, %r357, %r82, %r359;
+ mul.wide.s32 %rd45, %r360, 4;
add.s64 %rd46, %rd2, %rd45;
- add.s32 %r382, %r371, 1;
- div.s32 %r383, %r382, %r79;
- shr.s32 %r384, %r383, 31;
- shr.u32 %r385, %r384, 29;
- add.s32 %r386, %r383, %r385;
- shr.s32 %r387, %r386, 3;
- and.b32 %r388, %r386, -8;
- sub.s32 %r389, %r383, %r388;
- add.s32 %r390, %r387, %r24;
- mad.lo.s32 %r391, %r390, %r89, %r369;
- mad.lo.s32 %r392, %r389, %r90, %r391;
- mul.wide.s32 %rd47, %r392, 4;
+ add.s32 %r361, %r350, 1;
+ div.s32 %r362, %r361, %r71;
+ shr.s32 %r363, %r362, 31;
+ shr.u32 %r364, %r363, 29;
+ add.s32 %r365, %r362, %r364;
+ shr.s32 %r366, %r365, 3;
+ and.b32 %r367, %r365, -8;
+ sub.s32 %r368, %r362, %r367;
+ add.s32 %r369, %r366, %r23;
+ mad.lo.s32 %r370, %r369, %r81, %r348;
+ mad.lo.s32 %r371, %r368, %r82, %r370;
+ mul.wide.s32 %rd47, %r371, 4;
add.s64 %rd48, %rd2, %rd47;
- add.s32 %r393, %r371, 2;
- div.s32 %r394, %r393, %r79;
- shr.s32 %r395, %r394, 31;
- shr.u32 %r396, %r395, 29;
- add.s32 %r397, %r394, %r396;
- shr.s32 %r398, %r397, 3;
- and.b32 %r399, %r397, -8;
- sub.s32 %r400, %r394, %r399;
- add.s32 %r401, %r398, %r24;
- mad.lo.s32 %r402, %r401, %r89, %r369;
- mad.lo.s32 %r403, %r400, %r90, %r402;
- mul.wide.s32 %rd49, %r403, 4;
+ add.s32 %r372, %r350, 2;
+ div.s32 %r373, %r372, %r71;
+ shr.s32 %r374, %r373, 31;
+ shr.u32 %r375, %r374, 29;
+ add.s32 %r376, %r373, %r375;
+ shr.s32 %r377, %r376, 3;
+ and.b32 %r378, %r376, -8;
+ sub.s32 %r379, %r373, %r378;
+ add.s32 %r380, %r377, %r23;
+ mad.lo.s32 %r381, %r380, %r81, %r348;
+ mad.lo.s32 %r382, %r379, %r82, %r381;
+ mul.wide.s32 %rd49, %r382, 4;
add.s64 %rd50, %rd2, %rd49;
- add.s32 %r404, %r371, 3;
- div.s32 %r405, %r404, %r79;
- shr.s32 %r406, %r405, 31;
- shr.u32 %r407, %r406, 29;
- add.s32 %r408, %r405, %r407;
- shr.s32 %r409, %r408, 3;
- and.b32 %r410, %r408, -8;
- sub.s32 %r411, %r405, %r410;
- add.s32 %r412, %r409, %r24;
- mad.lo.s32 %r413, %r412, %r89, %r369;
- mad.lo.s32 %r414, %r411, %r90, %r413;
- mul.wide.s32 %rd51, %r414, 4;
+ add.s32 %r383, %r350, 3;
+ div.s32 %r384, %r383, %r71;
+ shr.s32 %r385, %r384, 31;
+ shr.u32 %r386, %r385, 29;
+ add.s32 %r387, %r384, %r386;
+ shr.s32 %r388, %r387, 3;
+ and.b32 %r389, %r387, -8;
+ sub.s32 %r390, %r384, %r389;
+ add.s32 %r391, %r388, %r23;
+ mad.lo.s32 %r392, %r391, %r81, %r348;
+ mad.lo.s32 %r393, %r390, %r82, %r392;
+ mul.wide.s32 %rd51, %r393, 4;
add.s64 %rd52, %rd2, %rd51;
- mul.lo.s32 %r415, %r343, %r80;
- shl.b32 %r416, %r15, 5;
- add.s32 %r417, %r25, %r416;
- div.s32 %r418, %r417, %r79;
- shr.s32 %r419, %r418, 31;
- shr.u32 %r420, %r419, 29;
- add.s32 %r421, %r418, %r420;
- shr.s32 %r422, %r421, 3;
- and.b32 %r423, %r421, -8;
- sub.s32 %r424, %r418, %r423;
- mul.lo.s32 %r425, %r418, %r79;
- sub.s32 %r426, %r417, %r425;
- add.s32 %r427, %r422, %r24;
- mad.lo.s32 %r428, %r427, %r81, %r415;
- mad.lo.s32 %r429, %r424, %r82, %r428;
- mad.lo.s32 %r430, %r426, %r83, %r429;
- mul.wide.s32 %rd53, %r430, 4;
+ mul.lo.s32 %r394, %r321, %r72;
+ shl.b32 %r395, %r14, 5;
+ add.s32 %r396, %r24, %r395;
+ div.s32 %r397, %r396, %r71;
+ shr.s32 %r398, %r397, 31;
+ shr.u32 %r399, %r398, 29;
+ add.s32 %r400, %r397, %r399;
+ shr.s32 %r401, %r400, 3;
+ and.b32 %r402, %r400, -8;
+ sub.s32 %r403, %r397, %r402;
+ mul.lo.s32 %r404, %r397, %r71;
+ sub.s32 %r405, %r396, %r404;
+ add.s32 %r406, %r401, %r23;
+ mad.lo.s32 %r407, %r406, %r73, %r394;
+ mad.lo.s32 %r408, %r403, %r74, %r407;
+ mad.lo.s32 %r409, %r405, %r75, %r408;
+ mul.wide.s32 %rd53, %r409, 4;
add.s64 %rd54, %rd1, %rd53;
- add.s32 %r431, %r417, 1;
- div.s32 %r432, %r431, %r79;
- shr.s32 %r433, %r432, 31;
- shr.u32 %r434, %r433, 29;
- add.s32 %r435, %r432, %r434;
- shr.s32 %r436, %r435, 3;
- and.b32 %r437, %r435, -8;
- sub.s32 %r438, %r432, %r437;
- mul.lo.s32 %r439, %r432, %r79;
- sub.s32 %r440, %r431, %r439;
- add.s32 %r441, %r436, %r24;
- mad.lo.s32 %r442, %r441, %r81, %r415;
- mad.lo.s32 %r443, %r438, %r82, %r442;
- mad.lo.s32 %r444, %r440, %r83, %r443;
- mul.wide.s32 %rd55, %r444, 4;
+ add.s32 %r410, %r396, 1;
+ div.s32 %r411, %r410, %r71;
+ shr.s32 %r412, %r411, 31;
+ shr.u32 %r413, %r412, 29;
+ add.s32 %r414, %r411, %r413;
+ shr.s32 %r415, %r414, 3;
+ and.b32 %r416, %r414, -8;
+ sub.s32 %r417, %r411, %r416;
+ mul.lo.s32 %r418, %r411, %r71;
+ sub.s32 %r419, %r410, %r418;
+ add.s32 %r420, %r415, %r23;
+ mad.lo.s32 %r421, %r420, %r73, %r394;
+ mad.lo.s32 %r422, %r417, %r74, %r421;
+ mad.lo.s32 %r423, %r419, %r75, %r422;
+ mul.wide.s32 %rd55, %r423, 4;
add.s64 %rd56, %rd1, %rd55;
- add.s32 %r445, %r417, 2;
- div.s32 %r446, %r445, %r79;
- shr.s32 %r447, %r446, 31;
- shr.u32 %r448, %r447, 29;
- add.s32 %r449, %r446, %r448;
- shr.s32 %r450, %r449, 3;
- and.b32 %r451, %r449, -8;
- sub.s32 %r452, %r446, %r451;
- mul.lo.s32 %r453, %r446, %r79;
- sub.s32 %r454, %r445, %r453;
- add.s32 %r455, %r450, %r24;
- mad.lo.s32 %r456, %r455, %r81, %r415;
- mad.lo.s32 %r457, %r452, %r82, %r456;
- mad.lo.s32 %r458, %r454, %r83, %r457;
- mul.wide.s32 %rd57, %r458, 4;
+ add.s32 %r424, %r396, 2;
+ div.s32 %r425, %r424, %r71;
+ shr.s32 %r426, %r425, 31;
+ shr.u32 %r427, %r426, 29;
+ add.s32 %r428, %r425, %r427;
+ shr.s32 %r429, %r428, 3;
+ and.b32 %r430, %r428, -8;
+ sub.s32 %r431, %r425, %r430;
+ mul.lo.s32 %r432, %r425, %r71;
+ sub.s32 %r433, %r424, %r432;
+ add.s32 %r434, %r429, %r23;
+ mad.lo.s32 %r435, %r434, %r73, %r394;
+ mad.lo.s32 %r436, %r431, %r74, %r435;
+ mad.lo.s32 %r437, %r433, %r75, %r436;
+ mul.wide.s32 %rd57, %r437, 4;
add.s64 %rd58, %rd1, %rd57;
- add.s32 %r459, %r417, 3;
- div.s32 %r460, %r459, %r79;
- shr.s32 %r461, %r460, 31;
- shr.u32 %r462, %r461, 29;
- add.s32 %r463, %r460, %r462;
- shr.s32 %r464, %r463, 3;
- and.b32 %r465, %r463, -8;
- sub.s32 %r466, %r460, %r465;
- mul.lo.s32 %r467, %r460, %r79;
- sub.s32 %r468, %r459, %r467;
- add.s32 %r469, %r464, %r24;
- mad.lo.s32 %r470, %r469, %r81, %r415;
- mad.lo.s32 %r471, %r466, %r82, %r470;
- mad.lo.s32 %r472, %r468, %r83, %r471;
- mul.wide.s32 %rd59, %r472, 4;
+ add.s32 %r438, %r396, 3;
+ div.s32 %r439, %r438, %r71;
+ shr.s32 %r440, %r439, 31;
+ shr.u32 %r441, %r440, 29;
+ add.s32 %r442, %r439, %r441;
+ shr.s32 %r443, %r442, 3;
+ and.b32 %r444, %r442, -8;
+ sub.s32 %r445, %r439, %r444;
+ mul.lo.s32 %r446, %r439, %r71;
+ sub.s32 %r447, %r438, %r446;
+ add.s32 %r448, %r443, %r23;
+ mad.lo.s32 %r449, %r448, %r73, %r394;
+ mad.lo.s32 %r450, %r445, %r74, %r449;
+ mad.lo.s32 %r451, %r447, %r75, %r450;
+ mul.wide.s32 %rd59, %r451, 4;
add.s64 %rd60, %rd1, %rd59;
- ld.global.f32 %f60, [%rd54];
- ld.global.f32 %f61, [%rd46];
- add.f32 %f62, %f60, %f61;
- ld.global.f32 %f63, [%rd38];
- add.f32 %f64, %f62, %f63;
- mov.b32 %r338, %f64;
- ld.global.f32 %f65, [%rd56];
- ld.global.f32 %f66, [%rd48];
- add.f32 %f67, %f65, %f66;
- ld.global.f32 %f68, [%rd40];
- add.f32 %f69, %f67, %f68;
- mov.b32 %r339, %f69;
- ld.global.f32 %f70, [%rd58];
- ld.global.f32 %f71, [%rd50];
- add.f32 %f72, %f70, %f71;
- ld.global.f32 %f73, [%rd42];
- add.f32 %f74, %f72, %f73;
- mov.b32 %r340, %f74;
- ld.global.f32 %f75, [%rd60];
- ld.global.f32 %f76, [%rd52];
- add.f32 %f77, %f75, %f76;
- ld.global.f32 %f78, [%rd44];
- add.f32 %f79, %f77, %f78;
- mov.b32 %r341, %f79;
- mul.wide.s32 %rd61, %r115, 4;
+ ld.global.f32 %f49, [%rd54];
+ ld.global.f32 %f50, [%rd46];
+ add.f32 %f51, %f49, %f50;
+ ld.global.f32 %f52, [%rd38];
+ add.f32 %f53, %f51, %f52;
+ mov.b32 %r316, %f53;
+ ld.global.f32 %f54, [%rd56];
+ ld.global.f32 %f55, [%rd48];
+ add.f32 %f56, %f54, %f55;
+ ld.global.f32 %f57, [%rd40];
+ add.f32 %f58, %f56, %f57;
+ mov.b32 %r317, %f58;
+ ld.global.f32 %f59, [%rd58];
+ ld.global.f32 %f60, [%rd50];
+ add.f32 %f61, %f59, %f60;
+ ld.global.f32 %f62, [%rd42];
+ add.f32 %f63, %f61, %f62;
+ mov.b32 %r318, %f63;
+ ld.global.f32 %f64, [%rd60];
+ ld.global.f32 %f65, [%rd52];
+ add.f32 %f66, %f64, %f65;
+ ld.global.f32 %f67, [%rd44];
+ add.f32 %f68, %f66, %f67;
+ mov.b32 %r319, %f68;
+ mul.wide.s32 %rd61, %r107, 4;
add.s64 %rd36, %rd8, %rd61;
- st.global.cs.v4.s32 [%rd36], {%r338,%r339,%r340,%r341};
+ st.global.cs.v4.s32 [%rd36], {%r316,%r317,%r318,%r319};
- bra.uni $L__BB0_36;
-
-$L__BB0_6:
- setp.ge.s32 %p6, %r18, %r17;
- shl.b32 %r131, %r1, 2;
- shl.b32 %r132, %r18, 9;
- add.s32 %r133, %r131, %r132;
- or.b32 %r27, %r133, 3;
- shl.b32 %r28, %r15, 1;
- shl.b32 %r134, %r18, 7;
- add.s32 %r29, %r134, %r1;
- mul.lo.s32 %r30, %r79, 24;
- mul.lo.s32 %r31, %r93, %r94;
- mov.f32 %f90, 0f00000000;
- mov.f32 %f80, 0f00000000;
- mov.f32 %f91, %f90;
- mov.f32 %f92, %f90;
- mov.f32 %f93, %f90;
- @%p6 bra $L__BB0_19;
-
- setp.ge.s32 %p7, %r27, %r16;
- mov.f32 %f90, %f80;
- @%p7 bra $L__BB0_10;
-
- rem.s32 %r135, %r29, %r30;
- shl.b32 %r32, %r135, 2;
- or.b32 %r136, %r32, 3;
- setp.ge.s32 %p8, %r136, %r31;
- mov.f32 %f90, %f80;
+ bra.uni $L__BB0_24;
+
+$L__BB0_5:
+ shl.b32 %r121, %r1, 2;
+ shl.b32 %r122, %r17, 9;
+ add.s32 %r123, %r121, %r122;
+ or.b32 %r25, %r123, 3;
+ setp.ge.s32 %p6, %r25, %r15;
+ shl.b32 %r124, %r17, 7;
+ add.s32 %r26, %r124, %r1;
+ mul.lo.s32 %r27, %r71, 24;
+ setp.ge.s32 %p7, %r17, %r16;
+ or.pred %p1, %p7, %p6;
+ mov.f32 %f77, 0f00000000;
+ mov.f32 %f73, %f77;
+ mov.f32 %f74, %f77;
+ mov.f32 %f75, %f77;
+ mov.f32 %f76, %f77;
+ @%p1 bra $L__BB0_7;
+
+ div.s32 %r125, %r26, %r27;
+ mul.lo.s32 %r126, %r125, %r87;
+ mul.lo.s32 %r127, %r125, %r27;
+ sub.s32 %r128, %r26, %r127;
+ shl.b32 %r129, %r128, 2;
+ shl.b32 %r130, %r14, 1;
+ add.s32 %r131, %r129, %r130;
+ div.s32 %r132, %r131, %r86;
+ mad.lo.s32 %r133, %r132, %r88, %r126;
+ mul.lo.s32 %r134, %r132, %r86;
+ sub.s32 %r135, %r131, %r134;
+ mad.lo.s32 %r136, %r135, %r89, %r133;
+ mul.wide.s32 %rd10, %r136, 4;
+ add.s64 %rd11, %rd3, %rd10;
+ ld.global.f32 %f73, [%rd11];
+ or.b32 %r137, %r131, 1;
+ div.s32 %r138, %r137, %r86;
+ mad.lo.s32 %r139, %r138, %r88, %r126;
+ mul.lo.s32 %r140, %r138, %r86;
+ sub.s32 %r141, %r137, %r140;
+ mad.lo.s32 %r142, %r141, %r89, %r139;
+ mul.wide.s32 %rd12, %r142, 4;
+ add.s64 %rd13, %rd3, %rd12;
+ ld.global.f32 %f74, [%rd13];
+ add.s32 %r143, %r131, 2;
+ div.s32 %r144, %r143, %r86;
+ mad.lo.s32 %r145, %r144, %r88, %r126;
+ mul.lo.s32 %r146, %r144, %r86;
+ sub.s32 %r147, %r143, %r146;
+ mad.lo.s32 %r148, %r147, %r89, %r145;
+ mul.wide.s32 %rd14, %r148, 4;
+ add.s64 %rd15, %rd3, %rd14;
+ ld.global.f32 %f75, [%rd15];
+ add.s32 %r149, %r131, 3;
+ div.s32 %r150, %r149, %r86;
+ mad.lo.s32 %r151, %r150, %r88, %r126;
+ mul.lo.s32 %r152, %r150, %r86;
+ sub.s32 %r153, %r149, %r152;
+ mad.lo.s32 %r154, %r153, %r89, %r151;
+ mul.wide.s32 %rd16, %r154, 4;
+ add.s64 %rd17, %rd3, %rd16;
+ ld.global.f32 %f76, [%rd17];
+
+$L__BB0_7:
+ add.s32 %r28, %r122, %r121;
+ mad.lo.s32 %r29, %r71, -1344, %r28;
+ shl.b32 %r30, %r71, 3;
+ shl.b32 %r31, %r14, 3;
+ neg.s32 %r157, %r31;
+ setp.ge.s32 %p8, %r29, %r157;
@%p8 bra $L__BB0_10;
- div.s32 %r137, %r29, %r30;
- mul.lo.s32 %r138, %r137, %r95;
- add.s32 %r139, %r32, %r28;
- div.s32 %r140, %r139, %r94;
- mad.lo.s32 %r141, %r140, %r96, %r138;
- mul.lo.s32 %r142, %r140, %r94;
- sub.s32 %r143, %r139, %r142;
- mad.lo.s32 %r144, %r143, %r97, %r141;
- mul.wide.s32 %rd10, %r144, 4;
- add.s64 %rd11, %rd3, %rd10;
- ld.global.f32 %f90, [%rd11];
+ rem.s32 %r158, %r26, %r27;
+ div.s32 %r159, %r158, %r30;
+ shl.b32 %r160, %r159, 2;
+ mul.lo.s32 %r161, %r159, %r30;
+ sub.s32 %r162, %r158, %r161;
+ shl.b32 %r163, %r162, 2;
+ add.s32 %r164, %r163, %r31;
+ div.s32 %r32, %r164, %r71;
+ shr.s32 %r165, %r32, 31;
+ shr.u32 %r166, %r165, 29;
+ add.s32 %r167, %r32, %r166;
+ shr.s32 %r168, %r167, 3;
+ add.s32 %r33, %r168, %r160;
+ setp.gt.s32 %p9, %r33, 11;
+ @%p9 bra $L__BB0_10;
+
+ div.s32 %r169, %r26, %r27;
+ mul.lo.s32 %r170, %r169, %r80;
+ and.b32 %r174, %r167, -8;
+ sub.s32 %r175, %r32, %r174;
+ mad.lo.s32 %r176, %r33, %r81, %r170;
+ mad.lo.s32 %r177, %r175, %r82, %r176;
+ mul.wide.s32 %rd18, %r177, 4;
+ add.s64 %rd19, %rd2, %rd18;
+ ld.global.f32 %f77, [%rd19];
$L__BB0_10:
- mov.f32 %f92, 0f00000000;
- mov.f32 %f91, %f92;
- @%p7 bra $L__BB0_13;
-
- rem.s32 %r145, %r29, %r30;
- shl.b32 %r33, %r145, 2;
- or.b32 %r146, %r33, 3;
- setp.ge.s32 %p10, %r146, %r31;
+ not.b32 %r178, %r31;
+ setp.ge.s32 %p10, %r29, %r178;
+ mov.f32 %f79, 0f00000000;
+ mov.f32 %f78, %f79;
@%p10 bra $L__BB0_13;
- div.s32 %r147, %r29, %r30;
- mul.lo.s32 %r148, %r147, %r95;
- add.s32 %r149, %r33, %r28;
- add.s32 %r150, %r149, 1;
- div.s32 %r151, %r150, %r94;
- mad.lo.s32 %r152, %r151, %r96, %r148;
- mul.lo.s32 %r153, %r151, %r94;
- sub.s32 %r154, %r150, %r153;
- mad.lo.s32 %r155, %r154, %r97, %r152;
- mul.wide.s32 %rd12, %r155, 4;
- add.s64 %rd13, %rd3, %rd12;
- ld.global.f32 %f91, [%rd13];
+ rem.s32 %r179, %r26, %r27;
+ div.s32 %r180, %r179, %r30;
+ mul.lo.s32 %r181, %r180, %r30;
+ sub.s32 %r182, %r179, %r181;
+ shl.b32 %r183, %r182, 2;
+ add.s32 %r184, %r183, %r31;
+ shl.b32 %r185, %r180, 2;
+ add.s32 %r186, %r184, 1;
+ div.s32 %r34, %r186, %r71;
+ shr.s32 %r187, %r34, 31;
+ shr.u32 %r188, %r187, 29;
+ add.s32 %r189, %r34, %r188;
+ shr.s32 %r190, %r189, 3;
+ add.s32 %r35, %r190, %r185;
+ setp.gt.s32 %p11, %r35, 11;
+ @%p11 bra $L__BB0_13;
+
+ div.s32 %r191, %r26, %r27;
+ mul.lo.s32 %r192, %r191, %r80;
+ and.b32 %r196, %r189, -8;
+ sub.s32 %r197, %r34, %r196;
+ mad.lo.s32 %r198, %r35, %r81, %r192;
+ mad.lo.s32 %r199, %r197, %r82, %r198;
+ mul.wide.s32 %rd20, %r199, 4;
+ add.s64 %rd21, %rd2, %rd20;
+ ld.global.f32 %f78, [%rd21];
$L__BB0_13:
- @%p7 bra $L__BB0_16;
-
- rem.s32 %r156, %r29, %r30;
- shl.b32 %r34, %r156, 2;
- or.b32 %r157, %r34, 3;
- setp.ge.s32 %p12, %r157, %r31;
+ mov.u32 %r200, -2;
+ sub.s32 %r201, %r200, %r31;
+ setp.ge.s32 %p12, %r29, %r201;
@%p12 bra $L__BB0_16;
- div.s32 %r158, %r29, %r30;
- mul.lo.s32 %r159, %r158, %r95;
- add.s32 %r160, %r34, %r28;
- add.s32 %r161, %r160, 2;
- div.s32 %r162, %r161, %r94;
- mad.lo.s32 %r163, %r162, %r96, %r159;
- mul.lo.s32 %r164, %r162, %r94;
- sub.s32 %r165, %r161, %r164;
- mad.lo.s32 %r166, %r165, %r97, %r163;
- mul.wide.s32 %rd14, %r166, 4;
- add.s64 %rd15, %rd3, %rd14;
- ld.global.f32 %f92, [%rd15];
+ rem.s32 %r202, %r26, %r27;
+ div.s32 %r203, %r202, %r30;
+ mul.lo.s32 %r204, %r203, %r30;
+ sub.s32 %r205, %r202, %r204;
+ shl.b32 %r206, %r205, 2;
+ add.s32 %r207, %r206, %r31;
+ shl.b32 %r208, %r203, 2;
+ add.s32 %r209, %r207, 2;
+ div.s32 %r36, %r209, %r71;
+ shr.s32 %r210, %r36, 31;
+ shr.u32 %r211, %r210, 29;
+ add.s32 %r212, %r36, %r211;
+ shr.s32 %r213, %r212, 3;
+ add.s32 %r37, %r213, %r208;
+ setp.gt.s32 %p13, %r37, 11;
+ @%p13 bra $L__BB0_16;
+
+ div.s32 %r214, %r26, %r27;
+ mul.lo.s32 %r215, %r214, %r80;
+ and.b32 %r219, %r212, -8;
+ sub.s32 %r220, %r36, %r219;
+ mad.lo.s32 %r221, %r37, %r81, %r215;
+ mad.lo.s32 %r222, %r220, %r82, %r221;
+ mul.wide.s32 %rd22, %r222, 4;
+ add.s64 %rd23, %rd2, %rd22;
+ ld.global.f32 %f79, [%rd23];
$L__BB0_16:
- mov.f32 %f93, 0f00000000;
- @%p7 bra $L__BB0_19;
-
- rem.s32 %r167, %r29, %r30;
- shl.b32 %r168, %r167, 2;
- or.b32 %r35, %r168, 3;
- setp.ge.s32 %p14, %r35, %r31;
+ mov.u32 %r223, -3;
+ sub.s32 %r224, %r223, %r31;
+ setp.ge.s32 %p14, %r29, %r224;
+ mov.f32 %f81, 0f00000000;
+ mov.f32 %f80, %f81;
@%p14 bra $L__BB0_19;
- div.s32 %r169, %r29, %r30;
- mul.lo.s32 %r170, %r169, %r95;
- add.s32 %r171, %r35, %r28;
- div.s32 %r172, %r171, %r94;
- mad.lo.s32 %r173, %r172, %r96, %r170;
- mul.lo.s32 %r174, %r172, %r94;
- sub.s32 %r175, %r171, %r174;
- mad.lo.s32 %r176, %r175, %r97, %r173;
- mul.wide.s32 %rd16, %r176, 4;
- add.s64 %rd17, %rd3, %rd16;
- ld.global.f32 %f93, [%rd17];
+ rem.s32 %r225, %r26, %r27;
+ div.s32 %r226, %r225, %r30;
+ mul.lo.s32 %r227, %r226, %r30;
+ sub.s32 %r228, %r225, %r227;
+ shl.b32 %r229, %r228, 2;
+ add.s32 %r230, %r229, %r31;
+ shl.b32 %r231, %r226, 2;
+ add.s32 %r232, %r230, 3;
+ div.s32 %r38, %r232, %r71;
+ shr.s32 %r233, %r38, 31;
+ shr.u32 %r234, %r233, 29;
+ add.s32 %r235, %r38, %r234;
+ shr.s32 %r236, %r235, 3;
+ add.s32 %r39, %r236, %r231;
+ setp.gt.s32 %p15, %r39, 11;
+ @%p15 bra $L__BB0_19;
+
+ div.s32 %r237, %r26, %r27;
+ mul.lo.s32 %r238, %r237, %r80;
+ and.b32 %r242, %r235, -8;
+ sub.s32 %r243, %r38, %r242;
+ mad.lo.s32 %r244, %r39, %r81, %r238;
+ mad.lo.s32 %r245, %r243, %r82, %r244;
+ mul.wide.s32 %rd24, %r245, 4;
+ add.s64 %rd25, %rd2, %rd24;
+ ld.global.f32 %f80, [%rd25];
$L__BB0_19:
- add.s32 %r36, %r132, %r131;
- mad.lo.s32 %r37, %r79, -1344, %r36;
- shl.b32 %r38, %r79, 3;
- shl.b32 %r39, %r15, 3;
- neg.s32 %r179, %r39;
- setp.ge.s32 %p15, %r37, %r179;
- mov.f32 %f95, 0f00000000;
- mov.f32 %f94, %f95;
- @%p15 bra $L__BB0_22;
-
- rem.s32 %r180, %r29, %r30;
- div.s32 %r181, %r180, %r38;
- shl.b32 %r182, %r181, 2;
- mul.lo.s32 %r183, %r181, %r38;
- sub.s32 %r184, %r180, %r183;
- shl.b32 %r185, %r184, 2;
- add.s32 %r186, %r185, %r39;
- div.s32 %r40, %r186, %r79;
- shr.s32 %r187, %r40, 31;
- shr.u32 %r188, %r187, 29;
- add.s32 %r189, %r40, %r188;
- shr.s32 %r190, %r189, 3;
- add.s32 %r41, %r190, %r182;
- setp.gt.s32 %p16, %r41, 11;
- @%p16 bra $L__BB0_22;
-
- div.s32 %r191, %r29, %r30;
- mul.lo.s32 %r192, %r191, %r88;
- and.b32 %r196, %r189, -8;
- sub.s32 %r197, %r40, %r196;
- mad.lo.s32 %r198, %r41, %r89, %r192;
- mad.lo.s32 %r199, %r197, %r90, %r198;
- mul.wide.s32 %rd18, %r199, 4;
- add.s64 %rd19, %rd2, %rd18;
- ld.global.f32 %f94, [%rd19];
-
-$L__BB0_22:
- not.b32 %r200, %r39;
- setp.ge.s32 %p17, %r37, %r200;
- @%p17 bra $L__BB0_25;
-
- rem.s32 %r201, %r29, %r30;
- div.s32 %r202, %r201, %r38;
- mul.lo.s32 %r203, %r202, %r38;
- sub.s32 %r204, %r201, %r203;
- shl.b32 %r205, %r204, 2;
- add.s32 %r206, %r205, %r39;
- shl.b32 %r207, %r202, 2;
- add.s32 %r208, %r206, 1;
- div.s32 %r42, %r208, %r79;
- shr.s32 %r209, %r42, 31;
- shr.u32 %r210, %r209, 29;
- add.s32 %r211, %r42, %r210;
- shr.s32 %r212, %r211, 3;
- add.s32 %r43, %r212, %r207;
- setp.gt.s32 %p18, %r43, 11;
- @%p18 bra $L__BB0_25;
-
- div.s32 %r213, %r29, %r30;
- mul.lo.s32 %r214, %r213, %r88;
- and.b32 %r218, %r211, -8;
- sub.s32 %r219, %r42, %r218;
- mad.lo.s32 %r220, %r43, %r89, %r214;
- mad.lo.s32 %r221, %r219, %r90, %r220;
- mul.wide.s32 %rd20, %r221, 4;
- add.s64 %rd21, %rd2, %rd20;
- ld.global.f32 %f95, [%rd21];
-
-$L__BB0_25:
- mov.u32 %r222, -2;
- sub.s32 %r223, %r222, %r39;
- setp.ge.s32 %p19, %r37, %r223;
- mov.f32 %f97, 0f00000000;
- mov.f32 %f96, %f97;
- @%p19 bra $L__BB0_28;
-
- rem.s32 %r224, %r29, %r30;
- div.s32 %r225, %r224, %r38;
- mul.lo.s32 %r226, %r225, %r38;
- sub.s32 %r227, %r224, %r226;
- shl.b32 %r228, %r227, 2;
- add.s32 %r229, %r228, %r39;
- shl.b32 %r230, %r225, 2;
- add.s32 %r231, %r229, 2;
- div.s32 %r44, %r231, %r79;
- shr.s32 %r232, %r44, 31;
- shr.u32 %r233, %r232, 29;
- add.s32 %r234, %r44, %r233;
- shr.s32 %r235, %r234, 3;
- add.s32 %r45, %r235, %r230;
- setp.gt.s32 %p20, %r45, 11;
- @%p20 bra $L__BB0_28;
-
- div.s32 %r236, %r29, %r30;
- mul.lo.s32 %r237, %r236, %r88;
- and.b32 %r241, %r234, -8;
- sub.s32 %r242, %r44, %r241;
- mad.lo.s32 %r243, %r45, %r89, %r237;
- mad.lo.s32 %r244, %r242, %r90, %r243;
- mul.wide.s32 %rd22, %r244, 4;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.f32 %f96, [%rd23];
-
-$L__BB0_28:
- mov.u32 %r245, -3;
- sub.s32 %r246, %r245, %r39;
- setp.ge.s32 %p21, %r37, %r246;
- @%p21 bra $L__BB0_31;
-
- rem.s32 %r247, %r29, %r30;
- div.s32 %r248, %r247, %r38;
- mul.lo.s32 %r249, %r248, %r38;
- sub.s32 %r250, %r247, %r249;
+ mov.f32 %f82, %f81;
+ mov.f32 %f83, %f81;
+ mov.f32 %f84, %f81;
+ @%p1 bra $L__BB0_21;
+
+ div.s32 %r246, %r26, %r27;
+ mul.lo.s32 %r247, %r246, %r72;
+ mul.lo.s32 %r248, %r246, %r27;
+ sub.s32 %r249, %r26, %r248;
+ div.s32 %r250, %r249, %r30;
shl.b32 %r251, %r250, 2;
- add.s32 %r252, %r251, %r39;
- shl.b32 %r253, %r248, 2;
- add.s32 %r254, %r252, 3;
- div.s32 %r46, %r254, %r79;
- shr.s32 %r255, %r46, 31;
- shr.u32 %r256, %r255, 29;
- add.s32 %r257, %r46, %r256;
- shr.s32 %r258, %r257, 3;
- add.s32 %r47, %r258, %r253;
- setp.gt.s32 %p22, %r47, 11;
- @%p22 bra $L__BB0_31;
-
- div.s32 %r259, %r29, %r30;
- mul.lo.s32 %r260, %r259, %r88;
- and.b32 %r264, %r257, -8;
- sub.s32 %r265, %r46, %r264;
- mad.lo.s32 %r266, %r47, %r89, %r260;
- mad.lo.s32 %r267, %r265, %r90, %r266;
- mul.wide.s32 %rd24, %r267, 4;
- add.s64 %rd25, %rd2, %rd24;
- ld.global.f32 %f97, [%rd25];
-
-$L__BB0_31:
- setp.ge.s32 %p23, %r27, %r16;
- or.pred %p25, %p6, %p23;
- mov.f32 %f98, 0f00000000;
- mov.f32 %f99, %f98;
- mov.f32 %f100, %f98;
- mov.f32 %f101, %f98;
- @%p25 bra $L__BB0_33;
-
- div.s32 %r268, %r29, %r30;
- mul.lo.s32 %r269, %r268, %r80;
- mul.lo.s32 %r270, %r268, %r30;
- sub.s32 %r271, %r29, %r270;
- div.s32 %r272, %r271, %r38;
- shl.b32 %r273, %r272, 2;
- mul.lo.s32 %r274, %r272, %r38;
- sub.s32 %r275, %r271, %r274;
- shl.b32 %r276, %r275, 2;
- shl.b32 %r277, %r15, 5;
- add.s32 %r278, %r276, %r277;
- div.s32 %r279, %r278, %r79;
- shr.s32 %r280, %r279, 31;
- shr.u32 %r281, %r280, 29;
- add.s32 %r282, %r279, %r281;
- shr.s32 %r283, %r282, 3;
- and.b32 %r284, %r282, -8;
- sub.s32 %r285, %r279, %r284;
- mul.lo.s32 %r286, %r279, %r79;
- sub.s32 %r287, %r278, %r286;
- add.s32 %r288, %r283, %r273;
- mad.lo.s32 %r289, %r288, %r81, %r269;
- mad.lo.s32 %r290, %r285, %r82, %r289;
- mad.lo.s32 %r291, %r287, %r83, %r290;
- mul.wide.s32 %rd26, %r291, 4;
+ mul.lo.s32 %r252, %r250, %r30;
+ sub.s32 %r253, %r249, %r252;
+ shl.b32 %r254, %r253, 2;
+ shl.b32 %r255, %r14, 5;
+ add.s32 %r256, %r254, %r255;
+ div.s32 %r257, %r256, %r71;
+ shr.s32 %r258, %r257, 31;
+ shr.u32 %r259, %r258, 29;
+ add.s32 %r260, %r257, %r259;
+ shr.s32 %r261, %r260, 3;
+ and.b32 %r262, %r260, -8;
+ sub.s32 %r263, %r257, %r262;
+ mul.lo.s32 %r264, %r257, %r71;
+ sub.s32 %r265, %r256, %r264;
+ add.s32 %r266, %r261, %r251;
+ mad.lo.s32 %r267, %r266, %r73, %r247;
+ mad.lo.s32 %r268, %r263, %r74, %r267;
+ mad.lo.s32 %r269, %r265, %r75, %r268;
+ mul.wide.s32 %rd26, %r269, 4;
add.s64 %rd27, %rd1, %rd26;
- ld.global.f32 %f101, [%rd27];
- or.b32 %r292, %r278, 1;
- div.s32 %r293, %r292, %r79;
- shr.s32 %r294, %r293, 31;
- shr.u32 %r295, %r294, 29;
- add.s32 %r296, %r293, %r295;
- shr.s32 %r297, %r296, 3;
- and.b32 %r298, %r296, -8;
- sub.s32 %r299, %r293, %r298;
- mul.lo.s32 %r300, %r293, %r79;
- sub.s32 %r301, %r292, %r300;
- add.s32 %r302, %r297, %r273;
- mad.lo.s32 %r303, %r302, %r81, %r269;
- mad.lo.s32 %r304, %r299, %r82, %r303;
- mad.lo.s32 %r305, %r301, %r83, %r304;
- mul.wide.s32 %rd28, %r305, 4;
+ ld.global.f32 %f84, [%rd27];
+ or.b32 %r270, %r256, 1;
+ div.s32 %r271, %r270, %r71;
+ shr.s32 %r272, %r271, 31;
+ shr.u32 %r273, %r272, 29;
+ add.s32 %r274, %r271, %r273;
+ shr.s32 %r275, %r274, 3;
+ and.b32 %r276, %r274, -8;
+ sub.s32 %r277, %r271, %r276;
+ mul.lo.s32 %r278, %r271, %r71;
+ sub.s32 %r279, %r270, %r278;
+ add.s32 %r280, %r275, %r251;
+ mad.lo.s32 %r281, %r280, %r73, %r247;
+ mad.lo.s32 %r282, %r277, %r74, %r281;
+ mad.lo.s32 %r283, %r279, %r75, %r282;
+ mul.wide.s32 %rd28, %r283, 4;
add.s64 %rd29, %rd1, %rd28;
- ld.global.f32 %f100, [%rd29];
- or.b32 %r306, %r278, 2;
- div.s32 %r307, %r306, %r79;
- shr.s32 %r308, %r307, 31;
- shr.u32 %r309, %r308, 29;
- add.s32 %r310, %r307, %r309;
- shr.s32 %r311, %r310, 3;
- and.b32 %r312, %r310, -8;
- sub.s32 %r313, %r307, %r312;
- mul.lo.s32 %r314, %r307, %r79;
- sub.s32 %r315, %r306, %r314;
- add.s32 %r316, %r311, %r273;
- mad.lo.s32 %r317, %r316, %r81, %r269;
- mad.lo.s32 %r318, %r313, %r82, %r317;
- mad.lo.s32 %r319, %r315, %r83, %r318;
- mul.wide.s32 %rd30, %r319, 4;
+ ld.global.f32 %f83, [%rd29];
+ or.b32 %r284, %r256, 2;
+ div.s32 %r285, %r284, %r71;
+ shr.s32 %r286, %r285, 31;
+ shr.u32 %r287, %r286, 29;
+ add.s32 %r288, %r285, %r287;
+ shr.s32 %r289, %r288, 3;
+ and.b32 %r290, %r288, -8;
+ sub.s32 %r291, %r285, %r290;
+ mul.lo.s32 %r292, %r285, %r71;
+ sub.s32 %r293, %r284, %r292;
+ add.s32 %r294, %r289, %r251;
+ mad.lo.s32 %r295, %r294, %r73, %r247;
+ mad.lo.s32 %r296, %r291, %r74, %r295;
+ mad.lo.s32 %r297, %r293, %r75, %r296;
+ mul.wide.s32 %rd30, %r297, 4;
add.s64 %rd31, %rd1, %rd30;
- ld.global.f32 %f99, [%rd31];
- or.b32 %r320, %r278, 3;
- div.s32 %r321, %r320, %r79;
- shr.s32 %r322, %r321, 31;
- shr.u32 %r323, %r322, 29;
- add.s32 %r324, %r321, %r323;
- shr.s32 %r325, %r324, 3;
- and.b32 %r326, %r324, -8;
- sub.s32 %r327, %r321, %r326;
- mul.lo.s32 %r328, %r321, %r79;
- sub.s32 %r329, %r320, %r328;
- add.s32 %r330, %r325, %r273;
- mad.lo.s32 %r331, %r330, %r81, %r269;
- mad.lo.s32 %r332, %r327, %r82, %r331;
- mad.lo.s32 %r333, %r329, %r83, %r332;
- mul.wide.s32 %rd32, %r333, 4;
+ ld.global.f32 %f82, [%rd31];
+ or.b32 %r298, %r256, 3;
+ div.s32 %r299, %r298, %r71;
+ shr.s32 %r300, %r299, 31;
+ shr.u32 %r301, %r300, 29;
+ add.s32 %r302, %r299, %r301;
+ shr.s32 %r303, %r302, 3;
+ and.b32 %r304, %r302, -8;
+ sub.s32 %r305, %r299, %r304;
+ mul.lo.s32 %r306, %r299, %r71;
+ sub.s32 %r307, %r298, %r306;
+ add.s32 %r308, %r303, %r251;
+ mad.lo.s32 %r309, %r308, %r73, %r247;
+ mad.lo.s32 %r310, %r305, %r74, %r309;
+ mad.lo.s32 %r311, %r307, %r75, %r310;
+ mul.wide.s32 %rd32, %r311, 4;
add.s64 %rd33, %rd1, %rd32;
- ld.global.f32 %f98, [%rd33];
-
-$L__BB0_33:
- add.f32 %f56, %f101, %f94;
- add.f32 %f28, %f56, %f90;
- add.f32 %f57, %f100, %f95;
- add.f32 %f29, %f57, %f91;
- add.f32 %f58, %f99, %f96;
- add.f32 %f30, %f58, %f92;
- add.f32 %f59, %f98, %f97;
- add.f32 %f31, %f59, %f93;
- @%p25 bra $L__BB0_36;
-
- mov.b32 %r337, %f31;
- mul.wide.s32 %rd35, %r36, 4;
+ ld.global.f32 %f81, [%rd33];
+
+$L__BB0_21:
+ add.f32 %f45, %f84, %f77;
+ add.f32 %f25, %f45, %f73;
+ add.f32 %f46, %f83, %f78;
+ add.f32 %f26, %f46, %f74;
+ add.f32 %f47, %f82, %f79;
+ add.f32 %f27, %f47, %f75;
+ add.f32 %f48, %f81, %f80;
+ add.f32 %f28, %f48, %f76;
+ @%p1 bra $L__BB0_24;
+
+ mov.b32 %r315, %f28;
+ mul.wide.s32 %rd35, %r28, 4;
add.s64 %rd34, %rd8, %rd35;
- mov.b32 %r334, %f28;
- mov.b32 %r335, %f29;
- mov.b32 %r336, %f30;
+ mov.b32 %r312, %f25;
+ mov.b32 %r313, %f26;
+ mov.b32 %r314, %f27;
- st.global.cs.v4.s32 [%rd34], {%r334,%r335,%r336,%r337};
+ st.global.cs.v4.s32 [%rd34], {%r312,%r313,%r314,%r315};
-$L__BB0_36:
+$L__BB0_24:
ret;
}
7: GpuViewTest.FusionReshapeMagicSchedule8
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-19
+19 index type: int
registers: 80→ 72
gmem: 27
static smem: 16
stack frame: 32
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 5, 5> T0, Tensor<float, 5, 5> T3, Tensor<float, 3, 3> T9) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 32, 1> T6;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
if ((((((nvfuser_index_t)threadIdx.x) + 32) < 63) && ((-12 + ((nvfuser_index_t)threadIdx.z)) < (-(4 * (i0 + nvfuser_zero)))))) {
Array<float, 8, 1> T11;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
T11[((2 * i1) + i2)]
= T3[((((((T3.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.z)) + (T3.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x))) + (T3.alloc_stride[0LL] * (((4 * i0) + i1) / 5))) + (T3.alloc_stride[1LL] * (((4 * i0) + i1) % 5))) + (T3.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) / 7))) + (T3.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) % 7)))];
}
}
Array<float, 8, 1> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
T10[((2 * i3) + i4)]
= T0[((((((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.z)) + (T0.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[0LL] * (((4 * i0) + i3) / 3))) + (T0.alloc_stride[1LL] * (((4 * i0) + i3) % 3))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) / 9))) + (T0.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) % 9)))];
}
}
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 4; ++i5) {
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
Array<float, 1, 1> T1;
T1[0]
= sinf(T10[((2 * i5) + i6)]);
Array<float, 1, 1> T2;
T2[0]
= T1[0];
Array<float, 1, 1> T4;
T4[0]
= cosf(T11[((2 * i5) + i6)]);
Array<float, 1, 1> T5;
T5[0]
= T4[0];
T6[(((8 * i0) + (2 * i5)) + i6)]
= T2[0]
+ T5[0];
}
}
} else {
Array<float, 8, 1> T11;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i1 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i2 + nvfuser_zero)))))) {
T11[((2 * i1) + i2)]
= T3[((((((T3.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.z)) + (T3.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x))) + (T3.alloc_stride[0LL] * (((4 * i0) + i1) / 5))) + (T3.alloc_stride[1LL] * (((4 * i0) + i1) % 5))) + (T3.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) / 7))) + (T3.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) % 7)))];
}
}
}
Array<float, 8, 1> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i3 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i4 + nvfuser_zero)))))) {
T10[((2 * i3) + i4)]
= T0[((((((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.z)) + (T0.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[0LL] * (((4 * i0) + i3) / 3))) + (T0.alloc_stride[1LL] * (((4 * i0) + i3) % 3))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) / 9))) + (T0.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) % 9)))];
}
}
}
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 4; ++i5) {
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
Array<float, 1, 1> T1;
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T1[0]
= sinf(T10[((2 * i5) + i6)]);
}
Array<float, 1, 1> T2;
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T2[0]
= T1[0];
}
Array<float, 1, 1> T4;
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T4[0]
= cosf(T11[((2 * i5) + i6)]);
}
Array<float, 1, 1> T5;
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T5[0]
= T4[0];
}
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T6[(((8 * i0) + (2 * i5)) + i6)]
= T2[0]
+ T5[0];
}
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T13;
T13[0] = 0.000000000e+00f;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
if ((((((nvfuser_index_t)threadIdx.x) + 32) < 63) && ((-12 + ((nvfuser_index_t)threadIdx.z)) < (-(4 * (i7 + nvfuser_zero)))))) {
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T13[0]
= T13[0]
+ T6[(((8 * i7) + (2 * i8)) + i9)];
}
}
} else {
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i7)) - (i8 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i9 + nvfuser_zero)))))) {
T13[0]
= T13[0]
+ T6[(((8 * i7) + (2 * i8)) + i9)];
}
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, true, true>(T7[0], T13[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T8;
broadcast::blockBroadcast<true, false, true, true>(T8[0], T7[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) + 32) < 63) && ((-12 + ((nvfuser_index_t)threadIdx.z)) < (-(4 * (i10 + nvfuser_zero)))))) {
Array<float, 8, 1> T12;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T12[((2 * i11) + i12)]
= T6[(((8 * i10) + (2 * i11)) + i12)]
+ T8[0];
}
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T9[(((((((nvfuser_index_t)threadIdx.x) + (2646 * ((nvfuser_index_t)threadIdx.z))) + (63 * ((nvfuser_index_t)blockIdx.x))) + (10584 * i10)) + (2646 * i13)) + (32 * (i14 + nvfuser_zero)))]
= T12[((2 * i13) + i14)];
}
}
} else {
Array<float, 8, 1> T12;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i10)) - (i11 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i12 + nvfuser_zero)))))) {
T12[((2 * i11) + i12)]
= T6[(((8 * i10) + (2 * i11)) + i12)]
+ T8[0];
}
}
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i10)) - (i13 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i14 + nvfuser_zero)))))) {
T9[(((((((nvfuser_index_t)threadIdx.x) + (2646 * ((nvfuser_index_t)threadIdx.z))) + (63 * ((nvfuser_index_t)blockIdx.x))) + (10584 * i10)) + (2646 * i13)) + (32 * (i14 + nvfuser_zero)))]
= T12[((2 * i13) + i14)];
}
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
__global__ void nvfuser_N(Tensor<float, 5, 5> T0, Tensor<float, 5, 5> T3, Tensor<float, 3, 3> T9) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 32, 1> T6;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
if ((((32 + ((nvfuser_index_t)threadIdx.x)) < 63) && ((3 + (4 * (i0 + nvfuser_zero))) < 15))) {
Array<float, 8, 1> T11;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
T11[((2 * i1) + i2)]
= T3[(((((T3.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x)) + (T3.alloc_stride[0LL] * (((4 * i0) + i1) / 5))) + (T3.alloc_stride[1LL] * (((4 * i0) + i1) % 5))) + (T3.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) / 7))) + (T3.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) % 7)))];
}
}
Array<float, 8, 1> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
T10[((2 * i3) + i4)]
= T0[(((((T0.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x)) + (T0.alloc_stride[0LL] * (((4 * i0) + i3) / 3))) + (T0.alloc_stride[1LL] * (((4 * i0) + i3) % 3))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) / 9))) + (T0.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) % 9)))];
}
}
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 4; ++i5) {
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
Array<float, 1, 1> T1;
T1[0]
= sinf(T10[((2 * i5) + i6)]);
Array<float, 1, 1> T2;
T2[0]
= T1[0];
Array<float, 1, 1> T4;
T4[0]
= cosf(T11[((2 * i5) + i6)]);
Array<float, 1, 1> T5;
T5[0]
= T4[0];
T6[(((8 * i0) + (2 * i5)) + i6)]
= T2[0]
+ T5[0];
}
}
} else {
Array<float, 8, 1> T11;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
if (((((4 * i0) + (i1 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i2 + nvfuser_zero)))))) {
T11[((2 * i1) + i2)]
= T3[(((((T3.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x)) + (T3.alloc_stride[0LL] * (((4 * i0) + i1) / 5))) + (T3.alloc_stride[1LL] * (((4 * i0) + i1) % 5))) + (T3.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) / 7))) + (T3.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) % 7)))];
}
}
}
Array<float, 8, 1> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
if (((((4 * i0) + (i3 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i4 + nvfuser_zero)))))) {
T10[((2 * i3) + i4)]
= T0[(((((T0.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x)) + (T0.alloc_stride[0LL] * (((4 * i0) + i3) / 3))) + (T0.alloc_stride[1LL] * (((4 * i0) + i3) % 3))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) / 9))) + (T0.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) % 9)))];
}
}
}
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 4; ++i5) {
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
Array<float, 1, 1> T1;
if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T1[0]
= sinf(T10[((2 * i5) + i6)]);
}
Array<float, 1, 1> T2;
if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T2[0]
= T1[0];
}
Array<float, 1, 1> T4;
if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T4[0]
= cosf(T11[((2 * i5) + i6)]);
}
Array<float, 1, 1> T5;
if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T5[0]
= T4[0];
}
if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T6[(((8 * i0) + (2 * i5)) + i6)]
= T2[0]
+ T5[0];
}
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T13;
T13[0] = 0.000000000e+00f;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
if ((((32 + ((nvfuser_index_t)threadIdx.x)) < 63) && ((3 + (4 * (i7 + nvfuser_zero))) < 15))) {
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T13[0]
= T13[0]
+ T6[(((8 * i7) + (2 * i8)) + i9)];
}
}
} else {
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
if (((((4 * i7) + (i8 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i9 + nvfuser_zero)))))) {
T13[0]
= T13[0]
+ T6[(((8 * i7) + (2 * i8)) + i9)];
}
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, true, true>(T7[0], T13[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T8;
broadcast::blockBroadcast<true, false, true, true>(T8[0], T7[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
if ((((32 + ((nvfuser_index_t)threadIdx.x)) < 63) && ((3 + (4 * (i10 + nvfuser_zero))) < 15))) {
Array<float, 8, 1> T12;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T12[((2 * i11) + i12)]
= T6[(((8 * i10) + (2 * i11)) + i12)]
+ T8[0];
}
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T9[((((((nvfuser_index_t)threadIdx.x) + (63 * ((nvfuser_index_t)blockIdx.x))) + (10584 * i10)) + (2646 * i13)) + (32 * (i14 + nvfuser_zero)))]
= T12[((2 * i13) + i14)];
}
}
} else {
Array<float, 8, 1> T12;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if (((((4 * i10) + (i11 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i12 + nvfuser_zero)))))) {
T12[((2 * i11) + i12)]
= T6[(((8 * i10) + (2 * i11)) + i12)]
+ T8[0];
}
}
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
if (((((4 * i10) + (i13 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i14 + nvfuser_zero)))))) {
T9[((((((nvfuser_index_t)threadIdx.x) + (63 * ((nvfuser_index_t)blockIdx.x))) + (10584 * i10)) + (2646 * i13)) + (32 * (i14 + nvfuser_zero)))]
= T12[((2 * i13) + i14)];
}
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
--- 53997da5d
+++ 03a1b695e
@@ -3,27 +3,27 @@
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 32, 1> T6;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
- if ((((((nvfuser_index_t)threadIdx.x) + 32) < 63) && ((-12 + ((nvfuser_index_t)threadIdx.z)) < (-(4 * (i0 + nvfuser_zero)))))) {
+ if ((((32 + ((nvfuser_index_t)threadIdx.x)) < 63) && ((3 + (4 * (i0 + nvfuser_zero))) < 15))) {
Array<float, 8, 1> T11;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
T11[((2 * i1) + i2)]
- = T3[((((((T3.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.z)) + (T3.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x))) + (T3.alloc_stride[0LL] * (((4 * i0) + i1) / 5))) + (T3.alloc_stride[1LL] * (((4 * i0) + i1) % 5))) + (T3.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) / 7))) + (T3.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) % 7)))];
+ = T3[(((((T3.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x)) + (T3.alloc_stride[0LL] * (((4 * i0) + i1) / 5))) + (T3.alloc_stride[1LL] * (((4 * i0) + i1) % 5))) + (T3.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) / 7))) + (T3.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) % 7)))];
}
}
Array<float, 8, 1> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
T10[((2 * i3) + i4)]
- = T0[((((((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.z)) + (T0.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[0LL] * (((4 * i0) + i3) / 3))) + (T0.alloc_stride[1LL] * (((4 * i0) + i3) % 3))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) / 9))) + (T0.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) % 9)))];
+ = T0[(((((T0.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x)) + (T0.alloc_stride[0LL] * (((4 * i0) + i3) / 3))) + (T0.alloc_stride[1LL] * (((4 * i0) + i3) % 3))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) / 9))) + (T0.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) % 9)))];
}
}
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 4; ++i5) {
#pragma unroll
@@ -49,52 +49,52 @@
Array<float, 8, 1> T11;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 2; ++i2) {
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i1 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i2 + nvfuser_zero)))))) {
+ if (((((4 * i0) + (i1 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i2 + nvfuser_zero)))))) {
T11[((2 * i1) + i2)]
- = T3[((((((T3.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.z)) + (T3.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x))) + (T3.alloc_stride[0LL] * (((4 * i0) + i1) / 5))) + (T3.alloc_stride[1LL] * (((4 * i0) + i1) % 5))) + (T3.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) / 7))) + (T3.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) % 7)))];
+ = T3[(((((T3.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x)) + (T3.alloc_stride[0LL] * (((4 * i0) + i1) / 5))) + (T3.alloc_stride[1LL] * (((4 * i0) + i1) % 5))) + (T3.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) / 7))) + (T3.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i2 + nvfuser_zero))) % 7)))];
}
}
}
Array<float, 8, 1> T10;
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 2; ++i4) {
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i3 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i4 + nvfuser_zero)))))) {
+ if (((((4 * i0) + (i3 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i4 + nvfuser_zero)))))) {
T10[((2 * i3) + i4)]
- = T0[((((((T0.alloc_stride[1LL] * ((nvfuser_index_t)threadIdx.z)) + (T0.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x))) + (T0.alloc_stride[0LL] * (((4 * i0) + i3) / 3))) + (T0.alloc_stride[1LL] * (((4 * i0) + i3) % 3))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) / 9))) + (T0.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) % 9)))];
+ = T0[(((((T0.alloc_stride[2LL] * ((nvfuser_index_t)blockIdx.x)) + (T0.alloc_stride[0LL] * (((4 * i0) + i3) / 3))) + (T0.alloc_stride[1LL] * (((4 * i0) + i3) % 3))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) / 9))) + (T0.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) + (32 * (i4 + nvfuser_zero))) % 9)))];
}
}
}
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 4; ++i5) {
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
Array<float, 1, 1> T1;
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
+ if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T1[0]
= sinf(T10[((2 * i5) + i6)]);
}
Array<float, 1, 1> T2;
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
+ if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T2[0]
= T1[0];
}
Array<float, 1, 1> T4;
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
+ if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T4[0]
= cosf(T11[((2 * i5) + i6)]);
}
Array<float, 1, 1> T5;
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
+ if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T5[0]
= T4[0];
}
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i0)) - (i5 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
+ if (((((4 * i0) + (i5 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i6 + nvfuser_zero)))))) {
T6[(((8 * i0) + (2 * i5)) + i6)]
= T2[0]
+ T5[0];
}
}
@@ -104,11 +104,11 @@
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 1, 1> T13;
T13[0] = 0.000000000e+00f;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
- if ((((((nvfuser_index_t)threadIdx.x) + 32) < 63) && ((-12 + ((nvfuser_index_t)threadIdx.z)) < (-(4 * (i7 + nvfuser_zero)))))) {
+ if ((((32 + ((nvfuser_index_t)threadIdx.x)) < 63) && ((3 + (4 * (i7 + nvfuser_zero))) < 15))) {
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T13[0]
@@ -119,11 +119,11 @@
} else {
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i7)) - (i8 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i9 + nvfuser_zero)))))) {
+ if (((((4 * i7) + (i8 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i9 + nvfuser_zero)))))) {
T13[0]
= T13[0]
+ T6[(((8 * i7) + (2 * i8)) + i9)];
}
}
@@ -136,11 +136,11 @@
blockReduce<true, false, true, true>(T7[0], T13[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T8;
broadcast::blockBroadcast<true, false, true, true>(T8[0], T7[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
- if ((((((nvfuser_index_t)threadIdx.x) + 32) < 63) && ((-12 + ((nvfuser_index_t)threadIdx.z)) < (-(4 * (i10 + nvfuser_zero)))))) {
+ if ((((32 + ((nvfuser_index_t)threadIdx.x)) < 63) && ((3 + (4 * (i10 + nvfuser_zero))) < 15))) {
Array<float, 8, 1> T12;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
@@ -151,33 +151,33 @@
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
- T9[(((((((nvfuser_index_t)threadIdx.x) + (2646 * ((nvfuser_index_t)threadIdx.z))) + (63 * ((nvfuser_index_t)blockIdx.x))) + (10584 * i10)) + (2646 * i13)) + (32 * (i14 + nvfuser_zero)))]
+ T9[((((((nvfuser_index_t)threadIdx.x) + (63 * ((nvfuser_index_t)blockIdx.x))) + (10584 * i10)) + (2646 * i13)) + (32 * (i14 + nvfuser_zero)))]
= T12[((2 * i13) + i14)];
}
}
} else {
Array<float, 8, 1> T12;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i10)) - (i11 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i12 + nvfuser_zero)))))) {
+ if (((((4 * i10) + (i11 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i12 + nvfuser_zero)))))) {
T12[((2 * i11) + i12)]
= T6[(((8 * i10) + (2 * i11)) + i12)]
+ T8[0];
}
}
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
- if ((((-15 + ((nvfuser_index_t)threadIdx.z)) < ((-(4 * i10)) - (i13 + nvfuser_zero))) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i14 + nvfuser_zero)))))) {
- T9[(((((((nvfuser_index_t)threadIdx.x) + (2646 * ((nvfuser_index_t)threadIdx.z))) + (63 * ((nvfuser_index_t)blockIdx.x))) + (10584 * i10)) + (2646 * i13)) + (32 * (i14 + nvfuser_zero)))]
+ if (((((4 * i10) + (i13 + nvfuser_zero)) < 15) && ((-63 + ((nvfuser_index_t)threadIdx.x)) < (-(32 * (i14 + nvfuser_zero)))))) {
+ T9[((((((nvfuser_index_t)threadIdx.x) + (63 * ((nvfuser_index_t)blockIdx.x))) + (10584 * i10)) + (2646 * i13)) + (32 * (i14 + nvfuser_zero)))]
= T12[((2 * i13) + i14)];
}
}
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_191105arrayE[];
.global .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_2[32]
)
{
.local .align 4 .b8 __local_depot0[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<1830>;
.reg .f32 %f<5935>;
.reg .b32 %r<8741>;
.reg .f64 %fd<257>;
.reg .b64 %rd<2796>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s;
mov.u64 %SPL, __local_depot0;
ld.param.v2.u32 {%r2601, %r2602}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+24];
ld.param.v2.u32 {%r2603, %r2604}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+32];
ld.param.v2.u32 {%r2605, %r2606}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+40];
ld.param.v2.u32 {%r2611, %r2612}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+24];
ld.param.v2.u32 {%r2613, %r2614}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+32];
ld.param.v2.u32 {%r2615, %r2616}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+40];
ld.param.u64 %rd609, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd608, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0];
ld.param.u64 %rd610, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_2];
add.u64 %rd1, %SPL, 0;
cvta.to.global.u64 %rd2, %rd608;
cvta.to.global.u64 %rd3, %rd609;
cvta.to.global.u64 %rd4, %rd610;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p31, %r1, 0;
@%p31 bra $L__BB0_2;
mov.u32 %r2617, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r2617;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd612, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r2618, [%rd612], %r1;
ld.shared.u32 %r12, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_1911011nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s];
mov.u32 %r13, %tid.z;
add.s32 %r14, %r1, -63;
add.s64 %rd5, %rd1, 24;
setp.gt.s32 %p32, %r1, 30;
@%p32 bra $L__BB0_4;
shl.b32 %r2619, %r12, 2;
neg.s32 %r2620, %r2619;
add.s32 %r2621, %r13, -12;
setp.lt.s32 %p33, %r2621, %r2620;
@%p33 bra $L__BB0_283;
bra.uni $L__BB0_4;
$L__BB0_283:
mov.u32 %r3359, %ctaid.x;
shl.b32 %r3360, %r12, 5;
add.s32 %r3361, %r3360, %r1;
mul.hi.s32 %r3362, %r3361, -1840700269;
add.s32 %r3363, %r3362, %r3361;
shr.u32 %r3364, %r3363, 31;
shr.s32 %r3365, %r3363, 2;
add.s32 %r3366, %r3365, %r3364;
mul.lo.s32 %r3367, %r3366, 7;
sub.s32 %r3368, %r3361, %r3367;
mul.lo.s32 %r3369, %r13, %r2613;
mad.lo.s32 %r3370, %r2614, %r3359, %r3369;
mad.lo.s32 %r3371, %r3366, %r2615, %r3370;
mad.lo.s32 %r3372, %r3368, %r2616, %r3371;
mul.wide.s32 %rd869, %r3372, 4;
add.s64 %rd870, %rd3, %rd869;
ld.global.f32 %f296, [%rd870];
add.s32 %r3373, %r3361, 32;
mul.hi.s32 %r3374, %r3373, -1840700269;
add.s32 %r3375, %r3374, %r3373;
shr.u32 %r3376, %r3375, 31;
shr.s32 %r3377, %r3375, 2;
add.s32 %r3378, %r3377, %r3376;
mul.lo.s32 %r3379, %r3378, 7;
sub.s32 %r3380, %r3373, %r3379;
mad.lo.s32 %r3381, %r3378, %r2615, %r3370;
mad.lo.s32 %r3382, %r3380, %r2616, %r3381;
mul.wide.s32 %rd871, %r3382, 4;
add.s64 %rd872, %rd3, %rd871;
ld.global.f32 %f297, [%rd872];
mul.wide.s32 %rd873, %r2613, 4;
add.s64 %rd874, %rd870, %rd873;
ld.global.f32 %f298, [%rd874];
add.s64 %rd875, %rd872, %rd873;
ld.global.f32 %f299, [%rd875];
add.s64 %rd876, %rd874, %rd873;
ld.global.f32 %f300, [%rd876];
add.s64 %rd877, %rd875, %rd873;
ld.global.f32 %f301, [%rd877];
add.s64 %rd878, %rd876, %rd873;
ld.global.f32 %f302, [%rd878];
add.s64 %rd879, %rd877, %rd873;
ld.global.f32 %f303, [%rd879];
mul.lo.s32 %r3383, %r2604, %r3359;
mul.hi.s32 %r3384, %r3361, 954437177;
shr.u32 %r3385, %r3384, 31;
shr.s32 %r3386, %r3384, 1;
add.s32 %r3387, %r3386, %r3385;
mul.lo.s32 %r3388, %r3387, %r2605;
mul.lo.s32 %r3389, %r3387, 9;
sub.s32 %r3390, %r3361, %r3389;
mul.lo.s32 %r3391, %r3390, %r2606;
mul.lo.s32 %r3392, %r13, %r2603;
add.s32 %r3393, %r3383, %r3392;
add.s32 %r3394, %r3393, %r3388;
add.s32 %r3395, %r3394, %r3391;
mul.wide.s32 %rd880, %r3395, 4;
add.s64 %rd881, %rd2, %rd880;
ld.global.f32 %f304, [%rd881];
mul.hi.s32 %r3396, %r3373, 954437177;
shr.u32 %r3397, %r3396, 31;
shr.s32 %r3398, %r3396, 1;
add.s32 %r3399, %r3398, %r3397;
mul.lo.s32 %r3400, %r3399, %r2605;
mul.lo.s32 %r3401, %r3399, 9;
sub.s32 %r3402, %r3373, %r3401;
mul.lo.s32 %r3403, %r3402, %r2606;
add.s32 %r3404, %r3393, %r3400;
add.s32 %r3405, %r3404, %r3403;
mul.wide.s32 %rd882, %r3405, 4;
add.s64 %rd883, %rd2, %rd882;
ld.global.f32 %f305, [%rd883];
mul.wide.s32 %rd884, %r2603, 4;
add.s64 %rd885, %rd881, %rd884;
ld.global.f32 %f306, [%rd885];
add.s64 %rd886, %rd883, %rd884;
ld.global.f32 %f307, [%rd886];
add.s64 %rd887, %rd885, %rd884;
ld.global.f32 %f308, [%rd887];
add.s64 %rd888, %rd886, %rd884;
ld.global.f32 %f309, [%rd888];
add.s32 %r3406, %r2602, %r3383;
add.s32 %r3407, %r3406, %r3392;
add.s32 %r3408, %r3407, %r3388;
add.s32 %r3409, %r3408, %r3391;
mul.wide.s32 %rd889, %r3409, 4;
add.s64 %rd890, %rd2, %rd889;
ld.global.f32 %f310, [%rd890];
add.s32 %r3410, %r3407, %r3400;
add.s32 %r3411, %r3410, %r3403;
mul.wide.s32 %rd891, %r3411, 4;
add.s64 %rd892, %rd2, %rd891;
ld.global.f32 %f311, [%rd892];
mul.f32 %f2610, %f304, 0f3F22F983;
cvt.rni.s32.f32 %r8293, %f2610;
cvt.rn.f32.s32 %f2611, %r8293;
mov.f32 %f2612, 0fBFC90FDA;
fma.rn.f32 %f2613, %f2611, %f2612, %f304;
mov.f32 %f2614, 0fB3A22168;
fma.rn.f32 %f2615, %f2611, %f2614, %f2613;
mov.f32 %f2616, 0fA7C234C5;
fma.rn.f32 %f5277, %f2611, %f2616, %f2615;
abs.f32 %f313, %f304;
setp.ltu.f32 %p264, %f313, 0f47CE4780;
@%p264 bra $L__BB0_291;
setp.eq.f32 %p265, %f313, 0f7F800000;
@%p265 bra $L__BB0_290;
bra.uni $L__BB0_285;
$L__BB0_290:
mov.f32 %f2619, 0f00000000;
mul.rn.f32 %f5277, %f304, %f2619;
mov.u32 %r8293, 0;
bra.uni $L__BB0_291;
$L__BB0_4:
mov.u32 %r15, %ctaid.x;
add.s32 %r2622, %r13, -15;
neg.s32 %r16, %r12;
setp.ge.s32 %p34, %r2622, %r16;
mul.lo.s32 %r2623, %r13, %r2613;
mad.lo.s32 %r17, %r2614, %r15, %r2623;
@%p34 bra $L__BB0_7;
shl.b32 %r18, %r12, 5;
neg.s32 %r2624, %r18;
setp.ge.s32 %p35, %r14, %r2624;
@%p35 bra $L__BB0_7;
add.s32 %r2625, %r18, %r1;
mul.hi.s32 %r2626, %r2625, -1840700269;
add.s32 %r2627, %r2626, %r2625;
shr.u32 %r2628, %r2627, 31;
shr.s32 %r2629, %r2627, 2;
add.s32 %r2630, %r2629, %r2628;
mul.lo.s32 %r2631, %r2630, 7;
sub.s32 %r2632, %r2625, %r2631;
mad.lo.s32 %r2633, %r2630, %r2615, %r17;
mad.lo.s32 %r2634, %r2632, %r2616, %r2633;
mul.wide.s32 %rd613, %r2634, 4;
add.s64 %rd614, %rd3, %rd613;
ld.global.f32 %f5531, [%rd614];
$L__BB0_7:
@%p34 bra $L__BB0_10;
shl.b32 %r19, %r12, 5;
mov.u32 %r2636, -32;
sub.s32 %r2637, %r2636, %r19;
setp.ge.s32 %p37, %r14, %r2637;
@%p37 bra $L__BB0_10;
add.s32 %r2638, %r19, %r1;
add.s32 %r2639, %r2638, 32;
mul.hi.s32 %r2640, %r2639, -1840700269;
add.s32 %r2641, %r2640, %r2639;
shr.u32 %r2642, %r2641, 31;
shr.s32 %r2643, %r2641, 2;
add.s32 %r2644, %r2643, %r2642;
mul.lo.s32 %r2645, %r2644, 7;
sub.s32 %r2646, %r2639, %r2645;
mad.lo.s32 %r2647, %r2644, %r2615, %r17;
mad.lo.s32 %r2648, %r2646, %r2616, %r2647;
mul.wide.s32 %rd615, %r2648, 4;
add.s64 %rd616, %rd3, %rd615;
ld.global.f32 %f5339, [%rd616];
$L__BB0_10:
add.s32 %r8185, %r13, -15;
not.b32 %r20, %r12;
setp.ge.s32 %p38, %r8185, %r20;
add.s32 %r21, %r17, %r2613;
@%p38 bra $L__BB0_13;
shl.b32 %r22, %r12, 5;
neg.s32 %r2650, %r22;
setp.ge.s32 %p39, %r14, %r2650;
@%p39 bra $L__BB0_13;
add.s32 %r2651, %r22, %r1;
mul.hi.s32 %r2652, %r2651, -1840700269;
add.s32 %r2653, %r2652, %r2651;
shr.u32 %r2654, %r2653, 31;
shr.s32 %r2655, %r2653, 2;
add.s32 %r2656, %r2655, %r2654;
mul.lo.s32 %r2657, %r2656, 7;
sub.s32 %r2658, %r2651, %r2657;
mad.lo.s32 %r2659, %r2656, %r2615, %r21;
mad.lo.s32 %r2660, %r2658, %r2616, %r2659;
mul.wide.s32 %rd617, %r2660, 4;
add.s64 %rd618, %rd3, %rd617;
ld.global.f32 %f5338, [%rd618];
$L__BB0_13:
not.b32 %r8225, %r12;
add.s32 %r8224, %r13, -15;
setp.ge.s32 %p1829, %r8224, %r8225;
@%p1829 bra $L__BB0_16;
shl.b32 %r23, %r12, 5;
mov.u32 %r2662, -32;
sub.s32 %r2663, %r2662, %r23;
setp.ge.s32 %p41, %r14, %r2663;
@%p41 bra $L__BB0_16;
add.s32 %r2664, %r23, %r1;
add.s32 %r2665, %r2664, 32;
mul.hi.s32 %r2666, %r2665, -1840700269;
add.s32 %r2667, %r2666, %r2665;
shr.u32 %r2668, %r2667, 31;
shr.s32 %r2669, %r2667, 2;
add.s32 %r2670, %r2669, %r2668;
mul.lo.s32 %r2671, %r2670, 7;
sub.s32 %r2672, %r2665, %r2671;
mad.lo.s32 %r2673, %r2670, %r2615, %r21;
mad.lo.s32 %r2674, %r2672, %r2616, %r2673;
mul.wide.s32 %rd619, %r2674, 4;
add.s64 %rd620, %rd3, %rd619;
ld.global.f32 %f5337, [%rd620];
$L__BB0_16:
add.s32 %r8186, %r13, -15;
mov.u32 %r2676, -2;
sub.s32 %r24, %r2676, %r12;
setp.ge.s32 %p42, %r8186, %r24;
add.s32 %r25, %r21, %r2613;
@%p42 bra $L__BB0_19;
shl.b32 %r26, %r12, 5;
neg.s32 %r2677, %r26;
setp.ge.s32 %p43, %r14, %r2677;
@%p43 bra $L__BB0_19;
add.s32 %r2678, %r26, %r1;
mul.hi.s32 %r2679, %r2678, -1840700269;
add.s32 %r2680, %r2679, %r2678;
shr.u32 %r2681, %r2680, 31;
shr.s32 %r2682, %r2680, 2;
add.s32 %r2683, %r2682, %r2681;
mul.lo.s32 %r2684, %r2683, 7;
sub.s32 %r2685, %r2678, %r2684;
mad.lo.s32 %r2686, %r2683, %r2615, %r25;
mad.lo.s32 %r2687, %r2685, %r2616, %r2686;
mul.wide.s32 %rd621, %r2687, 4;
add.s64 %rd622, %rd3, %rd621;
ld.global.f32 %f5336, [%rd622];
$L__BB0_19:
mov.u32 %r8220, -2;
sub.s32 %r8219, %r8220, %r12;
add.s32 %r8218, %r13, -15;
setp.ge.s32 %p1827, %r8218, %r8219;
@%p1827 bra $L__BB0_22;
shl.b32 %r27, %r12, 5;
mov.u32 %r2689, -32;
sub.s32 %r2690, %r2689, %r27;
setp.ge.s32 %p45, %r14, %r2690;
@%p45 bra $L__BB0_22;
add.s32 %r2691, %r27, %r1;
add.s32 %r2692, %r2691, 32;
mul.hi.s32 %r2693, %r2692, -1840700269;
add.s32 %r2694, %r2693, %r2692;
shr.u32 %r2695, %r2694, 31;
shr.s32 %r2696, %r2694, 2;
add.s32 %r2697, %r2696, %r2695;
mul.lo.s32 %r2698, %r2697, 7;
sub.s32 %r2699, %r2692, %r2698;
mad.lo.s32 %r2700, %r2697, %r2615, %r25;
mad.lo.s32 %r2701, %r2699, %r2616, %r2700;
mul.wide.s32 %rd623, %r2701, 4;
add.s64 %rd624, %rd3, %rd623;
ld.global.f32 %f5335, [%rd624];
$L__BB0_22:
add.s32 %r8051, %r13, -15;
mov.u32 %r2703, -3;
sub.s32 %r28, %r2703, %r12;
setp.ge.s32 %p46, %r8051, %r28;
add.s32 %r29, %r25, %r2613;
@%p46 bra $L__BB0_25;
shl.b32 %r30, %r12, 5;
neg.s32 %r2704, %r30;
setp.ge.s32 %p47, %r14, %r2704;
@%p47 bra $L__BB0_25;
add.s32 %r2705, %r30, %r1;
mul.hi.s32 %r2706, %r2705, -1840700269;
add.s32 %r2707, %r2706, %r2705;
shr.u32 %r2708, %r2707, 31;
shr.s32 %r2709, %r2707, 2;
add.s32 %r2710, %r2709, %r2708;
mul.lo.s32 %r2711, %r2710, 7;
sub.s32 %r2712, %r2705, %r2711;
mad.lo.s32 %r2713, %r2710, %r2615, %r29;
mad.lo.s32 %r2714, %r2712, %r2616, %r2713;
mul.wide.s32 %rd625, %r2714, 4;
add.s64 %rd626, %rd3, %rd625;
ld.global.f32 %f5334, [%rd626];
$L__BB0_25:
mov.u32 %r8223, -3;
sub.s32 %r8222, %r8223, %r12;
add.s32 %r8221, %r13, -15;
setp.ge.s32 %p1828, %r8221, %r8222;
@%p1828 bra $L__BB0_28;
shl.b32 %r31, %r12, 5;
mov.u32 %r2716, -32;
sub.s32 %r2717, %r2716, %r31;
setp.ge.s32 %p49, %r14, %r2717;
@%p49 bra $L__BB0_28;
add.s32 %r2718, %r31, %r1;
add.s32 %r2719, %r2718, 32;
mul.hi.s32 %r2720, %r2719, -1840700269;
add.s32 %r2721, %r2720, %r2719;
shr.u32 %r2722, %r2721, 31;
shr.s32 %r2723, %r2721, 2;
add.s32 %r2724, %r2723, %r2722;
mul.lo.s32 %r2725, %r2724, 7;
sub.s32 %r2726, %r2719, %r2725;
mad.lo.s32 %r2727, %r2724, %r2615, %r29;
mad.lo.s32 %r2728, %r2726, %r2616, %r2727;
mul.wide.s32 %rd627, %r2728, 4;
add.s64 %rd628, %rd3, %rd627;
ld.global.f32 %f5333, [%rd628];
$L__BB0_28:
neg.s32 %r8215, %r12;
add.s32 %r8214, %r13, -15;
setp.ge.s32 %p1825, %r8214, %r8215;
mov.u32 %r8052, %ctaid.x;
mul.lo.s32 %r32, %r13, %r2603;
mul.lo.s32 %r33, %r2604, %r8052;
add.s32 %r34, %r33, %r32;
@%p1825 bra $L__BB0_31;
shl.b32 %r35, %r12, 5;
neg.s32 %r2730, %r35;
setp.ge.s32 %p51, %r14, %r2730;
@%p51 bra $L__BB0_31;
add.s32 %r2731, %r35, %r1;
mul.hi.s32 %r2732, %r2731, 954437177;
shr.u32 %r2733, %r2732, 31;
shr.s32 %r2734, %r2732, 1;
add.s32 %r2735, %r2734, %r2733;
mul.lo.s32 %r2736, %r2735, 9;
sub.s32 %r2737, %r2731, %r2736;
mad.lo.s32 %r2738, %r2735, %r2605, %r34;
mad.lo.s32 %r2739, %r2737, %r2606, %r2738;
mul.wide.s32 %rd629, %r2739, 4;
add.s64 %rd630, %rd2, %rd629;
ld.global.f32 %f5348, [%rd630];
$L__BB0_31:
neg.s32 %r8194, %r12;
add.s32 %r8193, %r13, -15;
setp.ge.s32 %p1820, %r8193, %r8194;
@%p1820 bra $L__BB0_34;
shl.b32 %r36, %r12, 5;
mov.u32 %r2741, -32;
sub.s32 %r2742, %r2741, %r36;
setp.ge.s32 %p53, %r14, %r2742;
@%p53 bra $L__BB0_34;
add.s32 %r2743, %r36, %r1;
add.s32 %r2744, %r2743, 32;
mul.hi.s32 %r2745, %r2744, 954437177;
shr.u32 %r2746, %r2745, 31;
shr.s32 %r2747, %r2745, 1;
add.s32 %r2748, %r2747, %r2746;
mul.lo.s32 %r2749, %r2748, 9;
sub.s32 %r2750, %r2744, %r2749;
mad.lo.s32 %r2751, %r2748, %r2605, %r34;
mad.lo.s32 %r2752, %r2750, %r2606, %r2751;
mul.wide.s32 %rd631, %r2752, 4;
add.s64 %rd632, %rd2, %rd631;
ld.global.f32 %f5347, [%rd632];
$L__BB0_34:
not.b32 %r8217, %r12;
add.s32 %r8216, %r13, -15;
setp.ge.s32 %p1826, %r8216, %r8217;
add.s32 %r37, %r34, %r2603;
@%p1826 bra $L__BB0_37;
shl.b32 %r38, %r12, 5;
neg.s32 %r2754, %r38;
setp.ge.s32 %p55, %r14, %r2754;
@%p55 bra $L__BB0_37;
add.s32 %r2755, %r38, %r1;
mul.hi.s32 %r2756, %r2755, 954437177;
shr.u32 %r2757, %r2756, 31;
shr.s32 %r2758, %r2756, 1;
add.s32 %r2759, %r2758, %r2757;
mul.lo.s32 %r2760, %r2759, 9;
sub.s32 %r2761, %r2755, %r2760;
mad.lo.s32 %r2762, %r2759, %r2605, %r37;
mad.lo.s32 %r2763, %r2761, %r2606, %r2762;
mul.wide.s32 %rd633, %r2763, 4;
add.s64 %rd634, %rd2, %rd633;
ld.global.f32 %f5346, [%rd634];
$L__BB0_37:
not.b32 %r8196, %r12;
add.s32 %r8195, %r13, -15;
setp.ge.s32 %p1821, %r8195, %r8196;
@%p1821 bra $L__BB0_40;
shl.b32 %r39, %r12, 5;
mov.u32 %r2765, -32;
sub.s32 %r2766, %r2765, %r39;
setp.ge.s32 %p57, %r14, %r2766;
@%p57 bra $L__BB0_40;
add.s32 %r2767, %r39, %r1;
add.s32 %r2768, %r2767, 32;
mul.hi.s32 %r2769, %r2768, 954437177;
shr.u32 %r2770, %r2769, 31;
shr.s32 %r2771, %r2769, 1;
add.s32 %r2772, %r2771, %r2770;
mul.lo.s32 %r2773, %r2772, 9;
sub.s32 %r2774, %r2768, %r2773;
mad.lo.s32 %r2775, %r2772, %r2605, %r37;
mad.lo.s32 %r2776, %r2774, %r2606, %r2775;
mul.wide.s32 %rd635, %r2776, 4;
add.s64 %rd636, %rd2, %rd635;
ld.global.f32 %f5345, [%rd636];
$L__BB0_40:
mov.u32 %r8189, -2;
sub.s32 %r8188, %r8189, %r12;
add.s32 %r8187, %r13, -15;
setp.ge.s32 %p1818, %r8187, %r8188;
add.s32 %r40, %r37, %r2603;
@%p1818 bra $L__BB0_43;
shl.b32 %r41, %r12, 5;
neg.s32 %r2778, %r41;
setp.ge.s32 %p59, %r14, %r2778;
@%p59 bra $L__BB0_43;
add.s32 %r2779, %r41, %r1;
mul.hi.s32 %r2780, %r2779, 954437177;
shr.u32 %r2781, %r2780, 31;
shr.s32 %r2782, %r2780, 1;
add.s32 %r2783, %r2782, %r2781;
mul.lo.s32 %r2784, %r2783, 9;
sub.s32 %r2785, %r2779, %r2784;
mad.lo.s32 %r2786, %r2783, %r2605, %r40;
mad.lo.s32 %r2787, %r2785, %r2606, %r2786;
mul.wide.s32 %rd637, %r2787, 4;
add.s64 %rd638, %rd2, %rd637;
ld.global.f32 %f5344, [%rd638];
$L__BB0_43:
mov.u32 %r8060, -2;
sub.s32 %r8059, %r8060, %r12;
add.s32 %r8058, %r13, -15;
setp.ge.s32 %p1802, %r8058, %r8059;
@%p1802 bra $L__BB0_46;
shl.b32 %r42, %r12, 5;
mov.u32 %r2789, -32;
sub.s32 %r2790, %r2789, %r42;
setp.ge.s32 %p61, %r14, %r2790;
@%p61 bra $L__BB0_46;
add.s32 %r2791, %r42, %r1;
add.s32 %r2792, %r2791, 32;
mul.hi.s32 %r2793, %r2792, 954437177;
shr.u32 %r2794, %r2793, 31;
shr.s32 %r2795, %r2793, 1;
add.s32 %r2796, %r2795, %r2794;
mul.lo.s32 %r2797, %r2796, 9;
sub.s32 %r2798, %r2792, %r2797;
mad.lo.s32 %r2799, %r2796, %r2605, %r40;
mad.lo.s32 %r2800, %r2798, %r2606, %r2799;
mul.wide.s32 %rd639, %r2800, 4;
add.s64 %rd640, %rd2, %rd639;
ld.global.f32 %f5343, [%rd640];
$L__BB0_46:
mov.u32 %r8192, -3;
sub.s32 %r8191, %r8192, %r12;
add.s32 %r8190, %r13, -15;
setp.ge.s32 %p1819, %r8190, %r8191;
mul.lo.s32 %r8055, %r13, %r2603;
mov.u32 %r8054, %ctaid.x;
mul.lo.s32 %r8053, %r2604, %r8054;
add.s32 %r2802, %r2602, %r8053;
add.s32 %r43, %r2802, %r8055;
@%p1819 bra $L__BB0_49;
shl.b32 %r44, %r12, 5;
neg.s32 %r2803, %r44;
setp.ge.s32 %p63, %r14, %r2803;
@%p63 bra $L__BB0_49;
add.s32 %r2804, %r44, %r1;
mul.hi.s32 %r2805, %r2804, 954437177;
shr.u32 %r2806, %r2805, 31;
shr.s32 %r2807, %r2805, 1;
add.s32 %r2808, %r2807, %r2806;
mul.lo.s32 %r2809, %r2808, 9;
sub.s32 %r2810, %r2804, %r2809;
mad.lo.s32 %r2811, %r2808, %r2605, %r43;
mad.lo.s32 %r2812, %r2810, %r2606, %r2811;
mul.wide.s32 %rd641, %r2812, 4;
add.s64 %rd642, %rd2, %rd641;
ld.global.f32 %f5342, [%rd642];
$L__BB0_49:
mov.u32 %r8063, -3;
sub.s32 %r8062, %r8063, %r12;
add.s32 %r8061, %r13, -15;
setp.ge.s32 %p1803, %r8061, %r8062;
@%p1803 bra $L__BB0_52;
shl.b32 %r45, %r12, 5;
mov.u32 %r2814, -32;
sub.s32 %r2815, %r2814, %r45;
setp.ge.s32 %p65, %r14, %r2815;
@%p65 bra $L__BB0_52;
add.s32 %r2816, %r45, %r1;
add.s32 %r2817, %r2816, 32;
mul.hi.s32 %r2818, %r2817, 954437177;
shr.u32 %r2819, %r2818, 31;
shr.s32 %r2820, %r2818, 1;
add.s32 %r2821, %r2820, %r2819;
mul.lo.s32 %r2822, %r2821, 9;
sub.s32 %r2823, %r2817, %r2822;
mad.lo.s32 %r2824, %r2821, %r2605, %r43;
mad.lo.s32 %r2825, %r2823, %r2606, %r2824;
mul.wide.s32 %rd643, %r2825, 4;
add.s64 %rd644, %rd2, %rd643;
ld.global.f32 %f5341, [%rd644];
$L__BB0_52:
neg.s32 %r8057, %r12;
add.s32 %r8056, %r13, -15;
setp.ge.s32 %p1801, %r8056, %r8057;
@%p1801 bra $L__BB0_80;
shl.b32 %r2827, %r12, 5;
neg.s32 %r2828, %r2827;
setp.ge.s32 %p67, %r14, %r2828;
@%p67 bra $L__BB0_66;
mul.f32 %f2259, %f5348, 0f3F22F983;
cvt.rni.s32.f32 %r8229, %f2259;
cvt.rn.f32.s32 %f2260, %r8229;
mov.f32 %f2261, 0fBFC90FDA;
fma.rn.f32 %f2262, %f2260, %f2261, %f5348;
mov.f32 %f2263, 0fB3A22168;
fma.rn.f32 %f2264, %f2260, %f2263, %f2262;
mov.f32 %f2265, 0fA7C234C5;
fma.rn.f32 %f5180, %f2260, %f2265, %f2264;
abs.f32 %f34, %f5348;
setp.ltu.f32 %p68, %f34, 0f47CE4780;
@%p68 bra $L__BB0_62;
setp.eq.f32 %p69, %f34, 0f7F800000;
@%p69 bra $L__BB0_61;
bra.uni $L__BB0_56;
$L__BB0_61:
mov.f32 %f2268, 0f00000000;
mul.rn.f32 %f5180, %f5348, %f2268;
mov.u32 %r8229, 0;
bra.uni $L__BB0_62;
$L__BB0_285:
mov.b32 %r348, %f304;
shr.u32 %r3413, %r348, 23;
and.b32 %r3414, %r3413, 255;
add.s32 %r349, %r3414, -128;
shl.b32 %r3415, %r348, 8;
or.b32 %r350, %r3415, -2147483648;
shr.u32 %r351, %r349, 5;
mov.u64 %rd2530, 0;
mov.u32 %r8290, 0;
mov.u64 %rd896, __cudart_i2opi_f;
mov.u64 %rd2531, %rd2530;
$L__BB0_286:
.pragma "nounroll";
shl.b64 %rd895, %rd2530, 2;
add.s64 %rd897, %rd896, %rd895;
ld.global.nc.u32 %r3416, [%rd897];
mad.wide.u32 %rd898, %r3416, %r350, %rd2531;
shr.u64 %rd2531, %rd898, 32;
add.s64 %rd899, %rd1, %rd895;
st.local.u32 [%rd899], %rd898;
add.s32 %r8290, %r8290, 1;
cvt.s64.s32 %rd2530, %r8290;
setp.ne.s32 %p266, %r8290, 6;
@%p266 bra $L__BB0_286;
st.local.u32 [%rd5], %rd2531;
mov.u32 %r3417, 4;
sub.s32 %r354, %r3417, %r351;
mov.u32 %r3418, 6;
sub.s32 %r3419, %r3418, %r351;
mul.wide.s32 %rd900, %r3419, 4;
add.s64 %rd901, %rd1, %rd900;
ld.local.u32 %r8291, [%rd901];
ld.local.u32 %r8292, [%rd901+-4];
and.b32 %r357, %r349, 31;
setp.eq.s32 %p267, %r357, 0;
@%p267 bra $L__BB0_289;
mov.u32 %r3420, 32;
sub.s32 %r3421, %r3420, %r357;
shr.u32 %r3422, %r8292, %r3421;
shl.b32 %r3423, %r8291, %r357;
add.s32 %r8291, %r3422, %r3423;
mul.wide.s32 %rd902, %r354, 4;
add.s64 %rd903, %rd1, %rd902;
ld.local.u32 %r3424, [%rd903];
shr.u32 %r3425, %r3424, %r3421;
shl.b32 %r3426, %r8292, %r357;
add.s32 %r8292, %r3425, %r3426;
$L__BB0_289:
and.b32 %r3427, %r348, -2147483648;
shr.u32 %r3428, %r8292, 30;
shl.b32 %r3429, %r8291, 2;
or.b32 %r3430, %r3428, %r3429;
shr.u32 %r3431, %r3430, 31;
shr.u32 %r3432, %r8291, 30;
add.s32 %r3433, %r3431, %r3432;
neg.s32 %r3434, %r3433;
setp.eq.s32 %p268, %r3427, 0;
selp.b32 %r8293, %r3433, %r3434, %p268;
setp.ne.s32 %p269, %r3431, 0;
xor.b32 %r3435, %r3427, -2147483648;
selp.b32 %r3436, %r3435, %r3427, %p269;
selp.b32 %r3437, -1, 0, %p269;
xor.b32 %r3438, %r3430, %r3437;
shl.b32 %r3439, %r8292, 2;
xor.b32 %r3440, %r3439, %r3437;
cvt.u64.u32 %rd904, %r3438;
cvt.u64.u32 %rd905, %r3440;
bfi.b64 %rd906, %rd904, %rd905, 32, 32;
cvt.rn.f64.s64 %fd33, %rd906;
mul.f64 %fd34, %fd33, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2617, %fd34;
setp.eq.s32 %p270, %r3436, 0;
neg.f32 %f2618, %f2617;
selp.f32 %f5277, %f2617, %f2618, %p270;
$L__BB0_291:
and.b32 %r364, %r8293, 1;
setp.eq.s32 %p271, %r364, 0;
selp.f32 %f317, %f5277, 0f3F800000, %p271;
mul.rn.f32 %f318, %f5277, %f5277;
mov.f32 %f5278, 0fB94D4153;
@%p271 bra $L__BB0_293;
mov.f32 %f2621, 0fBAB607ED;
mov.f32 %f2622, 0f37CBAC00;
fma.rn.f32 %f5278, %f2622, %f318, %f2621;
$L__BB0_293:
selp.f32 %f2623, 0f3C0885E4, 0f3D2AAABB, %p271;
fma.rn.f32 %f2624, %f5278, %f318, %f2623;
selp.f32 %f2625, 0fBE2AAAA8, 0fBEFFFFFF, %p271;
fma.rn.f32 %f2626, %f2624, %f318, %f2625;
mov.f32 %f2627, 0f00000000;
fma.rn.f32 %f2628, %f318, %f317, %f2627;
fma.rn.f32 %f5279, %f2626, %f2628, %f317;
and.b32 %r3442, %r8293, 2;
setp.eq.s32 %p273, %r3442, 0;
@%p273 bra $L__BB0_295;
mov.f32 %f2630, 0fBF800000;
fma.rn.f32 %f5279, %f5279, %f2630, %f2627;
$L__BB0_295:
mul.f32 %f2631, %f296, 0f3F22F983;
cvt.rni.s32.f32 %r8297, %f2631;
cvt.rn.f32.s32 %f2632, %r8297;
mov.f32 %f2633, 0fBFC90FDA;
fma.rn.f32 %f2634, %f2632, %f2633, %f296;
mov.f32 %f2635, 0fB3A22168;
fma.rn.f32 %f2636, %f2632, %f2635, %f2634;
mov.f32 %f2637, 0fA7C234C5;
fma.rn.f32 %f5280, %f2632, %f2637, %f2636;
abs.f32 %f325, %f296;
setp.ltu.f32 %p274, %f325, 0f47CE4780;
@%p274 bra $L__BB0_303;
setp.eq.f32 %p275, %f325, 0f7F800000;
@%p275 bra $L__BB0_302;
bra.uni $L__BB0_297;
$L__BB0_302:
mov.f32 %f2640, 0f00000000;
mul.rn.f32 %f5280, %f296, %f2640;
mov.u32 %r8297, 0;
bra.uni $L__BB0_303;
$L__BB0_297:
mov.b32 %r366, %f296;
shr.u32 %r3444, %r366, 23;
and.b32 %r3445, %r3444, 255;
add.s32 %r367, %r3445, -128;
shl.b32 %r3446, %r366, 8;
or.b32 %r368, %r3446, -2147483648;
shr.u32 %r369, %r367, 5;
mov.u64 %rd2532, 0;
mov.u32 %r8294, 0;
mov.u64 %rd910, __cudart_i2opi_f;
mov.u64 %rd2533, %rd2532;
$L__BB0_298:
.pragma "nounroll";
shl.b64 %rd909, %rd2532, 2;
add.s64 %rd911, %rd910, %rd909;
ld.global.nc.u32 %r3447, [%rd911];
mad.wide.u32 %rd912, %r3447, %r368, %rd2533;
shr.u64 %rd2533, %rd912, 32;
add.s64 %rd913, %rd1, %rd909;
st.local.u32 [%rd913], %rd912;
add.s32 %r8294, %r8294, 1;
cvt.s64.s32 %rd2532, %r8294;
setp.ne.s32 %p276, %r8294, 6;
@%p276 bra $L__BB0_298;
st.local.u32 [%rd5], %rd2533;
mov.u32 %r3448, 4;
sub.s32 %r372, %r3448, %r369;
mov.u32 %r3449, 6;
sub.s32 %r3450, %r3449, %r369;
mul.wide.s32 %rd914, %r3450, 4;
add.s64 %rd915, %rd1, %rd914;
ld.local.u32 %r8295, [%rd915];
ld.local.u32 %r8296, [%rd915+-4];
and.b32 %r375, %r367, 31;
setp.eq.s32 %p277, %r375, 0;
@%p277 bra $L__BB0_301;
mov.u32 %r3451, 32;
sub.s32 %r3452, %r3451, %r375;
shr.u32 %r3453, %r8296, %r3452;
shl.b32 %r3454, %r8295, %r375;
add.s32 %r8295, %r3453, %r3454;
mul.wide.s32 %rd916, %r372, 4;
add.s64 %rd917, %rd1, %rd916;
ld.local.u32 %r3455, [%rd917];
shr.u32 %r3456, %r3455, %r3452;
shl.b32 %r3457, %r8296, %r375;
add.s32 %r8296, %r3456, %r3457;
$L__BB0_301:
and.b32 %r3458, %r366, -2147483648;
shr.u32 %r3459, %r8296, 30;
shl.b32 %r3460, %r8295, 2;
or.b32 %r3461, %r3459, %r3460;
shr.u32 %r3462, %r3461, 31;
shr.u32 %r3463, %r8295, 30;
add.s32 %r3464, %r3462, %r3463;
neg.s32 %r3465, %r3464;
setp.eq.s32 %p278, %r3458, 0;
selp.b32 %r8297, %r3464, %r3465, %p278;
setp.ne.s32 %p279, %r3462, 0;
xor.b32 %r3466, %r3458, -2147483648;
selp.b32 %r3467, %r3466, %r3458, %p279;
selp.b32 %r3468, -1, 0, %p279;
xor.b32 %r3469, %r3461, %r3468;
shl.b32 %r3470, %r8296, 2;
xor.b32 %r3471, %r3470, %r3468;
cvt.u64.u32 %rd918, %r3469;
cvt.u64.u32 %rd919, %r3471;
bfi.b64 %rd920, %rd918, %rd919, 32, 32;
cvt.rn.f64.s64 %fd35, %rd920;
mul.f64 %fd36, %fd35, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2638, %fd36;
setp.eq.s32 %p280, %r3467, 0;
neg.f32 %f2639, %f2638;
selp.f32 %f5280, %f2638, %f2639, %p280;
$L__BB0_303:
add.s32 %r382, %r8297, 1;
and.b32 %r383, %r382, 1;
setp.eq.s32 %p281, %r383, 0;
selp.f32 %f329, %f5280, 0f3F800000, %p281;
mul.rn.f32 %f330, %f5280, %f5280;
mov.f32 %f5281, 0fB94D4153;
@%p281 bra $L__BB0_305;
mov.f32 %f2642, 0fBAB607ED;
mov.f32 %f2643, 0f37CBAC00;
fma.rn.f32 %f5281, %f2643, %f330, %f2642;
$L__BB0_305:
selp.f32 %f2644, 0f3C0885E4, 0f3D2AAABB, %p281;
fma.rn.f32 %f2645, %f5281, %f330, %f2644;
selp.f32 %f2646, 0fBE2AAAA8, 0fBEFFFFFF, %p281;
fma.rn.f32 %f2647, %f2645, %f330, %f2646;
mov.f32 %f2648, 0f00000000;
fma.rn.f32 %f2649, %f330, %f329, %f2648;
fma.rn.f32 %f5282, %f2647, %f2649, %f329;
and.b32 %r3473, %r382, 2;
setp.eq.s32 %p283, %r3473, 0;
@%p283 bra $L__BB0_307;
mov.f32 %f2651, 0fBF800000;
fma.rn.f32 %f5282, %f5282, %f2651, %f2648;
$L__BB0_307:
add.f32 %f5332, %f5279, %f5282;
mul.f32 %f2652, %f305, 0f3F22F983;
cvt.rni.s32.f32 %r8301, %f2652;
cvt.rn.f32.s32 %f2653, %r8301;
mov.f32 %f2654, 0fBFC90FDA;
fma.rn.f32 %f2655, %f2653, %f2654, %f305;
mov.f32 %f2656, 0fB3A22168;
fma.rn.f32 %f2657, %f2653, %f2656, %f2655;
mov.f32 %f2658, 0fA7C234C5;
fma.rn.f32 %f5283, %f2653, %f2658, %f2657;
abs.f32 %f338, %f305;
setp.ltu.f32 %p284, %f338, 0f47CE4780;
@%p284 bra $L__BB0_315;
setp.eq.f32 %p285, %f338, 0f7F800000;
@%p285 bra $L__BB0_314;
bra.uni $L__BB0_309;
$L__BB0_314:
mov.f32 %f2661, 0f00000000;
mul.rn.f32 %f5283, %f305, %f2661;
mov.u32 %r8301, 0;
bra.uni $L__BB0_315;
$L__BB0_309:
mov.b32 %r385, %f305;
shr.u32 %r3475, %r385, 23;
and.b32 %r3476, %r3475, 255;
add.s32 %r386, %r3476, -128;
shl.b32 %r3477, %r385, 8;
or.b32 %r387, %r3477, -2147483648;
shr.u32 %r388, %r386, 5;
mov.u64 %rd2534, 0;
mov.u32 %r8298, 0;
mov.u64 %rd924, __cudart_i2opi_f;
mov.u64 %rd2535, %rd2534;
$L__BB0_310:
.pragma "nounroll";
shl.b64 %rd923, %rd2534, 2;
add.s64 %rd925, %rd924, %rd923;
ld.global.nc.u32 %r3478, [%rd925];
mad.wide.u32 %rd926, %r3478, %r387, %rd2535;
shr.u64 %rd2535, %rd926, 32;
add.s64 %rd927, %rd1, %rd923;
st.local.u32 [%rd927], %rd926;
add.s32 %r8298, %r8298, 1;
cvt.s64.s32 %rd2534, %r8298;
setp.ne.s32 %p286, %r8298, 6;
@%p286 bra $L__BB0_310;
st.local.u32 [%rd5], %rd2535;
mov.u32 %r3479, 4;
sub.s32 %r391, %r3479, %r388;
mov.u32 %r3480, 6;
sub.s32 %r3481, %r3480, %r388;
mul.wide.s32 %rd928, %r3481, 4;
add.s64 %rd929, %rd1, %rd928;
ld.local.u32 %r8299, [%rd929];
ld.local.u32 %r8300, [%rd929+-4];
and.b32 %r394, %r386, 31;
setp.eq.s32 %p287, %r394, 0;
@%p287 bra $L__BB0_313;
mov.u32 %r3482, 32;
sub.s32 %r3483, %r3482, %r394;
shr.u32 %r3484, %r8300, %r3483;
shl.b32 %r3485, %r8299, %r394;
add.s32 %r8299, %r3484, %r3485;
mul.wide.s32 %rd930, %r391, 4;
add.s64 %rd931, %rd1, %rd930;
ld.local.u32 %r3486, [%rd931];
shr.u32 %r3487, %r3486, %r3483;
shl.b32 %r3488, %r8300, %r394;
add.s32 %r8300, %r3487, %r3488;
$L__BB0_313:
and.b32 %r3489, %r385, -2147483648;
shr.u32 %r3490, %r8300, 30;
shl.b32 %r3491, %r8299, 2;
or.b32 %r3492, %r3490, %r3491;
shr.u32 %r3493, %r3492, 31;
shr.u32 %r3494, %r8299, 30;
add.s32 %r3495, %r3493, %r3494;
neg.s32 %r3496, %r3495;
setp.eq.s32 %p288, %r3489, 0;
selp.b32 %r8301, %r3495, %r3496, %p288;
setp.ne.s32 %p289, %r3493, 0;
xor.b32 %r3497, %r3489, -2147483648;
selp.b32 %r3498, %r3497, %r3489, %p289;
selp.b32 %r3499, -1, 0, %p289;
xor.b32 %r3500, %r3492, %r3499;
shl.b32 %r3501, %r8300, 2;
xor.b32 %r3502, %r3501, %r3499;
cvt.u64.u32 %rd932, %r3500;
cvt.u64.u32 %rd933, %r3502;
bfi.b64 %rd934, %rd932, %rd933, 32, 32;
cvt.rn.f64.s64 %fd37, %rd934;
mul.f64 %fd38, %fd37, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2659, %fd38;
setp.eq.s32 %p290, %r3498, 0;
neg.f32 %f2660, %f2659;
selp.f32 %f5283, %f2659, %f2660, %p290;
$L__BB0_315:
and.b32 %r401, %r8301, 1;
setp.eq.s32 %p291, %r401, 0;
selp.f32 %f342, %f5283, 0f3F800000, %p291;
mul.rn.f32 %f343, %f5283, %f5283;
mov.f32 %f5284, 0fB94D4153;
@%p291 bra $L__BB0_317;
mov.f32 %f2663, 0fBAB607ED;
mov.f32 %f2664, 0f37CBAC00;
fma.rn.f32 %f5284, %f2664, %f343, %f2663;
$L__BB0_317:
selp.f32 %f2665, 0f3C0885E4, 0f3D2AAABB, %p291;
fma.rn.f32 %f2666, %f5284, %f343, %f2665;
selp.f32 %f2667, 0fBE2AAAA8, 0fBEFFFFFF, %p291;
fma.rn.f32 %f2668, %f2666, %f343, %f2667;
mov.f32 %f2669, 0f00000000;
fma.rn.f32 %f2670, %f343, %f342, %f2669;
fma.rn.f32 %f5285, %f2668, %f2670, %f342;
and.b32 %r3504, %r8301, 2;
setp.eq.s32 %p293, %r3504, 0;
@%p293 bra $L__BB0_319;
mov.f32 %f2672, 0fBF800000;
fma.rn.f32 %f5285, %f5285, %f2672, %f2669;
$L__BB0_319:
mul.f32 %f2673, %f297, 0f3F22F983;
cvt.rni.s32.f32 %r8305, %f2673;
cvt.rn.f32.s32 %f2674, %r8305;
mov.f32 %f2675, 0fBFC90FDA;
fma.rn.f32 %f2676, %f2674, %f2675, %f297;
mov.f32 %f2677, 0fB3A22168;
fma.rn.f32 %f2678, %f2674, %f2677, %f2676;
mov.f32 %f2679, 0fA7C234C5;
fma.rn.f32 %f5286, %f2674, %f2679, %f2678;
abs.f32 %f350, %f297;
setp.ltu.f32 %p294, %f350, 0f47CE4780;
@%p294 bra $L__BB0_327;
setp.eq.f32 %p295, %f350, 0f7F800000;
@%p295 bra $L__BB0_326;
bra.uni $L__BB0_321;
$L__BB0_326:
mov.f32 %f2682, 0f00000000;
mul.rn.f32 %f5286, %f297, %f2682;
mov.u32 %r8305, 0;
bra.uni $L__BB0_327;
$L__BB0_321:
mov.b32 %r403, %f297;
shr.u32 %r3506, %r403, 23;
and.b32 %r3507, %r3506, 255;
add.s32 %r404, %r3507, -128;
shl.b32 %r3508, %r403, 8;
or.b32 %r405, %r3508, -2147483648;
shr.u32 %r406, %r404, 5;
mov.u64 %rd2536, 0;
mov.u32 %r8302, 0;
mov.u64 %rd938, __cudart_i2opi_f;
mov.u64 %rd2537, %rd2536;
$L__BB0_322:
.pragma "nounroll";
shl.b64 %rd937, %rd2536, 2;
add.s64 %rd939, %rd938, %rd937;
ld.global.nc.u32 %r3509, [%rd939];
mad.wide.u32 %rd940, %r3509, %r405, %rd2537;
shr.u64 %rd2537, %rd940, 32;
add.s64 %rd941, %rd1, %rd937;
st.local.u32 [%rd941], %rd940;
add.s32 %r8302, %r8302, 1;
cvt.s64.s32 %rd2536, %r8302;
setp.ne.s32 %p296, %r8302, 6;
@%p296 bra $L__BB0_322;
st.local.u32 [%rd5], %rd2537;
mov.u32 %r3510, 4;
sub.s32 %r409, %r3510, %r406;
mov.u32 %r3511, 6;
sub.s32 %r3512, %r3511, %r406;
mul.wide.s32 %rd942, %r3512, 4;
add.s64 %rd943, %rd1, %rd942;
ld.local.u32 %r8303, [%rd943];
ld.local.u32 %r8304, [%rd943+-4];
and.b32 %r412, %r404, 31;
setp.eq.s32 %p297, %r412, 0;
@%p297 bra $L__BB0_325;
mov.u32 %r3513, 32;
sub.s32 %r3514, %r3513, %r412;
shr.u32 %r3515, %r8304, %r3514;
shl.b32 %r3516, %r8303, %r412;
add.s32 %r8303, %r3515, %r3516;
mul.wide.s32 %rd944, %r409, 4;
add.s64 %rd945, %rd1, %rd944;
ld.local.u32 %r3517, [%rd945];
shr.u32 %r3518, %r3517, %r3514;
shl.b32 %r3519, %r8304, %r412;
add.s32 %r8304, %r3518, %r3519;
$L__BB0_325:
and.b32 %r3520, %r403, -2147483648;
shr.u32 %r3521, %r8304, 30;
shl.b32 %r3522, %r8303, 2;
or.b32 %r3523, %r3521, %r3522;
shr.u32 %r3524, %r3523, 31;
shr.u32 %r3525, %r8303, 30;
add.s32 %r3526, %r3524, %r3525;
neg.s32 %r3527, %r3526;
setp.eq.s32 %p298, %r3520, 0;
selp.b32 %r8305, %r3526, %r3527, %p298;
setp.ne.s32 %p299, %r3524, 0;
xor.b32 %r3528, %r3520, -2147483648;
selp.b32 %r3529, %r3528, %r3520, %p299;
selp.b32 %r3530, -1, 0, %p299;
xor.b32 %r3531, %r3523, %r3530;
shl.b32 %r3532, %r8304, 2;
xor.b32 %r3533, %r3532, %r3530;
cvt.u64.u32 %rd946, %r3531;
cvt.u64.u32 %rd947, %r3533;
bfi.b64 %rd948, %rd946, %rd947, 32, 32;
cvt.rn.f64.s64 %fd39, %rd948;
mul.f64 %fd40, %fd39, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2680, %fd40;
setp.eq.s32 %p300, %r3529, 0;
neg.f32 %f2681, %f2680;
selp.f32 %f5286, %f2680, %f2681, %p300;
$L__BB0_327:
add.s32 %r419, %r8305, 1;
and.b32 %r420, %r419, 1;
setp.eq.s32 %p301, %r420, 0;
selp.f32 %f354, %f5286, 0f3F800000, %p301;
mul.rn.f32 %f355, %f5286, %f5286;
mov.f32 %f5287, 0fB94D4153;
@%p301 bra $L__BB0_329;
mov.f32 %f2684, 0fBAB607ED;
mov.f32 %f2685, 0f37CBAC00;
fma.rn.f32 %f5287, %f2685, %f355, %f2684;
$L__BB0_329:
selp.f32 %f2686, 0f3C0885E4, 0f3D2AAABB, %p301;
fma.rn.f32 %f2687, %f5287, %f355, %f2686;
selp.f32 %f2688, 0fBE2AAAA8, 0fBEFFFFFF, %p301;
fma.rn.f32 %f2689, %f2687, %f355, %f2688;
mov.f32 %f2690, 0f00000000;
fma.rn.f32 %f2691, %f355, %f354, %f2690;
fma.rn.f32 %f5288, %f2689, %f2691, %f354;
and.b32 %r3535, %r419, 2;
setp.eq.s32 %p303, %r3535, 0;
@%p303 bra $L__BB0_331;
mov.f32 %f2693, 0fBF800000;
fma.rn.f32 %f5288, %f5288, %f2693, %f2690;
$L__BB0_331:
add.f32 %f5331, %f5285, %f5288;
mul.f32 %f2694, %f306, 0f3F22F983;
cvt.rni.s32.f32 %r8309, %f2694;
cvt.rn.f32.s32 %f2695, %r8309;
mov.f32 %f2696, 0fBFC90FDA;
fma.rn.f32 %f2697, %f2695, %f2696, %f306;
mov.f32 %f2698, 0fB3A22168;
fma.rn.f32 %f2699, %f2695, %f2698, %f2697;
mov.f32 %f2700, 0fA7C234C5;
fma.rn.f32 %f5289, %f2695, %f2700, %f2699;
abs.f32 %f363, %f306;
setp.ltu.f32 %p304, %f363, 0f47CE4780;
@%p304 bra $L__BB0_339;
setp.eq.f32 %p305, %f363, 0f7F800000;
@%p305 bra $L__BB0_338;
bra.uni $L__BB0_333;
$L__BB0_338:
mov.f32 %f2703, 0f00000000;
mul.rn.f32 %f5289, %f306, %f2703;
mov.u32 %r8309, 0;
bra.uni $L__BB0_339;
$L__BB0_333:
mov.b32 %r422, %f306;
shr.u32 %r3537, %r422, 23;
and.b32 %r3538, %r3537, 255;
add.s32 %r423, %r3538, -128;
shl.b32 %r3539, %r422, 8;
or.b32 %r424, %r3539, -2147483648;
shr.u32 %r425, %r423, 5;
mov.u64 %rd2538, 0;
mov.u32 %r8306, 0;
mov.u64 %rd952, __cudart_i2opi_f;
mov.u64 %rd2539, %rd2538;
$L__BB0_334:
.pragma "nounroll";
shl.b64 %rd951, %rd2538, 2;
add.s64 %rd953, %rd952, %rd951;
ld.global.nc.u32 %r3540, [%rd953];
mad.wide.u32 %rd954, %r3540, %r424, %rd2539;
shr.u64 %rd2539, %rd954, 32;
add.s64 %rd955, %rd1, %rd951;
st.local.u32 [%rd955], %rd954;
add.s32 %r8306, %r8306, 1;
cvt.s64.s32 %rd2538, %r8306;
setp.ne.s32 %p306, %r8306, 6;
@%p306 bra $L__BB0_334;
st.local.u32 [%rd5], %rd2539;
mov.u32 %r3541, 4;
sub.s32 %r428, %r3541, %r425;
mov.u32 %r3542, 6;
sub.s32 %r3543, %r3542, %r425;
mul.wide.s32 %rd956, %r3543, 4;
add.s64 %rd957, %rd1, %rd956;
ld.local.u32 %r8307, [%rd957];
ld.local.u32 %r8308, [%rd957+-4];
and.b32 %r431, %r423, 31;
setp.eq.s32 %p307, %r431, 0;
@%p307 bra $L__BB0_337;
mov.u32 %r3544, 32;
sub.s32 %r3545, %r3544, %r431;
shr.u32 %r3546, %r8308, %r3545;
shl.b32 %r3547, %r8307, %r431;
add.s32 %r8307, %r3546, %r3547;
mul.wide.s32 %rd958, %r428, 4;
add.s64 %rd959, %rd1, %rd958;
ld.local.u32 %r3548, [%rd959];
shr.u32 %r3549, %r3548, %r3545;
shl.b32 %r3550, %r8308, %r431;
add.s32 %r8308, %r3549, %r3550;
$L__BB0_337:
and.b32 %r3551, %r422, -2147483648;
shr.u32 %r3552, %r8308, 30;
shl.b32 %r3553, %r8307, 2;
or.b32 %r3554, %r3552, %r3553;
shr.u32 %r3555, %r3554, 31;
shr.u32 %r3556, %r8307, 30;
add.s32 %r3557, %r3555, %r3556;
neg.s32 %r3558, %r3557;
setp.eq.s32 %p308, %r3551, 0;
selp.b32 %r8309, %r3557, %r3558, %p308;
setp.ne.s32 %p309, %r3555, 0;
xor.b32 %r3559, %r3551, -2147483648;
selp.b32 %r3560, %r3559, %r3551, %p309;
selp.b32 %r3561, -1, 0, %p309;
xor.b32 %r3562, %r3554, %r3561;
shl.b32 %r3563, %r8308, 2;
xor.b32 %r3564, %r3563, %r3561;
cvt.u64.u32 %rd960, %r3562;
cvt.u64.u32 %rd961, %r3564;
bfi.b64 %rd962, %rd960, %rd961, 32, 32;
cvt.rn.f64.s64 %fd41, %rd962;
mul.f64 %fd42, %fd41, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2701, %fd42;
setp.eq.s32 %p310, %r3560, 0;
neg.f32 %f2702, %f2701;
selp.f32 %f5289, %f2701, %f2702, %p310;
$L__BB0_339:
and.b32 %r438, %r8309, 1;
setp.eq.s32 %p311, %r438, 0;
selp.f32 %f367, %f5289, 0f3F800000, %p311;
mul.rn.f32 %f368, %f5289, %f5289;
mov.f32 %f5290, 0fB94D4153;
@%p311 bra $L__BB0_341;
mov.f32 %f2705, 0fBAB607ED;
mov.f32 %f2706, 0f37CBAC00;
fma.rn.f32 %f5290, %f2706, %f368, %f2705;
$L__BB0_341:
selp.f32 %f2707, 0f3C0885E4, 0f3D2AAABB, %p311;
fma.rn.f32 %f2708, %f5290, %f368, %f2707;
selp.f32 %f2709, 0fBE2AAAA8, 0fBEFFFFFF, %p311;
fma.rn.f32 %f2710, %f2708, %f368, %f2709;
mov.f32 %f2711, 0f00000000;
fma.rn.f32 %f2712, %f368, %f367, %f2711;
fma.rn.f32 %f5291, %f2710, %f2712, %f367;
and.b32 %r3566, %r8309, 2;
setp.eq.s32 %p313, %r3566, 0;
@%p313 bra $L__BB0_343;
mov.f32 %f2714, 0fBF800000;
fma.rn.f32 %f5291, %f5291, %f2714, %f2711;
$L__BB0_343:
mul.f32 %f2715, %f298, 0f3F22F983;
cvt.rni.s32.f32 %r8313, %f2715;
cvt.rn.f32.s32 %f2716, %r8313;
mov.f32 %f2717, 0fBFC90FDA;
fma.rn.f32 %f2718, %f2716, %f2717, %f298;
mov.f32 %f2719, 0fB3A22168;
fma.rn.f32 %f2720, %f2716, %f2719, %f2718;
mov.f32 %f2721, 0fA7C234C5;
fma.rn.f32 %f5292, %f2716, %f2721, %f2720;
abs.f32 %f375, %f298;
setp.ltu.f32 %p314, %f375, 0f47CE4780;
@%p314 bra $L__BB0_351;
setp.eq.f32 %p315, %f375, 0f7F800000;
@%p315 bra $L__BB0_350;
bra.uni $L__BB0_345;
$L__BB0_350:
mov.f32 %f2724, 0f00000000;
mul.rn.f32 %f5292, %f298, %f2724;
mov.u32 %r8313, 0;
bra.uni $L__BB0_351;
$L__BB0_345:
mov.b32 %r440, %f298;
shr.u32 %r3568, %r440, 23;
and.b32 %r3569, %r3568, 255;
add.s32 %r441, %r3569, -128;
shl.b32 %r3570, %r440, 8;
or.b32 %r442, %r3570, -2147483648;
shr.u32 %r443, %r441, 5;
mov.u64 %rd2540, 0;
mov.u32 %r8310, 0;
mov.u64 %rd966, __cudart_i2opi_f;
mov.u64 %rd2541, %rd2540;
$L__BB0_346:
.pragma "nounroll";
shl.b64 %rd965, %rd2540, 2;
add.s64 %rd967, %rd966, %rd965;
ld.global.nc.u32 %r3571, [%rd967];
mad.wide.u32 %rd968, %r3571, %r442, %rd2541;
shr.u64 %rd2541, %rd968, 32;
add.s64 %rd969, %rd1, %rd965;
st.local.u32 [%rd969], %rd968;
add.s32 %r8310, %r8310, 1;
cvt.s64.s32 %rd2540, %r8310;
setp.ne.s32 %p316, %r8310, 6;
@%p316 bra $L__BB0_346;
st.local.u32 [%rd5], %rd2541;
mov.u32 %r3572, 4;
sub.s32 %r446, %r3572, %r443;
mov.u32 %r3573, 6;
sub.s32 %r3574, %r3573, %r443;
mul.wide.s32 %rd970, %r3574, 4;
add.s64 %rd971, %rd1, %rd970;
ld.local.u32 %r8311, [%rd971];
ld.local.u32 %r8312, [%rd971+-4];
and.b32 %r449, %r441, 31;
setp.eq.s32 %p317, %r449, 0;
@%p317 bra $L__BB0_349;
mov.u32 %r3575, 32;
sub.s32 %r3576, %r3575, %r449;
shr.u32 %r3577, %r8312, %r3576;
shl.b32 %r3578, %r8311, %r449;
add.s32 %r8311, %r3577, %r3578;
mul.wide.s32 %rd972, %r446, 4;
add.s64 %rd973, %rd1, %rd972;
ld.local.u32 %r3579, [%rd973];
shr.u32 %r3580, %r3579, %r3576;
shl.b32 %r3581, %r8312, %r449;
add.s32 %r8312, %r3580, %r3581;
$L__BB0_349:
and.b32 %r3582, %r440, -2147483648;
shr.u32 %r3583, %r8312, 30;
shl.b32 %r3584, %r8311, 2;
or.b32 %r3585, %r3583, %r3584;
shr.u32 %r3586, %r3585, 31;
shr.u32 %r3587, %r8311, 30;
add.s32 %r3588, %r3586, %r3587;
neg.s32 %r3589, %r3588;
setp.eq.s32 %p318, %r3582, 0;
selp.b32 %r8313, %r3588, %r3589, %p318;
setp.ne.s32 %p319, %r3586, 0;
xor.b32 %r3590, %r3582, -2147483648;
selp.b32 %r3591, %r3590, %r3582, %p319;
selp.b32 %r3592, -1, 0, %p319;
xor.b32 %r3593, %r3585, %r3592;
shl.b32 %r3594, %r8312, 2;
xor.b32 %r3595, %r3594, %r3592;
cvt.u64.u32 %rd974, %r3593;
cvt.u64.u32 %rd975, %r3595;
bfi.b64 %rd976, %rd974, %rd975, 32, 32;
cvt.rn.f64.s64 %fd43, %rd976;
mul.f64 %fd44, %fd43, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2722, %fd44;
setp.eq.s32 %p320, %r3591, 0;
neg.f32 %f2723, %f2722;
selp.f32 %f5292, %f2722, %f2723, %p320;
$L__BB0_351:
add.s32 %r456, %r8313, 1;
and.b32 %r457, %r456, 1;
setp.eq.s32 %p321, %r457, 0;
selp.f32 %f379, %f5292, 0f3F800000, %p321;
mul.rn.f32 %f380, %f5292, %f5292;
mov.f32 %f5293, 0fB94D4153;
@%p321 bra $L__BB0_353;
mov.f32 %f2726, 0fBAB607ED;
mov.f32 %f2727, 0f37CBAC00;
fma.rn.f32 %f5293, %f2727, %f380, %f2726;
$L__BB0_353:
selp.f32 %f2728, 0f3C0885E4, 0f3D2AAABB, %p321;
fma.rn.f32 %f2729, %f5293, %f380, %f2728;
selp.f32 %f2730, 0fBE2AAAA8, 0fBEFFFFFF, %p321;
fma.rn.f32 %f2731, %f2729, %f380, %f2730;
mov.f32 %f2732, 0f00000000;
fma.rn.f32 %f2733, %f380, %f379, %f2732;
fma.rn.f32 %f5294, %f2731, %f2733, %f379;
and.b32 %r3597, %r456, 2;
setp.eq.s32 %p323, %r3597, 0;
@%p323 bra $L__BB0_355;
mov.f32 %f2735, 0fBF800000;
fma.rn.f32 %f5294, %f5294, %f2735, %f2732;
$L__BB0_355:
add.f32 %f5330, %f5291, %f5294;
mul.f32 %f2736, %f307, 0f3F22F983;
cvt.rni.s32.f32 %r8317, %f2736;
cvt.rn.f32.s32 %f2737, %r8317;
mov.f32 %f2738, 0fBFC90FDA;
fma.rn.f32 %f2739, %f2737, %f2738, %f307;
mov.f32 %f2740, 0fB3A22168;
fma.rn.f32 %f2741, %f2737, %f2740, %f2739;
mov.f32 %f2742, 0fA7C234C5;
fma.rn.f32 %f5295, %f2737, %f2742, %f2741;
abs.f32 %f388, %f307;
setp.ltu.f32 %p324, %f388, 0f47CE4780;
@%p324 bra $L__BB0_363;
setp.eq.f32 %p325, %f388, 0f7F800000;
@%p325 bra $L__BB0_362;
bra.uni $L__BB0_357;
$L__BB0_362:
mov.f32 %f2745, 0f00000000;
mul.rn.f32 %f5295, %f307, %f2745;
mov.u32 %r8317, 0;
bra.uni $L__BB0_363;
$L__BB0_357:
mov.b32 %r459, %f307;
shr.u32 %r3599, %r459, 23;
and.b32 %r3600, %r3599, 255;
add.s32 %r460, %r3600, -128;
shl.b32 %r3601, %r459, 8;
or.b32 %r461, %r3601, -2147483648;
shr.u32 %r462, %r460, 5;
mov.u64 %rd2542, 0;
mov.u32 %r8314, 0;
mov.u64 %rd980, __cudart_i2opi_f;
mov.u64 %rd2543, %rd2542;
$L__BB0_358:
.pragma "nounroll";
shl.b64 %rd979, %rd2542, 2;
add.s64 %rd981, %rd980, %rd979;
ld.global.nc.u32 %r3602, [%rd981];
mad.wide.u32 %rd982, %r3602, %r461, %rd2543;
shr.u64 %rd2543, %rd982, 32;
add.s64 %rd983, %rd1, %rd979;
st.local.u32 [%rd983], %rd982;
add.s32 %r8314, %r8314, 1;
cvt.s64.s32 %rd2542, %r8314;
setp.ne.s32 %p326, %r8314, 6;
@%p326 bra $L__BB0_358;
st.local.u32 [%rd5], %rd2543;
mov.u32 %r3603, 4;
sub.s32 %r465, %r3603, %r462;
mov.u32 %r3604, 6;
sub.s32 %r3605, %r3604, %r462;
mul.wide.s32 %rd984, %r3605, 4;
add.s64 %rd985, %rd1, %rd984;
ld.local.u32 %r8315, [%rd985];
ld.local.u32 %r8316, [%rd985+-4];
and.b32 %r468, %r460, 31;
setp.eq.s32 %p327, %r468, 0;
@%p327 bra $L__BB0_361;
mov.u32 %r3606, 32;
sub.s32 %r3607, %r3606, %r468;
shr.u32 %r3608, %r8316, %r3607;
shl.b32 %r3609, %r8315, %r468;
add.s32 %r8315, %r3608, %r3609;
mul.wide.s32 %rd986, %r465, 4;
add.s64 %rd987, %rd1, %rd986;
ld.local.u32 %r3610, [%rd987];
shr.u32 %r3611, %r3610, %r3607;
shl.b32 %r3612, %r8316, %r468;
add.s32 %r8316, %r3611, %r3612;
$L__BB0_361:
and.b32 %r3613, %r459, -2147483648;
shr.u32 %r3614, %r8316, 30;
shl.b32 %r3615, %r8315, 2;
or.b32 %r3616, %r3614, %r3615;
shr.u32 %r3617, %r3616, 31;
shr.u32 %r3618, %r8315, 30;
add.s32 %r3619, %r3617, %r3618;
neg.s32 %r3620, %r3619;
setp.eq.s32 %p328, %r3613, 0;
selp.b32 %r8317, %r3619, %r3620, %p328;
setp.ne.s32 %p329, %r3617, 0;
xor.b32 %r3621, %r3613, -2147483648;
selp.b32 %r3622, %r3621, %r3613, %p329;
selp.b32 %r3623, -1, 0, %p329;
xor.b32 %r3624, %r3616, %r3623;
shl.b32 %r3625, %r8316, 2;
xor.b32 %r3626, %r3625, %r3623;
cvt.u64.u32 %rd988, %r3624;
cvt.u64.u32 %rd989, %r3626;
bfi.b64 %rd990, %rd988, %rd989, 32, 32;
cvt.rn.f64.s64 %fd45, %rd990;
mul.f64 %fd46, %fd45, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2743, %fd46;
setp.eq.s32 %p330, %r3622, 0;
neg.f32 %f2744, %f2743;
selp.f32 %f5295, %f2743, %f2744, %p330;
$L__BB0_363:
and.b32 %r475, %r8317, 1;
setp.eq.s32 %p331, %r475, 0;
selp.f32 %f392, %f5295, 0f3F800000, %p331;
mul.rn.f32 %f393, %f5295, %f5295;
mov.f32 %f5296, 0fB94D4153;
@%p331 bra $L__BB0_365;
mov.f32 %f2747, 0fBAB607ED;
mov.f32 %f2748, 0f37CBAC00;
fma.rn.f32 %f5296, %f2748, %f393, %f2747;
$L__BB0_365:
selp.f32 %f2749, 0f3C0885E4, 0f3D2AAABB, %p331;
fma.rn.f32 %f2750, %f5296, %f393, %f2749;
selp.f32 %f2751, 0fBE2AAAA8, 0fBEFFFFFF, %p331;
fma.rn.f32 %f2752, %f2750, %f393, %f2751;
mov.f32 %f2753, 0f00000000;
fma.rn.f32 %f2754, %f393, %f392, %f2753;
fma.rn.f32 %f5297, %f2752, %f2754, %f392;
and.b32 %r3628, %r8317, 2;
setp.eq.s32 %p333, %r3628, 0;
@%p333 bra $L__BB0_367;
mov.f32 %f2756, 0fBF800000;
fma.rn.f32 %f5297, %f5297, %f2756, %f2753;
$L__BB0_367:
mul.f32 %f2757, %f299, 0f3F22F983;
cvt.rni.s32.f32 %r8321, %f2757;
cvt.rn.f32.s32 %f2758, %r8321;
mov.f32 %f2759, 0fBFC90FDA;
fma.rn.f32 %f2760, %f2758, %f2759, %f299;
mov.f32 %f2761, 0fB3A22168;
fma.rn.f32 %f2762, %f2758, %f2761, %f2760;
mov.f32 %f2763, 0fA7C234C5;
fma.rn.f32 %f5298, %f2758, %f2763, %f2762;
abs.f32 %f400, %f299;
setp.ltu.f32 %p334, %f400, 0f47CE4780;
@%p334 bra $L__BB0_375;
setp.eq.f32 %p335, %f400, 0f7F800000;
@%p335 bra $L__BB0_374;
bra.uni $L__BB0_369;
$L__BB0_374:
mov.f32 %f2766, 0f00000000;
mul.rn.f32 %f5298, %f299, %f2766;
mov.u32 %r8321, 0;
bra.uni $L__BB0_375;
$L__BB0_369:
mov.b32 %r477, %f299;
shr.u32 %r3630, %r477, 23;
and.b32 %r3631, %r3630, 255;
add.s32 %r478, %r3631, -128;
shl.b32 %r3632, %r477, 8;
or.b32 %r479, %r3632, -2147483648;
shr.u32 %r480, %r478, 5;
mov.u64 %rd2544, 0;
mov.u32 %r8318, 0;
mov.u64 %rd994, __cudart_i2opi_f;
mov.u64 %rd2545, %rd2544;
$L__BB0_370:
.pragma "nounroll";
shl.b64 %rd993, %rd2544, 2;
add.s64 %rd995, %rd994, %rd993;
ld.global.nc.u32 %r3633, [%rd995];
mad.wide.u32 %rd996, %r3633, %r479, %rd2545;
shr.u64 %rd2545, %rd996, 32;
add.s64 %rd997, %rd1, %rd993;
st.local.u32 [%rd997], %rd996;
add.s32 %r8318, %r8318, 1;
cvt.s64.s32 %rd2544, %r8318;
setp.ne.s32 %p336, %r8318, 6;
@%p336 bra $L__BB0_370;
st.local.u32 [%rd5], %rd2545;
mov.u32 %r3634, 4;
sub.s32 %r483, %r3634, %r480;
mov.u32 %r3635, 6;
sub.s32 %r3636, %r3635, %r480;
mul.wide.s32 %rd998, %r3636, 4;
add.s64 %rd999, %rd1, %rd998;
ld.local.u32 %r8319, [%rd999];
ld.local.u32 %r8320, [%rd999+-4];
and.b32 %r486, %r478, 31;
setp.eq.s32 %p337, %r486, 0;
@%p337 bra $L__BB0_373;
mov.u32 %r3637, 32;
sub.s32 %r3638, %r3637, %r486;
shr.u32 %r3639, %r8320, %r3638;
shl.b32 %r3640, %r8319, %r486;
add.s32 %r8319, %r3639, %r3640;
mul.wide.s32 %rd1000, %r483, 4;
add.s64 %rd1001, %rd1, %rd1000;
ld.local.u32 %r3641, [%rd1001];
shr.u32 %r3642, %r3641, %r3638;
shl.b32 %r3643, %r8320, %r486;
add.s32 %r8320, %r3642, %r3643;
$L__BB0_373:
and.b32 %r3644, %r477, -2147483648;
shr.u32 %r3645, %r8320, 30;
shl.b32 %r3646, %r8319, 2;
or.b32 %r3647, %r3645, %r3646;
shr.u32 %r3648, %r3647, 31;
shr.u32 %r3649, %r8319, 30;
add.s32 %r3650, %r3648, %r3649;
neg.s32 %r3651, %r3650;
setp.eq.s32 %p338, %r3644, 0;
selp.b32 %r8321, %r3650, %r3651, %p338;
setp.ne.s32 %p339, %r3648, 0;
xor.b32 %r3652, %r3644, -2147483648;
selp.b32 %r3653, %r3652, %r3644, %p339;
selp.b32 %r3654, -1, 0, %p339;
xor.b32 %r3655, %r3647, %r3654;
shl.b32 %r3656, %r8320, 2;
xor.b32 %r3657, %r3656, %r3654;
cvt.u64.u32 %rd1002, %r3655;
cvt.u64.u32 %rd1003, %r3657;
bfi.b64 %rd1004, %rd1002, %rd1003, 32, 32;
cvt.rn.f64.s64 %fd47, %rd1004;
mul.f64 %fd48, %fd47, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2764, %fd48;
setp.eq.s32 %p340, %r3653, 0;
neg.f32 %f2765, %f2764;
selp.f32 %f5298, %f2764, %f2765, %p340;
$L__BB0_375:
add.s32 %r493, %r8321, 1;
and.b32 %r494, %r493, 1;
setp.eq.s32 %p341, %r494, 0;
selp.f32 %f404, %f5298, 0f3F800000, %p341;
mul.rn.f32 %f405, %f5298, %f5298;
mov.f32 %f5299, 0fB94D4153;
@%p341 bra $L__BB0_377;
mov.f32 %f2768, 0fBAB607ED;
mov.f32 %f2769, 0f37CBAC00;
fma.rn.f32 %f5299, %f2769, %f405, %f2768;
$L__BB0_377:
selp.f32 %f2770, 0f3C0885E4, 0f3D2AAABB, %p341;
fma.rn.f32 %f2771, %f5299, %f405, %f2770;
selp.f32 %f2772, 0fBE2AAAA8, 0fBEFFFFFF, %p341;
fma.rn.f32 %f2773, %f2771, %f405, %f2772;
mov.f32 %f2774, 0f00000000;
fma.rn.f32 %f2775, %f405, %f404, %f2774;
fma.rn.f32 %f5300, %f2773, %f2775, %f404;
and.b32 %r3659, %r493, 2;
setp.eq.s32 %p343, %r3659, 0;
@%p343 bra $L__BB0_379;
mov.f32 %f2777, 0fBF800000;
fma.rn.f32 %f5300, %f5300, %f2777, %f2774;
$L__BB0_379:
add.f32 %f5329, %f5297, %f5300;
mul.f32 %f2778, %f308, 0f3F22F983;
cvt.rni.s32.f32 %r8325, %f2778;
cvt.rn.f32.s32 %f2779, %r8325;
mov.f32 %f2780, 0fBFC90FDA;
fma.rn.f32 %f2781, %f2779, %f2780, %f308;
mov.f32 %f2782, 0fB3A22168;
fma.rn.f32 %f2783, %f2779, %f2782, %f2781;
mov.f32 %f2784, 0fA7C234C5;
fma.rn.f32 %f5301, %f2779, %f2784, %f2783;
abs.f32 %f413, %f308;
setp.ltu.f32 %p344, %f413, 0f47CE4780;
@%p344 bra $L__BB0_387;
setp.eq.f32 %p345, %f413, 0f7F800000;
@%p345 bra $L__BB0_386;
bra.uni $L__BB0_381;
$L__BB0_386:
mov.f32 %f2787, 0f00000000;
mul.rn.f32 %f5301, %f308, %f2787;
mov.u32 %r8325, 0;
bra.uni $L__BB0_387;
$L__BB0_381:
mov.b32 %r496, %f308;
shr.u32 %r3661, %r496, 23;
and.b32 %r3662, %r3661, 255;
add.s32 %r497, %r3662, -128;
shl.b32 %r3663, %r496, 8;
or.b32 %r498, %r3663, -2147483648;
shr.u32 %r499, %r497, 5;
mov.u64 %rd2546, 0;
mov.u32 %r8322, 0;
mov.u64 %rd1008, __cudart_i2opi_f;
mov.u64 %rd2547, %rd2546;
$L__BB0_382:
.pragma "nounroll";
shl.b64 %rd1007, %rd2546, 2;
add.s64 %rd1009, %rd1008, %rd1007;
ld.global.nc.u32 %r3664, [%rd1009];
mad.wide.u32 %rd1010, %r3664, %r498, %rd2547;
shr.u64 %rd2547, %rd1010, 32;
add.s64 %rd1011, %rd1, %rd1007;
st.local.u32 [%rd1011], %rd1010;
add.s32 %r8322, %r8322, 1;
cvt.s64.s32 %rd2546, %r8322;
setp.ne.s32 %p346, %r8322, 6;
@%p346 bra $L__BB0_382;
st.local.u32 [%rd5], %rd2547;
mov.u32 %r3665, 4;
sub.s32 %r502, %r3665, %r499;
mov.u32 %r3666, 6;
sub.s32 %r3667, %r3666, %r499;
mul.wide.s32 %rd1012, %r3667, 4;
add.s64 %rd1013, %rd1, %rd1012;
ld.local.u32 %r8323, [%rd1013];
ld.local.u32 %r8324, [%rd1013+-4];
and.b32 %r505, %r497, 31;
setp.eq.s32 %p347, %r505, 0;
@%p347 bra $L__BB0_385;
mov.u32 %r3668, 32;
sub.s32 %r3669, %r3668, %r505;
shr.u32 %r3670, %r8324, %r3669;
shl.b32 %r3671, %r8323, %r505;
add.s32 %r8323, %r3670, %r3671;
mul.wide.s32 %rd1014, %r502, 4;
add.s64 %rd1015, %rd1, %rd1014;
ld.local.u32 %r3672, [%rd1015];
shr.u32 %r3673, %r3672, %r3669;
shl.b32 %r3674, %r8324, %r505;
add.s32 %r8324, %r3673, %r3674;
$L__BB0_385:
and.b32 %r3675, %r496, -2147483648;
shr.u32 %r3676, %r8324, 30;
shl.b32 %r3677, %r8323, 2;
or.b32 %r3678, %r3676, %r3677;
shr.u32 %r3679, %r3678, 31;
shr.u32 %r3680, %r8323, 30;
add.s32 %r3681, %r3679, %r3680;
neg.s32 %r3682, %r3681;
setp.eq.s32 %p348, %r3675, 0;
selp.b32 %r8325, %r3681, %r3682, %p348;
setp.ne.s32 %p349, %r3679, 0;
xor.b32 %r3683, %r3675, -2147483648;
selp.b32 %r3684, %r3683, %r3675, %p349;
selp.b32 %r3685, -1, 0, %p349;
xor.b32 %r3686, %r3678, %r3685;
shl.b32 %r3687, %r8324, 2;
xor.b32 %r3688, %r3687, %r3685;
cvt.u64.u32 %rd1016, %r3686;
cvt.u64.u32 %rd1017, %r3688;
bfi.b64 %rd1018, %rd1016, %rd1017, 32, 32;
cvt.rn.f64.s64 %fd49, %rd1018;
mul.f64 %fd50, %fd49, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2785, %fd50;
setp.eq.s32 %p350, %r3684, 0;
neg.f32 %f2786, %f2785;
selp.f32 %f5301, %f2785, %f2786, %p350;
$L__BB0_387:
and.b32 %r512, %r8325, 1;
setp.eq.s32 %p351, %r512, 0;
selp.f32 %f417, %f5301, 0f3F800000, %p351;
mul.rn.f32 %f418, %f5301, %f5301;
mov.f32 %f5302, 0fB94D4153;
@%p351 bra $L__BB0_389;
mov.f32 %f2789, 0fBAB607ED;
mov.f32 %f2790, 0f37CBAC00;
fma.rn.f32 %f5302, %f2790, %f418, %f2789;
$L__BB0_389:
selp.f32 %f2791, 0f3C0885E4, 0f3D2AAABB, %p351;
fma.rn.f32 %f2792, %f5302, %f418, %f2791;
selp.f32 %f2793, 0fBE2AAAA8, 0fBEFFFFFF, %p351;
fma.rn.f32 %f2794, %f2792, %f418, %f2793;
mov.f32 %f2795, 0f00000000;
fma.rn.f32 %f2796, %f418, %f417, %f2795;
fma.rn.f32 %f5303, %f2794, %f2796, %f417;
and.b32 %r3690, %r8325, 2;
setp.eq.s32 %p353, %r3690, 0;
@%p353 bra $L__BB0_391;
mov.f32 %f2798, 0fBF800000;
fma.rn.f32 %f5303, %f5303, %f2798, %f2795;
$L__BB0_391:
mul.f32 %f2799, %f300, 0f3F22F983;
cvt.rni.s32.f32 %r8329, %f2799;
cvt.rn.f32.s32 %f2800, %r8329;
mov.f32 %f2801, 0fBFC90FDA;
fma.rn.f32 %f2802, %f2800, %f2801, %f300;
mov.f32 %f2803, 0fB3A22168;
fma.rn.f32 %f2804, %f2800, %f2803, %f2802;
mov.f32 %f2805, 0fA7C234C5;
fma.rn.f32 %f5304, %f2800, %f2805, %f2804;
abs.f32 %f425, %f300;
setp.ltu.f32 %p354, %f425, 0f47CE4780;
@%p354 bra $L__BB0_399;
setp.eq.f32 %p355, %f425, 0f7F800000;
@%p355 bra $L__BB0_398;
bra.uni $L__BB0_393;
$L__BB0_398:
mov.f32 %f2808, 0f00000000;
mul.rn.f32 %f5304, %f300, %f2808;
mov.u32 %r8329, 0;
bra.uni $L__BB0_399;
$L__BB0_393:
mov.b32 %r514, %f300;
shr.u32 %r3692, %r514, 23;
and.b32 %r3693, %r3692, 255;
add.s32 %r515, %r3693, -128;
shl.b32 %r3694, %r514, 8;
or.b32 %r516, %r3694, -2147483648;
shr.u32 %r517, %r515, 5;
mov.u64 %rd2548, 0;
mov.u32 %r8326, 0;
mov.u64 %rd1022, __cudart_i2opi_f;
mov.u64 %rd2549, %rd2548;
$L__BB0_394:
.pragma "nounroll";
shl.b64 %rd1021, %rd2548, 2;
add.s64 %rd1023, %rd1022, %rd1021;
ld.global.nc.u32 %r3695, [%rd1023];
mad.wide.u32 %rd1024, %r3695, %r516, %rd2549;
shr.u64 %rd2549, %rd1024, 32;
add.s64 %rd1025, %rd1, %rd1021;
st.local.u32 [%rd1025], %rd1024;
add.s32 %r8326, %r8326, 1;
cvt.s64.s32 %rd2548, %r8326;
setp.ne.s32 %p356, %r8326, 6;
@%p356 bra $L__BB0_394;
st.local.u32 [%rd5], %rd2549;
mov.u32 %r3696, 4;
sub.s32 %r520, %r3696, %r517;
mov.u32 %r3697, 6;
sub.s32 %r3698, %r3697, %r517;
mul.wide.s32 %rd1026, %r3698, 4;
add.s64 %rd1027, %rd1, %rd1026;
ld.local.u32 %r8327, [%rd1027];
ld.local.u32 %r8328, [%rd1027+-4];
and.b32 %r523, %r515, 31;
setp.eq.s32 %p357, %r523, 0;
@%p357 bra $L__BB0_397;
mov.u32 %r3699, 32;
sub.s32 %r3700, %r3699, %r523;
shr.u32 %r3701, %r8328, %r3700;
shl.b32 %r3702, %r8327, %r523;
add.s32 %r8327, %r3701, %r3702;
mul.wide.s32 %rd1028, %r520, 4;
add.s64 %rd1029, %rd1, %rd1028;
ld.local.u32 %r3703, [%rd1029];
shr.u32 %r3704, %r3703, %r3700;
shl.b32 %r3705, %r8328, %r523;
add.s32 %r8328, %r3704, %r3705;
$L__BB0_397:
and.b32 %r3706, %r514, -2147483648;
shr.u32 %r3707, %r8328, 30;
shl.b32 %r3708, %r8327, 2;
or.b32 %r3709, %r3707, %r3708;
shr.u32 %r3710, %r3709, 31;
shr.u32 %r3711, %r8327, 30;
add.s32 %r3712, %r3710, %r3711;
neg.s32 %r3713, %r3712;
setp.eq.s32 %p358, %r3706, 0;
selp.b32 %r8329, %r3712, %r3713, %p358;
setp.ne.s32 %p359, %r3710, 0;
xor.b32 %r3714, %r3706, -2147483648;
selp.b32 %r3715, %r3714, %r3706, %p359;
selp.b32 %r3716, -1, 0, %p359;
xor.b32 %r3717, %r3709, %r3716;
shl.b32 %r3718, %r8328, 2;
xor.b32 %r3719, %r3718, %r3716;
cvt.u64.u32 %rd1030, %r3717;
cvt.u64.u32 %rd1031, %r3719;
bfi.b64 %rd1032, %rd1030, %rd1031, 32, 32;
cvt.rn.f64.s64 %fd51, %rd1032;
mul.f64 %fd52, %fd51, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2806, %fd52;
setp.eq.s32 %p360, %r3715, 0;
neg.f32 %f2807, %f2806;
selp.f32 %f5304, %f2806, %f2807, %p360;
$L__BB0_399:
add.s32 %r530, %r8329, 1;
and.b32 %r531, %r530, 1;
setp.eq.s32 %p361, %r531, 0;
selp.f32 %f429, %f5304, 0f3F800000, %p361;
mul.rn.f32 %f430, %f5304, %f5304;
mov.f32 %f5305, 0fB94D4153;
@%p361 bra $L__BB0_401;
mov.f32 %f2810, 0fBAB607ED;
mov.f32 %f2811, 0f37CBAC00;
fma.rn.f32 %f5305, %f2811, %f430, %f2810;
$L__BB0_401:
selp.f32 %f2812, 0f3C0885E4, 0f3D2AAABB, %p361;
fma.rn.f32 %f2813, %f5305, %f430, %f2812;
selp.f32 %f2814, 0fBE2AAAA8, 0fBEFFFFFF, %p361;
fma.rn.f32 %f2815, %f2813, %f430, %f2814;
mov.f32 %f2816, 0f00000000;
fma.rn.f32 %f2817, %f430, %f429, %f2816;
fma.rn.f32 %f5306, %f2815, %f2817, %f429;
and.b32 %r3721, %r530, 2;
setp.eq.s32 %p363, %r3721, 0;
@%p363 bra $L__BB0_403;
mov.f32 %f2819, 0fBF800000;
fma.rn.f32 %f5306, %f5306, %f2819, %f2816;
$L__BB0_403:
add.f32 %f5328, %f5303, %f5306;
mul.f32 %f2820, %f309, 0f3F22F983;
cvt.rni.s32.f32 %r8333, %f2820;
cvt.rn.f32.s32 %f2821, %r8333;
mov.f32 %f2822, 0fBFC90FDA;
fma.rn.f32 %f2823, %f2821, %f2822, %f309;
mov.f32 %f2824, 0fB3A22168;
fma.rn.f32 %f2825, %f2821, %f2824, %f2823;
mov.f32 %f2826, 0fA7C234C5;
fma.rn.f32 %f5307, %f2821, %f2826, %f2825;
abs.f32 %f438, %f309;
setp.ltu.f32 %p364, %f438, 0f47CE4780;
@%p364 bra $L__BB0_411;
setp.eq.f32 %p365, %f438, 0f7F800000;
@%p365 bra $L__BB0_410;
bra.uni $L__BB0_405;
$L__BB0_410:
mov.f32 %f2829, 0f00000000;
mul.rn.f32 %f5307, %f309, %f2829;
mov.u32 %r8333, 0;
bra.uni $L__BB0_411;
$L__BB0_405:
mov.b32 %r533, %f309;
shr.u32 %r3723, %r533, 23;
and.b32 %r3724, %r3723, 255;
add.s32 %r534, %r3724, -128;
shl.b32 %r3725, %r533, 8;
or.b32 %r535, %r3725, -2147483648;
shr.u32 %r536, %r534, 5;
mov.u64 %rd2550, 0;
mov.u32 %r8330, 0;
mov.u64 %rd1036, __cudart_i2opi_f;
mov.u64 %rd2551, %rd2550;
$L__BB0_406:
.pragma "nounroll";
shl.b64 %rd1035, %rd2550, 2;
add.s64 %rd1037, %rd1036, %rd1035;
ld.global.nc.u32 %r3726, [%rd1037];
mad.wide.u32 %rd1038, %r3726, %r535, %rd2551;
shr.u64 %rd2551, %rd1038, 32;
add.s64 %rd1039, %rd1, %rd1035;
st.local.u32 [%rd1039], %rd1038;
add.s32 %r8330, %r8330, 1;
cvt.s64.s32 %rd2550, %r8330;
setp.ne.s32 %p366, %r8330, 6;
@%p366 bra $L__BB0_406;
st.local.u32 [%rd5], %rd2551;
mov.u32 %r3727, 4;
sub.s32 %r539, %r3727, %r536;
mov.u32 %r3728, 6;
sub.s32 %r3729, %r3728, %r536;
mul.wide.s32 %rd1040, %r3729, 4;
add.s64 %rd1041, %rd1, %rd1040;
ld.local.u32 %r8331, [%rd1041];
ld.local.u32 %r8332, [%rd1041+-4];
and.b32 %r542, %r534, 31;
setp.eq.s32 %p367, %r542, 0;
@%p367 bra $L__BB0_409;
mov.u32 %r3730, 32;
sub.s32 %r3731, %r3730, %r542;
shr.u32 %r3732, %r8332, %r3731;
shl.b32 %r3733, %r8331, %r542;
add.s32 %r8331, %r3732, %r3733;
mul.wide.s32 %rd1042, %r539, 4;
add.s64 %rd1043, %rd1, %rd1042;
ld.local.u32 %r3734, [%rd1043];
shr.u32 %r3735, %r3734, %r3731;
shl.b32 %r3736, %r8332, %r542;
add.s32 %r8332, %r3735, %r3736;
$L__BB0_409:
and.b32 %r3737, %r533, -2147483648;
shr.u32 %r3738, %r8332, 30;
shl.b32 %r3739, %r8331, 2;
or.b32 %r3740, %r3738, %r3739;
shr.u32 %r3741, %r3740, 31;
shr.u32 %r3742, %r8331, 30;
add.s32 %r3743, %r3741, %r3742;
neg.s32 %r3744, %r3743;
setp.eq.s32 %p368, %r3737, 0;
selp.b32 %r8333, %r3743, %r3744, %p368;
setp.ne.s32 %p369, %r3741, 0;
xor.b32 %r3745, %r3737, -2147483648;
selp.b32 %r3746, %r3745, %r3737, %p369;
selp.b32 %r3747, -1, 0, %p369;
xor.b32 %r3748, %r3740, %r3747;
shl.b32 %r3749, %r8332, 2;
xor.b32 %r3750, %r3749, %r3747;
cvt.u64.u32 %rd1044, %r3748;
cvt.u64.u32 %rd1045, %r3750;
bfi.b64 %rd1046, %rd1044, %rd1045, 32, 32;
cvt.rn.f64.s64 %fd53, %rd1046;
mul.f64 %fd54, %fd53, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2827, %fd54;
setp.eq.s32 %p370, %r3746, 0;
neg.f32 %f2828, %f2827;
selp.f32 %f5307, %f2827, %f2828, %p370;
$L__BB0_411:
and.b32 %r549, %r8333, 1;
setp.eq.s32 %p371, %r549, 0;
selp.f32 %f442, %f5307, 0f3F800000, %p371;
mul.rn.f32 %f443, %f5307, %f5307;
mov.f32 %f5308, 0fB94D4153;
@%p371 bra $L__BB0_413;
mov.f32 %f2831, 0fBAB607ED;
mov.f32 %f2832, 0f37CBAC00;
fma.rn.f32 %f5308, %f2832, %f443, %f2831;
$L__BB0_413:
selp.f32 %f2833, 0f3C0885E4, 0f3D2AAABB, %p371;
fma.rn.f32 %f2834, %f5308, %f443, %f2833;
selp.f32 %f2835, 0fBE2AAAA8, 0fBEFFFFFF, %p371;
fma.rn.f32 %f2836, %f2834, %f443, %f2835;
mov.f32 %f2837, 0f00000000;
fma.rn.f32 %f2838, %f443, %f442, %f2837;
fma.rn.f32 %f5309, %f2836, %f2838, %f442;
and.b32 %r3752, %r8333, 2;
setp.eq.s32 %p373, %r3752, 0;
@%p373 bra $L__BB0_415;
mov.f32 %f2840, 0fBF800000;
fma.rn.f32 %f5309, %f5309, %f2840, %f2837;
$L__BB0_415:
mul.f32 %f2841, %f301, 0f3F22F983;
cvt.rni.s32.f32 %r8337, %f2841;
cvt.rn.f32.s32 %f2842, %r8337;
mov.f32 %f2843, 0fBFC90FDA;
fma.rn.f32 %f2844, %f2842, %f2843, %f301;
mov.f32 %f2845, 0fB3A22168;
fma.rn.f32 %f2846, %f2842, %f2845, %f2844;
mov.f32 %f2847, 0fA7C234C5;
fma.rn.f32 %f5310, %f2842, %f2847, %f2846;
abs.f32 %f450, %f301;
setp.ltu.f32 %p374, %f450, 0f47CE4780;
@%p374 bra $L__BB0_423;
setp.eq.f32 %p375, %f450, 0f7F800000;
@%p375 bra $L__BB0_422;
bra.uni $L__BB0_417;
$L__BB0_422:
mov.f32 %f2850, 0f00000000;
mul.rn.f32 %f5310, %f301, %f2850;
mov.u32 %r8337, 0;
bra.uni $L__BB0_423;
$L__BB0_417:
mov.b32 %r551, %f301;
shr.u32 %r3754, %r551, 23;
and.b32 %r3755, %r3754, 255;
add.s32 %r552, %r3755, -128;
shl.b32 %r3756, %r551, 8;
or.b32 %r553, %r3756, -2147483648;
shr.u32 %r554, %r552, 5;
mov.u64 %rd2552, 0;
mov.u32 %r8334, 0;
mov.u64 %rd1050, __cudart_i2opi_f;
mov.u64 %rd2553, %rd2552;
$L__BB0_418:
.pragma "nounroll";
shl.b64 %rd1049, %rd2552, 2;
add.s64 %rd1051, %rd1050, %rd1049;
ld.global.nc.u32 %r3757, [%rd1051];
mad.wide.u32 %rd1052, %r3757, %r553, %rd2553;
shr.u64 %rd2553, %rd1052, 32;
add.s64 %rd1053, %rd1, %rd1049;
st.local.u32 [%rd1053], %rd1052;
add.s32 %r8334, %r8334, 1;
cvt.s64.s32 %rd2552, %r8334;
setp.ne.s32 %p376, %r8334, 6;
@%p376 bra $L__BB0_418;
st.local.u32 [%rd5], %rd2553;
mov.u32 %r3758, 4;
sub.s32 %r557, %r3758, %r554;
mov.u32 %r3759, 6;
sub.s32 %r3760, %r3759, %r554;
mul.wide.s32 %rd1054, %r3760, 4;
add.s64 %rd1055, %rd1, %rd1054;
ld.local.u32 %r8335, [%rd1055];
ld.local.u32 %r8336, [%rd1055+-4];
and.b32 %r560, %r552, 31;
setp.eq.s32 %p377, %r560, 0;
@%p377 bra $L__BB0_421;
mov.u32 %r3761, 32;
sub.s32 %r3762, %r3761, %r560;
shr.u32 %r3763, %r8336, %r3762;
shl.b32 %r3764, %r8335, %r560;
add.s32 %r8335, %r3763, %r3764;
mul.wide.s32 %rd1056, %r557, 4;
add.s64 %rd1057, %rd1, %rd1056;
ld.local.u32 %r3765, [%rd1057];
shr.u32 %r3766, %r3765, %r3762;
shl.b32 %r3767, %r8336, %r560;
add.s32 %r8336, %r3766, %r3767;
$L__BB0_421:
and.b32 %r3768, %r551, -2147483648;
shr.u32 %r3769, %r8336, 30;
shl.b32 %r3770, %r8335, 2;
or.b32 %r3771, %r3769, %r3770;
shr.u32 %r3772, %r3771, 31;
shr.u32 %r3773, %r8335, 30;
add.s32 %r3774, %r3772, %r3773;
neg.s32 %r3775, %r3774;
setp.eq.s32 %p378, %r3768, 0;
selp.b32 %r8337, %r3774, %r3775, %p378;
setp.ne.s32 %p379, %r3772, 0;
xor.b32 %r3776, %r3768, -2147483648;
selp.b32 %r3777, %r3776, %r3768, %p379;
selp.b32 %r3778, -1, 0, %p379;
xor.b32 %r3779, %r3771, %r3778;
shl.b32 %r3780, %r8336, 2;
xor.b32 %r3781, %r3780, %r3778;
cvt.u64.u32 %rd1058, %r3779;
cvt.u64.u32 %rd1059, %r3781;
bfi.b64 %rd1060, %rd1058, %rd1059, 32, 32;
cvt.rn.f64.s64 %fd55, %rd1060;
mul.f64 %fd56, %fd55, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2848, %fd56;
setp.eq.s32 %p380, %r3777, 0;
neg.f32 %f2849, %f2848;
selp.f32 %f5310, %f2848, %f2849, %p380;
$L__BB0_423:
add.s32 %r567, %r8337, 1;
and.b32 %r568, %r567, 1;
setp.eq.s32 %p381, %r568, 0;
selp.f32 %f454, %f5310, 0f3F800000, %p381;
mul.rn.f32 %f455, %f5310, %f5310;
mov.f32 %f5311, 0fB94D4153;
@%p381 bra $L__BB0_425;
mov.f32 %f2852, 0fBAB607ED;
mov.f32 %f2853, 0f37CBAC00;
fma.rn.f32 %f5311, %f2853, %f455, %f2852;
$L__BB0_425:
selp.f32 %f2854, 0f3C0885E4, 0f3D2AAABB, %p381;
fma.rn.f32 %f2855, %f5311, %f455, %f2854;
selp.f32 %f2856, 0fBE2AAAA8, 0fBEFFFFFF, %p381;
fma.rn.f32 %f2857, %f2855, %f455, %f2856;
mov.f32 %f2858, 0f00000000;
fma.rn.f32 %f2859, %f455, %f454, %f2858;
fma.rn.f32 %f5312, %f2857, %f2859, %f454;
and.b32 %r3783, %r567, 2;
setp.eq.s32 %p383, %r3783, 0;
@%p383 bra $L__BB0_427;
mov.f32 %f2861, 0fBF800000;
fma.rn.f32 %f5312, %f5312, %f2861, %f2858;
$L__BB0_427:
add.f32 %f5327, %f5309, %f5312;
mul.f32 %f2862, %f310, 0f3F22F983;
cvt.rni.s32.f32 %r8341, %f2862;
cvt.rn.f32.s32 %f2863, %r8341;
mov.f32 %f2864, 0fBFC90FDA;
fma.rn.f32 %f2865, %f2863, %f2864, %f310;
mov.f32 %f2866, 0fB3A22168;
fma.rn.f32 %f2867, %f2863, %f2866, %f2865;
mov.f32 %f2868, 0fA7C234C5;
fma.rn.f32 %f5313, %f2863, %f2868, %f2867;
abs.f32 %f463, %f310;
setp.ltu.f32 %p384, %f463, 0f47CE4780;
@%p384 bra $L__BB0_435;
setp.eq.f32 %p385, %f463, 0f7F800000;
@%p385 bra $L__BB0_434;
bra.uni $L__BB0_429;
$L__BB0_434:
mov.f32 %f2871, 0f00000000;
mul.rn.f32 %f5313, %f310, %f2871;
mov.u32 %r8341, 0;
bra.uni $L__BB0_435;
$L__BB0_429:
mov.b32 %r570, %f310;
shr.u32 %r3785, %r570, 23;
and.b32 %r3786, %r3785, 255;
add.s32 %r571, %r3786, -128;
shl.b32 %r3787, %r570, 8;
or.b32 %r572, %r3787, -2147483648;
shr.u32 %r573, %r571, 5;
mov.u64 %rd2554, 0;
mov.u32 %r8338, 0;
mov.u64 %rd1064, __cudart_i2opi_f;
mov.u64 %rd2555, %rd2554;
$L__BB0_430:
.pragma "nounroll";
shl.b64 %rd1063, %rd2554, 2;
add.s64 %rd1065, %rd1064, %rd1063;
ld.global.nc.u32 %r3788, [%rd1065];
mad.wide.u32 %rd1066, %r3788, %r572, %rd2555;
shr.u64 %rd2555, %rd1066, 32;
add.s64 %rd1067, %rd1, %rd1063;
st.local.u32 [%rd1067], %rd1066;
add.s32 %r8338, %r8338, 1;
cvt.s64.s32 %rd2554, %r8338;
setp.ne.s32 %p386, %r8338, 6;
@%p386 bra $L__BB0_430;
st.local.u32 [%rd5], %rd2555;
mov.u32 %r3789, 4;
sub.s32 %r576, %r3789, %r573;
mov.u32 %r3790, 6;
sub.s32 %r3791, %r3790, %r573;
mul.wide.s32 %rd1068, %r3791, 4;
add.s64 %rd1069, %rd1, %rd1068;
ld.local.u32 %r8339, [%rd1069];
ld.local.u32 %r8340, [%rd1069+-4];
and.b32 %r579, %r571, 31;
setp.eq.s32 %p387, %r579, 0;
@%p387 bra $L__BB0_433;
mov.u32 %r3792, 32;
sub.s32 %r3793, %r3792, %r579;
shr.u32 %r3794, %r8340, %r3793;
shl.b32 %r3795, %r8339, %r579;
add.s32 %r8339, %r3794, %r3795;
mul.wide.s32 %rd1070, %r576, 4;
add.s64 %rd1071, %rd1, %rd1070;
ld.local.u32 %r3796, [%rd1071];
shr.u32 %r3797, %r3796, %r3793;
shl.b32 %r3798, %r8340, %r579;
add.s32 %r8340, %r3797, %r3798;
$L__BB0_433:
and.b32 %r3799, %r570, -2147483648;
shr.u32 %r3800, %r8340, 30;
shl.b32 %r3801, %r8339, 2;
or.b32 %r3802, %r3800, %r3801;
shr.u32 %r3803, %r3802, 31;
shr.u32 %r3804, %r8339, 30;
add.s32 %r3805, %r3803, %r3804;
neg.s32 %r3806, %r3805;
setp.eq.s32 %p388, %r3799, 0;
selp.b32 %r8341, %r3805, %r3806, %p388;
setp.ne.s32 %p389, %r3803, 0;
xor.b32 %r3807, %r3799, -2147483648;
selp.b32 %r3808, %r3807, %r3799, %p389;
selp.b32 %r3809, -1, 0, %p389;
xor.b32 %r3810, %r3802, %r3809;
shl.b32 %r3811, %r8340, 2;
xor.b32 %r3812, %r3811, %r3809;
cvt.u64.u32 %rd1072, %r3810;
cvt.u64.u32 %rd1073, %r3812;
bfi.b64 %rd1074, %rd1072, %rd1073, 32, 32;
cvt.rn.f64.s64 %fd57, %rd1074;
mul.f64 %fd58, %fd57, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2869, %fd58;
setp.eq.s32 %p390, %r3808, 0;
neg.f32 %f2870, %f2869;
selp.f32 %f5313, %f2869, %f2870, %p390;
$L__BB0_435:
and.b32 %r586, %r8341, 1;
setp.eq.s32 %p391, %r586, 0;
selp.f32 %f467, %f5313, 0f3F800000, %p391;
mul.rn.f32 %f468, %f5313, %f5313;
mov.f32 %f5314, 0fB94D4153;
@%p391 bra $L__BB0_437;
mov.f32 %f2873, 0fBAB607ED;
mov.f32 %f2874, 0f37CBAC00;
fma.rn.f32 %f5314, %f2874, %f468, %f2873;
$L__BB0_437:
selp.f32 %f2875, 0f3C0885E4, 0f3D2AAABB, %p391;
fma.rn.f32 %f2876, %f5314, %f468, %f2875;
selp.f32 %f2877, 0fBE2AAAA8, 0fBEFFFFFF, %p391;
fma.rn.f32 %f2878, %f2876, %f468, %f2877;
mov.f32 %f2879, 0f00000000;
fma.rn.f32 %f2880, %f468, %f467, %f2879;
fma.rn.f32 %f5315, %f2878, %f2880, %f467;
and.b32 %r3814, %r8341, 2;
setp.eq.s32 %p393, %r3814, 0;
@%p393 bra $L__BB0_439;
mov.f32 %f2882, 0fBF800000;
fma.rn.f32 %f5315, %f5315, %f2882, %f2879;
$L__BB0_439:
mul.f32 %f2883, %f302, 0f3F22F983;
cvt.rni.s32.f32 %r8345, %f2883;
cvt.rn.f32.s32 %f2884, %r8345;
mov.f32 %f2885, 0fBFC90FDA;
fma.rn.f32 %f2886, %f2884, %f2885, %f302;
mov.f32 %f2887, 0fB3A22168;
fma.rn.f32 %f2888, %f2884, %f2887, %f2886;
mov.f32 %f2889, 0fA7C234C5;
fma.rn.f32 %f5316, %f2884, %f2889, %f2888;
abs.f32 %f475, %f302;
setp.ltu.f32 %p394, %f475, 0f47CE4780;
@%p394 bra $L__BB0_447;
setp.eq.f32 %p395, %f475, 0f7F800000;
@%p395 bra $L__BB0_446;
bra.uni $L__BB0_441;
$L__BB0_446:
mov.f32 %f2892, 0f00000000;
mul.rn.f32 %f5316, %f302, %f2892;
mov.u32 %r8345, 0;
bra.uni $L__BB0_447;
$L__BB0_441:
mov.b32 %r588, %f302;
shr.u32 %r3816, %r588, 23;
and.b32 %r3817, %r3816, 255;
add.s32 %r589, %r3817, -128;
shl.b32 %r3818, %r588, 8;
or.b32 %r590, %r3818, -2147483648;
shr.u32 %r591, %r589, 5;
mov.u64 %rd2556, 0;
mov.u32 %r8342, 0;
mov.u64 %rd1078, __cudart_i2opi_f;
mov.u64 %rd2557, %rd2556;
$L__BB0_442:
.pragma "nounroll";
shl.b64 %rd1077, %rd2556, 2;
add.s64 %rd1079, %rd1078, %rd1077;
ld.global.nc.u32 %r3819, [%rd1079];
mad.wide.u32 %rd1080, %r3819, %r590, %rd2557;
shr.u64 %rd2557, %rd1080, 32;
add.s64 %rd1081, %rd1, %rd1077;
st.local.u32 [%rd1081], %rd1080;
add.s32 %r8342, %r8342, 1;
cvt.s64.s32 %rd2556, %r8342;
setp.ne.s32 %p396, %r8342, 6;
@%p396 bra $L__BB0_442;
st.local.u32 [%rd5], %rd2557;
mov.u32 %r3820, 4;
sub.s32 %r594, %r3820, %r591;
mov.u32 %r3821, 6;
sub.s32 %r3822, %r3821, %r591;
mul.wide.s32 %rd1082, %r3822, 4;
add.s64 %rd1083, %rd1, %rd1082;
ld.local.u32 %r8343, [%rd1083];
ld.local.u32 %r8344, [%rd1083+-4];
and.b32 %r597, %r589, 31;
setp.eq.s32 %p397, %r597, 0;
@%p397 bra $L__BB0_445;
mov.u32 %r3823, 32;
sub.s32 %r3824, %r3823, %r597;
shr.u32 %r3825, %r8344, %r3824;
shl.b32 %r3826, %r8343, %r597;
add.s32 %r8343, %r3825, %r3826;
mul.wide.s32 %rd1084, %r594, 4;
add.s64 %rd1085, %rd1, %rd1084;
ld.local.u32 %r3827, [%rd1085];
shr.u32 %r3828, %r3827, %r3824;
shl.b32 %r3829, %r8344, %r597;
add.s32 %r8344, %r3828, %r3829;
$L__BB0_445:
and.b32 %r3830, %r588, -2147483648;
shr.u32 %r3831, %r8344, 30;
shl.b32 %r3832, %r8343, 2;
or.b32 %r3833, %r3831, %r3832;
shr.u32 %r3834, %r3833, 31;
shr.u32 %r3835, %r8343, 30;
add.s32 %r3836, %r3834, %r3835;
neg.s32 %r3837, %r3836;
setp.eq.s32 %p398, %r3830, 0;
selp.b32 %r8345, %r3836, %r3837, %p398;
setp.ne.s32 %p399, %r3834, 0;
xor.b32 %r3838, %r3830, -2147483648;
selp.b32 %r3839, %r3838, %r3830, %p399;
selp.b32 %r3840, -1, 0, %p399;
xor.b32 %r3841, %r3833, %r3840;
shl.b32 %r3842, %r8344, 2;
xor.b32 %r3843, %r3842, %r3840;
cvt.u64.u32 %rd1086, %r3841;
cvt.u64.u32 %rd1087, %r3843;
bfi.b64 %rd1088, %rd1086, %rd1087, 32, 32;
cvt.rn.f64.s64 %fd59, %rd1088;
mul.f64 %fd60, %fd59, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2890, %fd60;
setp.eq.s32 %p400, %r3839, 0;
neg.f32 %f2891, %f2890;
selp.f32 %f5316, %f2890, %f2891, %p400;
$L__BB0_447:
add.s32 %r604, %r8345, 1;
and.b32 %r605, %r604, 1;
setp.eq.s32 %p401, %r605, 0;
selp.f32 %f479, %f5316, 0f3F800000, %p401;
mul.rn.f32 %f480, %f5316, %f5316;
mov.f32 %f5317, 0fB94D4153;
@%p401 bra $L__BB0_449;
mov.f32 %f2894, 0fBAB607ED;
mov.f32 %f2895, 0f37CBAC00;
fma.rn.f32 %f5317, %f2895, %f480, %f2894;
$L__BB0_449:
selp.f32 %f2896, 0f3C0885E4, 0f3D2AAABB, %p401;
fma.rn.f32 %f2897, %f5317, %f480, %f2896;
selp.f32 %f2898, 0fBE2AAAA8, 0fBEFFFFFF, %p401;
fma.rn.f32 %f2899, %f2897, %f480, %f2898;
mov.f32 %f2900, 0f00000000;
fma.rn.f32 %f2901, %f480, %f479, %f2900;
fma.rn.f32 %f5318, %f2899, %f2901, %f479;
and.b32 %r3845, %r604, 2;
setp.eq.s32 %p403, %r3845, 0;
@%p403 bra $L__BB0_451;
mov.f32 %f2903, 0fBF800000;
fma.rn.f32 %f5318, %f5318, %f2903, %f2900;
$L__BB0_451:
add.f32 %f5326, %f5315, %f5318;
mul.f32 %f2904, %f311, 0f3F22F983;
cvt.rni.s32.f32 %r8349, %f2904;
cvt.rn.f32.s32 %f2905, %r8349;
mov.f32 %f2906, 0fBFC90FDA;
fma.rn.f32 %f2907, %f2905, %f2906, %f311;
mov.f32 %f2908, 0fB3A22168;
fma.rn.f32 %f2909, %f2905, %f2908, %f2907;
mov.f32 %f2910, 0fA7C234C5;
fma.rn.f32 %f5319, %f2905, %f2910, %f2909;
abs.f32 %f488, %f311;
setp.ltu.f32 %p404, %f488, 0f47CE4780;
@%p404 bra $L__BB0_459;
setp.eq.f32 %p405, %f488, 0f7F800000;
@%p405 bra $L__BB0_458;
bra.uni $L__BB0_453;
$L__BB0_458:
mov.f32 %f2913, 0f00000000;
mul.rn.f32 %f5319, %f311, %f2913;
mov.u32 %r8349, 0;
bra.uni $L__BB0_459;
$L__BB0_453:
mov.b32 %r607, %f311;
shr.u32 %r3847, %r607, 23;
and.b32 %r3848, %r3847, 255;
add.s32 %r608, %r3848, -128;
shl.b32 %r3849, %r607, 8;
or.b32 %r609, %r3849, -2147483648;
shr.u32 %r610, %r608, 5;
mov.u64 %rd2558, 0;
mov.u32 %r8346, 0;
mov.u64 %rd1092, __cudart_i2opi_f;
mov.u64 %rd2559, %rd2558;
$L__BB0_454:
.pragma "nounroll";
shl.b64 %rd1091, %rd2558, 2;
add.s64 %rd1093, %rd1092, %rd1091;
ld.global.nc.u32 %r3850, [%rd1093];
mad.wide.u32 %rd1094, %r3850, %r609, %rd2559;
shr.u64 %rd2559, %rd1094, 32;
add.s64 %rd1095, %rd1, %rd1091;
st.local.u32 [%rd1095], %rd1094;
add.s32 %r8346, %r8346, 1;
cvt.s64.s32 %rd2558, %r8346;
setp.ne.s32 %p406, %r8346, 6;
@%p406 bra $L__BB0_454;
st.local.u32 [%rd5], %rd2559;
mov.u32 %r3851, 4;
sub.s32 %r613, %r3851, %r610;
mov.u32 %r3852, 6;
sub.s32 %r3853, %r3852, %r610;
mul.wide.s32 %rd1096, %r3853, 4;
add.s64 %rd1097, %rd1, %rd1096;
ld.local.u32 %r8347, [%rd1097];
ld.local.u32 %r8348, [%rd1097+-4];
and.b32 %r616, %r608, 31;
setp.eq.s32 %p407, %r616, 0;
@%p407 bra $L__BB0_457;
mov.u32 %r3854, 32;
sub.s32 %r3855, %r3854, %r616;
shr.u32 %r3856, %r8348, %r3855;
shl.b32 %r3857, %r8347, %r616;
add.s32 %r8347, %r3856, %r3857;
mul.wide.s32 %rd1098, %r613, 4;
add.s64 %rd1099, %rd1, %rd1098;
ld.local.u32 %r3858, [%rd1099];
shr.u32 %r3859, %r3858, %r3855;
shl.b32 %r3860, %r8348, %r616;
add.s32 %r8348, %r3859, %r3860;
$L__BB0_457:
and.b32 %r3861, %r607, -2147483648;
shr.u32 %r3862, %r8348, 30;
shl.b32 %r3863, %r8347, 2;
or.b32 %r3864, %r3862, %r3863;
shr.u32 %r3865, %r3864, 31;
shr.u32 %r3866, %r8347, 30;
add.s32 %r3867, %r3865, %r3866;
neg.s32 %r3868, %r3867;
setp.eq.s32 %p408, %r3861, 0;
selp.b32 %r8349, %r3867, %r3868, %p408;
setp.ne.s32 %p409, %r3865, 0;
xor.b32 %r3869, %r3861, -2147483648;
selp.b32 %r3870, %r3869, %r3861, %p409;
selp.b32 %r3871, -1, 0, %p409;
xor.b32 %r3872, %r3864, %r3871;
shl.b32 %r3873, %r8348, 2;
xor.b32 %r3874, %r3873, %r3871;
cvt.u64.u32 %rd1100, %r3872;
cvt.u64.u32 %rd1101, %r3874;
bfi.b64 %rd1102, %rd1100, %rd1101, 32, 32;
cvt.rn.f64.s64 %fd61, %rd1102;
mul.f64 %fd62, %fd61, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2911, %fd62;
setp.eq.s32 %p410, %r3870, 0;
neg.f32 %f2912, %f2911;
selp.f32 %f5319, %f2911, %f2912, %p410;
$L__BB0_459:
and.b32 %r623, %r8349, 1;
setp.eq.s32 %p411, %r623, 0;
selp.f32 %f492, %f5319, 0f3F800000, %p411;
mul.rn.f32 %f493, %f5319, %f5319;
mov.f32 %f5320, 0fB94D4153;
@%p411 bra $L__BB0_461;
mov.f32 %f2915, 0fBAB607ED;
mov.f32 %f2916, 0f37CBAC00;
fma.rn.f32 %f5320, %f2916, %f493, %f2915;
$L__BB0_461:
selp.f32 %f2917, 0f3C0885E4, 0f3D2AAABB, %p411;
fma.rn.f32 %f2918, %f5320, %f493, %f2917;
selp.f32 %f2919, 0fBE2AAAA8, 0fBEFFFFFF, %p411;
fma.rn.f32 %f2920, %f2918, %f493, %f2919;
mov.f32 %f2921, 0f00000000;
fma.rn.f32 %f2922, %f493, %f492, %f2921;
fma.rn.f32 %f5321, %f2920, %f2922, %f492;
and.b32 %r3876, %r8349, 2;
setp.eq.s32 %p413, %r3876, 0;
@%p413 bra $L__BB0_463;
mov.f32 %f2924, 0fBF800000;
fma.rn.f32 %f5321, %f5321, %f2924, %f2921;
$L__BB0_463:
mul.f32 %f2925, %f303, 0f3F22F983;
cvt.rni.s32.f32 %r8353, %f2925;
cvt.rn.f32.s32 %f2926, %r8353;
mov.f32 %f2927, 0fBFC90FDA;
fma.rn.f32 %f2928, %f2926, %f2927, %f303;
mov.f32 %f2929, 0fB3A22168;
fma.rn.f32 %f2930, %f2926, %f2929, %f2928;
mov.f32 %f2931, 0fA7C234C5;
fma.rn.f32 %f5322, %f2926, %f2931, %f2930;
abs.f32 %f500, %f303;
setp.ltu.f32 %p414, %f500, 0f47CE4780;
@%p414 bra $L__BB0_471;
setp.eq.f32 %p415, %f500, 0f7F800000;
@%p415 bra $L__BB0_470;
bra.uni $L__BB0_465;
$L__BB0_470:
mov.f32 %f2934, 0f00000000;
mul.rn.f32 %f5322, %f303, %f2934;
mov.u32 %r8353, 0;
bra.uni $L__BB0_471;
$L__BB0_465:
mov.b32 %r625, %f303;
shr.u32 %r3878, %r625, 23;
and.b32 %r3879, %r3878, 255;
add.s32 %r626, %r3879, -128;
shl.b32 %r3880, %r625, 8;
or.b32 %r627, %r3880, -2147483648;
shr.u32 %r628, %r626, 5;
mov.u64 %rd2560, 0;
mov.u32 %r8350, 0;
mov.u64 %rd1106, __cudart_i2opi_f;
mov.u64 %rd2561, %rd2560;
$L__BB0_466:
.pragma "nounroll";
shl.b64 %rd1105, %rd2560, 2;
add.s64 %rd1107, %rd1106, %rd1105;
ld.global.nc.u32 %r3881, [%rd1107];
mad.wide.u32 %rd1108, %r3881, %r627, %rd2561;
shr.u64 %rd2561, %rd1108, 32;
add.s64 %rd1109, %rd1, %rd1105;
st.local.u32 [%rd1109], %rd1108;
add.s32 %r8350, %r8350, 1;
cvt.s64.s32 %rd2560, %r8350;
setp.ne.s32 %p416, %r8350, 6;
@%p416 bra $L__BB0_466;
st.local.u32 [%rd5], %rd2561;
mov.u32 %r3882, 4;
sub.s32 %r631, %r3882, %r628;
mov.u32 %r3883, 6;
sub.s32 %r3884, %r3883, %r628;
mul.wide.s32 %rd1110, %r3884, 4;
add.s64 %rd1111, %rd1, %rd1110;
ld.local.u32 %r8351, [%rd1111];
ld.local.u32 %r8352, [%rd1111+-4];
and.b32 %r634, %r626, 31;
setp.eq.s32 %p417, %r634, 0;
@%p417 bra $L__BB0_469;
mov.u32 %r3885, 32;
sub.s32 %r3886, %r3885, %r634;
shr.u32 %r3887, %r8352, %r3886;
shl.b32 %r3888, %r8351, %r634;
add.s32 %r8351, %r3887, %r3888;
mul.wide.s32 %rd1112, %r631, 4;
add.s64 %rd1113, %rd1, %rd1112;
ld.local.u32 %r3889, [%rd1113];
shr.u32 %r3890, %r3889, %r3886;
shl.b32 %r3891, %r8352, %r634;
add.s32 %r8352, %r3890, %r3891;
$L__BB0_469:
and.b32 %r3892, %r625, -2147483648;
shr.u32 %r3893, %r8352, 30;
shl.b32 %r3894, %r8351, 2;
or.b32 %r3895, %r3893, %r3894;
shr.u32 %r3896, %r3895, 31;
shr.u32 %r3897, %r8351, 30;
add.s32 %r3898, %r3896, %r3897;
neg.s32 %r3899, %r3898;
setp.eq.s32 %p418, %r3892, 0;
selp.b32 %r8353, %r3898, %r3899, %p418;
setp.ne.s32 %p419, %r3896, 0;
xor.b32 %r3900, %r3892, -2147483648;
selp.b32 %r3901, %r3900, %r3892, %p419;
selp.b32 %r3902, -1, 0, %p419;
xor.b32 %r3903, %r3895, %r3902;
shl.b32 %r3904, %r8352, 2;
xor.b32 %r3905, %r3904, %r3902;
cvt.u64.u32 %rd1114, %r3903;
cvt.u64.u32 %rd1115, %r3905;
bfi.b64 %rd1116, %rd1114, %rd1115, 32, 32;
cvt.rn.f64.s64 %fd63, %rd1116;
mul.f64 %fd64, %fd63, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2932, %fd64;
setp.eq.s32 %p420, %r3901, 0;
neg.f32 %f2933, %f2932;
selp.f32 %f5322, %f2932, %f2933, %p420;
$L__BB0_471:
add.s32 %r641, %r8353, 1;
and.b32 %r642, %r641, 1;
setp.eq.s32 %p421, %r642, 0;
selp.f32 %f504, %f5322, 0f3F800000, %p421;
mul.rn.f32 %f505, %f5322, %f5322;
mov.f32 %f5323, 0fB94D4153;
@%p421 bra $L__BB0_473;
mov.f32 %f2936, 0fBAB607ED;
mov.f32 %f2937, 0f37CBAC00;
fma.rn.f32 %f5323, %f2937, %f505, %f2936;
$L__BB0_473:
selp.f32 %f2938, 0f3C0885E4, 0f3D2AAABB, %p421;
fma.rn.f32 %f2939, %f5323, %f505, %f2938;
selp.f32 %f2940, 0fBE2AAAA8, 0fBEFFFFFF, %p421;
fma.rn.f32 %f2941, %f2939, %f505, %f2940;
mov.f32 %f2942, 0f00000000;
fma.rn.f32 %f2943, %f505, %f504, %f2942;
fma.rn.f32 %f5324, %f2941, %f2943, %f504;
and.b32 %r3907, %r641, 2;
setp.eq.s32 %p423, %r3907, 0;
@%p423 bra $L__BB0_475;
mov.f32 %f2945, 0fBF800000;
fma.rn.f32 %f5324, %f5324, %f2945, %f2942;
$L__BB0_475:
add.f32 %f5325, %f5321, %f5324;
bra.uni $L__BB0_476;
$L__BB0_56:
mov.b32 %r2830, %f5348;
shl.b32 %r2831, %r2830, 8;
or.b32 %r47, %r2831, -2147483648;
mov.u64 %rd2498, 0;
mov.u32 %r8226, 0;
mov.u64 %rd648, __cudart_i2opi_f;
mov.u64 %rd2499, %rd2498;
$L__BB0_57:
.pragma "nounroll";
shl.b64 %rd647, %rd2498, 2;
add.s64 %rd649, %rd648, %rd647;
ld.global.nc.u32 %r2832, [%rd649];
mad.wide.u32 %rd650, %r2832, %r47, %rd2499;
shr.u64 %rd2499, %rd650, 32;
add.s64 %rd651, %rd1, %rd647;
st.local.u32 [%rd651], %rd650;
add.s32 %r8226, %r8226, 1;
cvt.s64.s32 %rd2498, %r8226;
setp.ne.s32 %p70, %r8226, 6;
@%p70 bra $L__BB0_57;
mov.b32 %r8066, %f5348;
shr.u32 %r2833, %r8066, 23;
and.b32 %r2834, %r2833, 255;
add.s32 %r2835, %r2834, -128;
shr.u32 %r2836, %r2835, 5;
st.local.u32 [%rd5], %rd2499;
and.b32 %r52, %r2835, 31;
mov.u32 %r2838, 6;
sub.s32 %r2839, %r2838, %r2836;
mul.wide.s32 %rd652, %r2839, 4;
add.s64 %rd653, %rd1, %rd652;
ld.local.u32 %r8227, [%rd653];
ld.local.u32 %r8228, [%rd653+-4];
setp.eq.s32 %p71, %r52, 0;
@%p71 bra $L__BB0_60;
mov.b32 %r8074, %f5348;
shr.u32 %r8073, %r8074, 23;
and.b32 %r8072, %r8073, 255;
add.s32 %r8071, %r8072, -128;
shr.u32 %r8070, %r8071, 5;
mov.u32 %r8069, 4;
sub.s32 %r8068, %r8069, %r8070;
mov.u32 %r2840, 32;
sub.s32 %r2841, %r2840, %r52;
shr.u32 %r2842, %r8228, %r2841;
shl.b32 %r2843, %r8227, %r52;
add.s32 %r8227, %r2842, %r2843;
mul.wide.s32 %rd654, %r8068, 4;
add.s64 %rd655, %rd1, %rd654;
ld.local.u32 %r2844, [%rd655];
shr.u32 %r2845, %r2844, %r2841;
shl.b32 %r2846, %r8228, %r52;
add.s32 %r8228, %r2845, %r2846;
$L__BB0_60:
mov.b32 %r8067, %f5348;
and.b32 %r2847, %r8067, -2147483648;
shr.u32 %r2848, %r8228, 30;
shl.b32 %r2849, %r8227, 2;
or.b32 %r2850, %r2848, %r2849;
shr.u32 %r2851, %r2850, 31;
shr.u32 %r2852, %r8227, 30;
add.s32 %r2853, %r2851, %r2852;
neg.s32 %r2854, %r2853;
setp.eq.s32 %p72, %r2847, 0;
selp.b32 %r8229, %r2853, %r2854, %p72;
setp.ne.s32 %p73, %r2851, 0;
xor.b32 %r2855, %r2847, -2147483648;
selp.b32 %r2856, %r2855, %r2847, %p73;
selp.b32 %r2857, -1, 0, %p73;
xor.b32 %r2858, %r2850, %r2857;
shl.b32 %r2859, %r8228, 2;
xor.b32 %r2860, %r2859, %r2857;
cvt.u64.u32 %rd656, %r2858;
cvt.u64.u32 %rd657, %r2860;
bfi.b64 %rd658, %rd656, %rd657, 32, 32;
cvt.rn.f64.s64 %fd1, %rd658;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2266, %fd2;
setp.eq.s32 %p74, %r2856, 0;
neg.f32 %f2267, %f2266;
selp.f32 %f5180, %f2266, %f2267, %p74;
$L__BB0_62:
and.b32 %r61, %r8229, 1;
setp.eq.s32 %p75, %r61, 0;
mul.rn.f32 %f39, %f5180, %f5180;
mov.f32 %f5181, 0fB94D4153;
@%p75 bra $L__BB0_64;
mov.f32 %f2270, 0fBAB607ED;
mov.f32 %f2271, 0f37CBAC00;
fma.rn.f32 %f5181, %f2271, %f39, %f2270;
$L__BB0_64:
and.b32 %r8075, %r8229, 1;
setp.eq.s32 %p1805, %r8075, 0;
selp.f32 %f5161, %f5180, 0f3F800000, %p1805;
selp.f32 %f2272, 0f3C0885E4, 0f3D2AAABB, %p1805;
fma.rn.f32 %f2273, %f5181, %f39, %f2272;
selp.f32 %f2274, 0fBE2AAAA8, 0fBEFFFFFF, %p1805;
fma.rn.f32 %f2275, %f2273, %f39, %f2274;
mov.f32 %f2276, 0f00000000;
fma.rn.f32 %f2277, %f39, %f5161, %f2276;
fma.rn.f32 %f5214, %f2275, %f2277, %f5161;
and.b32 %r2862, %r8229, 2;
setp.eq.s32 %p77, %r2862, 0;
@%p77 bra $L__BB0_66;
mov.f32 %f2279, 0fBF800000;
fma.rn.f32 %f5214, %f5214, %f2279, %f2276;
$L__BB0_66:
shl.b32 %r8065, %r12, 5;
neg.s32 %r8064, %r8065;
setp.ge.s32 %p1804, %r14, %r8064;
@%p1804 bra $L__BB0_79;
mul.f32 %f2281, %f5531, 0f3F22F983;
cvt.rni.s32.f32 %r8233, %f2281;
cvt.rn.f32.s32 %f2282, %r8233;
mov.f32 %f2283, 0fBFC90FDA;
fma.rn.f32 %f2284, %f2282, %f2283, %f5531;
mov.f32 %f2285, 0fB3A22168;
fma.rn.f32 %f2286, %f2282, %f2285, %f2284;
mov.f32 %f2287, 0fA7C234C5;
fma.rn.f32 %f5184, %f2282, %f2287, %f2286;
abs.f32 %f47, %f5531;
setp.ltu.f32 %p79, %f47, 0f47CE4780;
@%p79 bra $L__BB0_75;
setp.eq.f32 %p80, %f47, 0f7F800000;
@%p80 bra $L__BB0_74;
bra.uni $L__BB0_69;
$L__BB0_74:
mov.f32 %f2290, 0f00000000;
mul.rn.f32 %f5184, %f5531, %f2290;
mov.u32 %r8233, 0;
bra.uni $L__BB0_75;
$L__BB0_69:
mov.b32 %r63, %f5531;
shr.u32 %r2866, %r63, 23;
and.b32 %r2867, %r2866, 255;
shl.b32 %r2868, %r63, 8;
or.b32 %r65, %r2868, -2147483648;
mov.u64 %rd2500, 0;
mov.u32 %r8230, 0;
mov.u64 %rd662, __cudart_i2opi_f;
mov.u64 %rd2501, %rd2500;
$L__BB0_70:
.pragma "nounroll";
shl.b64 %rd661, %rd2500, 2;
add.s64 %rd663, %rd662, %rd661;
ld.global.nc.u32 %r2869, [%rd663];
mad.wide.u32 %rd664, %r2869, %r65, %rd2501;
shr.u64 %rd2501, %rd664, 32;
add.s64 %rd665, %rd1, %rd661;
st.local.u32 [%rd665], %rd664;
add.s32 %r8230, %r8230, 1;
cvt.s64.s32 %rd2500, %r8230;
setp.ne.s32 %p81, %r8230, 6;
@%p81 bra $L__BB0_70;
add.s32 %r8045, %r2867, -128;
mov.b32 %r8044, %f5531;
shr.u32 %r8043, %r8044, 23;
and.b32 %r8042, %r8043, 255;
add.s32 %r8041, %r8042, -128;
shr.u32 %r8040, %r8041, 5;
st.local.u32 [%rd5], %rd2501;
mov.u32 %r2871, 6;
sub.s32 %r2872, %r2871, %r8040;
mul.wide.s32 %rd666, %r2872, 4;
add.s64 %rd667, %rd1, %rd666;
ld.local.u32 %r8231, [%rd667];
ld.local.u32 %r8232, [%rd667+-4];
and.b32 %r72, %r8041, 31;
setp.eq.s32 %p82, %r72, 0;
@%p82 bra $L__BB0_73;
mov.b32 %r8082, %f5531;
shr.u32 %r8081, %r8082, 23;
and.b32 %r8080, %r8081, 255;
add.s32 %r8079, %r8080, -128;
shr.u32 %r8078, %r8079, 5;
mov.u32 %r8077, 4;
sub.s32 %r8076, %r8077, %r8078;
mov.u32 %r2873, 32;
sub.s32 %r2874, %r2873, %r72;
shr.u32 %r2875, %r8232, %r2874;
shl.b32 %r2876, %r8231, %r72;
add.s32 %r8231, %r2875, %r2876;
mul.wide.s32 %rd668, %r8076, 4;
add.s64 %rd669, %rd1, %rd668;
ld.local.u32 %r2877, [%rd669];
shr.u32 %r2878, %r2877, %r2874;
shl.b32 %r2879, %r8232, %r72;
add.s32 %r8232, %r2878, %r2879;
$L__BB0_73:
mov.b32 %r8046, %f5531;
and.b32 %r2880, %r8046, -2147483648;
shr.u32 %r2881, %r8232, 30;
shl.b32 %r2882, %r8231, 2;
or.b32 %r2883, %r2881, %r2882;
shr.u32 %r2884, %r2883, 31;
shr.u32 %r2885, %r8231, 30;
add.s32 %r2886, %r2884, %r2885;
neg.s32 %r2887, %r2886;
setp.eq.s32 %p83, %r2880, 0;
selp.b32 %r8233, %r2886, %r2887, %p83;
setp.ne.s32 %p84, %r2884, 0;
xor.b32 %r2888, %r2880, -2147483648;
selp.b32 %r2889, %r2888, %r2880, %p84;
selp.b32 %r2890, -1, 0, %p84;
xor.b32 %r2891, %r2883, %r2890;
shl.b32 %r2892, %r8232, 2;
xor.b32 %r2893, %r2892, %r2890;
cvt.u64.u32 %rd670, %r2891;
cvt.u64.u32 %rd671, %r2893;
bfi.b64 %rd672, %rd670, %rd671, 32, 32;
cvt.rn.f64.s64 %fd3, %rd672;
mul.f64 %fd4, %fd3, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2288, %fd4;
setp.eq.s32 %p85, %r2889, 0;
neg.f32 %f2289, %f2288;
selp.f32 %f5184, %f2288, %f2289, %p85;
$L__BB0_75:
add.s32 %r79, %r8233, 1;
and.b32 %r80, %r79, 1;
setp.eq.s32 %p86, %r80, 0;
selp.f32 %f51, %f5184, 0f3F800000, %p86;
mul.rn.f32 %f52, %f5184, %f5184;
mov.f32 %f5185, 0fB94D4153;
@%p86 bra $L__BB0_77;
mov.f32 %f2292, 0fBAB607ED;
mov.f32 %f2293, 0f37CBAC00;
fma.rn.f32 %f5185, %f2293, %f52, %f2292;
$L__BB0_77:
add.s32 %r8085, %r8233, 1;
add.s32 %r8084, %r8233, 1;
and.b32 %r8083, %r8084, 1;
setp.eq.s32 %p1806, %r8083, 0;
selp.f32 %f2294, 0f3C0885E4, 0f3D2AAABB, %p1806;
fma.rn.f32 %f2295, %f5185, %f52, %f2294;
selp.f32 %f2296, 0fBE2AAAA8, 0fBEFFFFFF, %p1806;
fma.rn.f32 %f2297, %f2295, %f52, %f2296;
mov.f32 %f2298, 0f00000000;
fma.rn.f32 %f2299, %f52, %f51, %f2298;
fma.rn.f32 %f5216, %f2297, %f2299, %f51;
and.b32 %r2895, %r8084, 2;
setp.eq.s32 %p88, %r2895, 0;
@%p88 bra $L__BB0_79;
mov.f32 %f2301, 0fBF800000;
fma.rn.f32 %f5216, %f5216, %f2301, %f2298;
$L__BB0_79:
add.f32 %f5332, %f5214, %f5216;
$L__BB0_80:
neg.s32 %r8048, %r12;
add.s32 %r8047, %r13, -15;
setp.ge.s32 %p1800, %r8047, %r8048;
mov.f32 %f5213, %f5214;
mov.f32 %f5215, %f5216;
@%p1800 bra $L__BB0_109;
shl.b32 %r2898, %r12, 5;
mov.u32 %r2899, -32;
sub.s32 %r81, %r2899, %r2898;
setp.ge.s32 %p90, %r14, %r81;
mov.f32 %f5213, %f5214;
@%p90 bra $L__BB0_94;
mul.f32 %f2303, %f5347, 0f3F22F983;
cvt.rni.s32.f32 %r8237, %f2303;
cvt.rn.f32.s32 %f2304, %r8237;
mov.f32 %f2305, 0fBFC90FDA;
fma.rn.f32 %f2306, %f2304, %f2305, %f5347;
mov.f32 %f2307, 0fB3A22168;
fma.rn.f32 %f2308, %f2304, %f2307, %f2306;
mov.f32 %f2309, 0fA7C234C5;
fma.rn.f32 %f5191, %f2304, %f2309, %f2308;
abs.f32 %f64, %f5347;
setp.ltu.f32 %p91, %f64, 0f47CE4780;
@%p91 bra $L__BB0_90;
setp.eq.f32 %p92, %f64, 0f7F800000;
@%p92 bra $L__BB0_89;
bra.uni $L__BB0_84;
$L__BB0_89:
mov.f32 %f2312, 0f00000000;
mul.rn.f32 %f5191, %f5347, %f2312;
mov.u32 %r8237, 0;
bra.uni $L__BB0_90;
$L__BB0_84:
mov.b32 %r83, %f5347;
shr.u32 %r2901, %r83, 23;
and.b32 %r2902, %r2901, 255;
shl.b32 %r2903, %r83, 8;
or.b32 %r85, %r2903, -2147483648;
mov.u64 %rd2502, 0;
mov.u32 %r8234, 0;
mov.u64 %rd676, __cudart_i2opi_f;
mov.u64 %rd2503, %rd2502;
$L__BB0_85:
.pragma "nounroll";
shl.b64 %rd675, %rd2502, 2;
add.s64 %rd677, %rd676, %rd675;
ld.global.nc.u32 %r2904, [%rd677];
mad.wide.u32 %rd678, %r2904, %r85, %rd2503;
shr.u64 %rd2503, %rd678, 32;
add.s64 %rd679, %rd1, %rd675;
st.local.u32 [%rd679], %rd678;
add.s32 %r8234, %r8234, 1;
cvt.s64.s32 %rd2502, %r8234;
setp.ne.s32 %p93, %r8234, 6;
@%p93 bra $L__BB0_85;
add.s32 %r8094, %r2902, -128;
mov.b32 %r8093, %f5347;
shr.u32 %r8092, %r8093, 23;
and.b32 %r8091, %r8092, 255;
add.s32 %r8090, %r8091, -128;
shr.u32 %r8089, %r8090, 5;
st.local.u32 [%rd5], %rd2503;
mov.u32 %r2906, 6;
sub.s32 %r2907, %r2906, %r8089;
mul.wide.s32 %rd680, %r2907, 4;
add.s64 %rd681, %rd1, %rd680;
ld.local.u32 %r8235, [%rd681];
ld.local.u32 %r8236, [%rd681+-4];
and.b32 %r92, %r8090, 31;
setp.eq.s32 %p94, %r92, 0;
@%p94 bra $L__BB0_88;
mov.b32 %r8102, %f5347;
shr.u32 %r8101, %r8102, 23;
and.b32 %r8100, %r8101, 255;
add.s32 %r8099, %r8100, -128;
shr.u32 %r8098, %r8099, 5;
mov.u32 %r8097, 4;
sub.s32 %r8096, %r8097, %r8098;
mov.u32 %r2908, 32;
sub.s32 %r2909, %r2908, %r92;
shr.u32 %r2910, %r8236, %r2909;
shl.b32 %r2911, %r8235, %r92;
add.s32 %r8235, %r2910, %r2911;
mul.wide.s32 %rd682, %r8096, 4;
add.s64 %rd683, %rd1, %rd682;
ld.local.u32 %r2912, [%rd683];
shr.u32 %r2913, %r2912, %r2909;
shl.b32 %r2914, %r8236, %r92;
add.s32 %r8236, %r2913, %r2914;
$L__BB0_88:
mov.b32 %r8095, %f5347;
and.b32 %r2915, %r8095, -2147483648;
shr.u32 %r2916, %r8236, 30;
shl.b32 %r2917, %r8235, 2;
or.b32 %r2918, %r2916, %r2917;
shr.u32 %r2919, %r2918, 31;
shr.u32 %r2920, %r8235, 30;
add.s32 %r2921, %r2919, %r2920;
neg.s32 %r2922, %r2921;
setp.eq.s32 %p95, %r2915, 0;
selp.b32 %r8237, %r2921, %r2922, %p95;
setp.ne.s32 %p96, %r2919, 0;
xor.b32 %r2923, %r2915, -2147483648;
selp.b32 %r2924, %r2923, %r2915, %p96;
selp.b32 %r2925, -1, 0, %p96;
xor.b32 %r2926, %r2918, %r2925;
shl.b32 %r2927, %r8236, 2;
xor.b32 %r2928, %r2927, %r2925;
cvt.u64.u32 %rd684, %r2926;
cvt.u64.u32 %rd685, %r2928;
bfi.b64 %rd686, %rd684, %rd685, 32, 32;
cvt.rn.f64.s64 %fd5, %rd686;
mul.f64 %fd6, %fd5, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2310, %fd6;
setp.eq.s32 %p97, %r2924, 0;
neg.f32 %f2311, %f2310;
selp.f32 %f5191, %f2310, %f2311, %p97;
$L__BB0_90:
and.b32 %r99, %r8237, 1;
setp.eq.s32 %p98, %r99, 0;
mul.rn.f32 %f69, %f5191, %f5191;
mov.f32 %f5192, 0fB94D4153;
@%p98 bra $L__BB0_92;
mov.f32 %f2314, 0fBAB607ED;
mov.f32 %f2315, 0f37CBAC00;
fma.rn.f32 %f5192, %f2315, %f69, %f2314;
$L__BB0_92:
and.b32 %r8103, %r8237, 1;
setp.eq.s32 %p1809, %r8103, 0;
selp.f32 %f5162, %f5191, 0f3F800000, %p1809;
selp.f32 %f2316, 0f3C0885E4, 0f3D2AAABB, %p1809;
fma.rn.f32 %f2317, %f5192, %f69, %f2316;
selp.f32 %f2318, 0fBE2AAAA8, 0fBEFFFFFF, %p1809;
fma.rn.f32 %f2319, %f2317, %f69, %f2318;
mov.f32 %f2320, 0f00000000;
fma.rn.f32 %f2321, %f69, %f5162, %f2320;
fma.rn.f32 %f5213, %f2319, %f2321, %f5162;
and.b32 %r2930, %r8237, 2;
setp.eq.s32 %p100, %r2930, 0;
@%p100 bra $L__BB0_94;
mov.f32 %f2323, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f2323, %f2320;
$L__BB0_94:
shl.b32 %r8088, %r12, 5;
mov.u32 %r8087, -32;
sub.s32 %r8086, %r8087, %r8088;
setp.ge.s32 %p1807, %r14, %r8086;
mov.f32 %f5215, %f5216;
@%p1807 bra $L__BB0_107;
mul.f32 %f2324, %f5339, 0f3F22F983;
cvt.rni.s32.f32 %r8241, %f2324;
cvt.rn.f32.s32 %f2325, %r8241;
mov.f32 %f2326, 0fBFC90FDA;
fma.rn.f32 %f2327, %f2325, %f2326, %f5339;
mov.f32 %f2328, 0fB3A22168;
fma.rn.f32 %f2329, %f2325, %f2328, %f2327;
mov.f32 %f2330, 0fA7C234C5;
fma.rn.f32 %f5195, %f2325, %f2330, %f2329;
abs.f32 %f77, %f5339;
setp.ltu.f32 %p102, %f77, 0f47CE4780;
@%p102 bra $L__BB0_103;
setp.eq.f32 %p103, %f77, 0f7F800000;
@%p103 bra $L__BB0_102;
bra.uni $L__BB0_97;
$L__BB0_102:
mov.f32 %f2333, 0f00000000;
mul.rn.f32 %f5195, %f5339, %f2333;
mov.u32 %r8241, 0;
bra.uni $L__BB0_103;
$L__BB0_97:
mov.b32 %r101, %f5339;
shr.u32 %r2932, %r101, 23;
and.b32 %r2933, %r2932, 255;
shl.b32 %r2934, %r101, 8;
or.b32 %r103, %r2934, -2147483648;
mov.u64 %rd2504, 0;
mov.u32 %r8238, 0;
mov.u64 %rd690, __cudart_i2opi_f;
mov.u64 %rd2505, %rd2504;
$L__BB0_98:
.pragma "nounroll";
shl.b64 %rd689, %rd2504, 2;
add.s64 %rd691, %rd690, %rd689;
ld.global.nc.u32 %r2935, [%rd691];
mad.wide.u32 %rd692, %r2935, %r103, %rd2505;
shr.u64 %rd2505, %rd692, 32;
add.s64 %rd693, %rd1, %rd689;
st.local.u32 [%rd693], %rd692;
add.s32 %r8238, %r8238, 1;
cvt.s64.s32 %rd2504, %r8238;
setp.ne.s32 %p104, %r8238, 6;
@%p104 bra $L__BB0_98;
add.s32 %r8112, %r2933, -128;
mov.b32 %r8111, %f5339;
shr.u32 %r8110, %r8111, 23;
and.b32 %r8109, %r8110, 255;
add.s32 %r8108, %r8109, -128;
shr.u32 %r8107, %r8108, 5;
st.local.u32 [%rd5], %rd2505;
mov.u32 %r2937, 6;
sub.s32 %r2938, %r2937, %r8107;
mul.wide.s32 %rd694, %r2938, 4;
add.s64 %rd695, %rd1, %rd694;
ld.local.u32 %r8239, [%rd695];
ld.local.u32 %r8240, [%rd695+-4];
and.b32 %r110, %r8108, 31;
setp.eq.s32 %p105, %r110, 0;
@%p105 bra $L__BB0_101;
mov.b32 %r8120, %f5339;
shr.u32 %r8119, %r8120, 23;
and.b32 %r8118, %r8119, 255;
add.s32 %r8117, %r8118, -128;
shr.u32 %r8116, %r8117, 5;
mov.u32 %r8115, 4;
sub.s32 %r8114, %r8115, %r8116;
mov.u32 %r2939, 32;
sub.s32 %r2940, %r2939, %r110;
shr.u32 %r2941, %r8240, %r2940;
shl.b32 %r2942, %r8239, %r110;
add.s32 %r8239, %r2941, %r2942;
mul.wide.s32 %rd696, %r8114, 4;
add.s64 %rd697, %rd1, %rd696;
ld.local.u32 %r2943, [%rd697];
shr.u32 %r2944, %r2943, %r2940;
shl.b32 %r2945, %r8240, %r110;
add.s32 %r8240, %r2944, %r2945;
$L__BB0_101:
mov.b32 %r8113, %f5339;
and.b32 %r2946, %r8113, -2147483648;
shr.u32 %r2947, %r8240, 30;
shl.b32 %r2948, %r8239, 2;
or.b32 %r2949, %r2947, %r2948;
shr.u32 %r2950, %r2949, 31;
shr.u32 %r2951, %r8239, 30;
add.s32 %r2952, %r2950, %r2951;
neg.s32 %r2953, %r2952;
setp.eq.s32 %p106, %r2946, 0;
selp.b32 %r8241, %r2952, %r2953, %p106;
setp.ne.s32 %p107, %r2950, 0;
xor.b32 %r2954, %r2946, -2147483648;
selp.b32 %r2955, %r2954, %r2946, %p107;
selp.b32 %r2956, -1, 0, %p107;
xor.b32 %r2957, %r2949, %r2956;
shl.b32 %r2958, %r8240, 2;
xor.b32 %r2959, %r2958, %r2956;
cvt.u64.u32 %rd698, %r2957;
cvt.u64.u32 %rd699, %r2959;
bfi.b64 %rd700, %rd698, %rd699, 32, 32;
cvt.rn.f64.s64 %fd7, %rd700;
mul.f64 %fd8, %fd7, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2331, %fd8;
setp.eq.s32 %p108, %r2955, 0;
neg.f32 %f2332, %f2331;
selp.f32 %f5195, %f2331, %f2332, %p108;
$L__BB0_103:
add.s32 %r117, %r8241, 1;
and.b32 %r118, %r117, 1;
setp.eq.s32 %p109, %r118, 0;
selp.f32 %f81, %f5195, 0f3F800000, %p109;
mul.rn.f32 %f82, %f5195, %f5195;
mov.f32 %f5196, 0fB94D4153;
@%p109 bra $L__BB0_105;
mov.f32 %f2335, 0fBAB607ED;
mov.f32 %f2336, 0f37CBAC00;
fma.rn.f32 %f5196, %f2336, %f82, %f2335;
$L__BB0_105:
add.s32 %r8123, %r8241, 1;
add.s32 %r8122, %r8241, 1;
and.b32 %r8121, %r8122, 1;
setp.eq.s32 %p1811, %r8121, 0;
selp.f32 %f2337, 0f3C0885E4, 0f3D2AAABB, %p1811;
fma.rn.f32 %f2338, %f5196, %f82, %f2337;
selp.f32 %f2339, 0fBE2AAAA8, 0fBEFFFFFF, %p1811;
fma.rn.f32 %f2340, %f2338, %f82, %f2339;
mov.f32 %f2341, 0f00000000;
fma.rn.f32 %f2342, %f82, %f81, %f2341;
fma.rn.f32 %f5215, %f2340, %f2342, %f81;
and.b32 %r2961, %r8122, 2;
setp.eq.s32 %p111, %r2961, 0;
@%p111 bra $L__BB0_107;
mov.f32 %f2344, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f2344, %f2341;
$L__BB0_107:
shl.b32 %r8106, %r12, 5;
mov.u32 %r8105, -32;
sub.s32 %r8104, %r8105, %r8106;
setp.lt.s32 %p1810, %r14, %r8104;
setp.ge.s32 %p1808, %r14, %r81;
selp.f32 %f89, %f5215, %f5216, %p1810;
selp.f32 %f90, %f5213, %f5214, %p1810;
@%p1808 bra $L__BB0_109;
add.f32 %f5331, %f90, %f89;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_109:
not.b32 %r8184, %r12;
add.s32 %r8183, %r13, -15;
setp.ge.s32 %p1817, %r8183, %r8184;
@%p1817 bra $L__BB0_138;
shl.b32 %r2964, %r12, 5;
neg.s32 %r119, %r2964;
setp.ge.s32 %p115, %r14, %r119;
@%p115 bra $L__BB0_123;
mul.f32 %f2347, %f5346, 0f3F22F983;
cvt.rni.s32.f32 %r8245, %f2347;
cvt.rn.f32.s32 %f2348, %r8245;
mov.f32 %f2349, 0fBFC90FDA;
fma.rn.f32 %f2350, %f2348, %f2349, %f5346;
mov.f32 %f2351, 0fB3A22168;
fma.rn.f32 %f2352, %f2348, %f2351, %f2350;
mov.f32 %f2353, 0fA7C234C5;
fma.rn.f32 %f5204, %f2348, %f2353, %f2352;
abs.f32 %f98, %f5346;
setp.ltu.f32 %p116, %f98, 0f47CE4780;
@%p116 bra $L__BB0_119;
setp.eq.f32 %p117, %f98, 0f7F800000;
@%p117 bra $L__BB0_118;
bra.uni $L__BB0_113;
$L__BB0_118:
mov.f32 %f2356, 0f00000000;
mul.rn.f32 %f5204, %f5346, %f2356;
mov.u32 %r8245, 0;
bra.uni $L__BB0_119;
$L__BB0_113:
mov.b32 %r121, %f5346;
shr.u32 %r2966, %r121, 23;
and.b32 %r2967, %r2966, 255;
shl.b32 %r2968, %r121, 8;
or.b32 %r123, %r2968, -2147483648;
mov.u64 %rd2506, 0;
mov.u32 %r8242, 0;
mov.u64 %rd704, __cudart_i2opi_f;
mov.u64 %rd2507, %rd2506;
$L__BB0_114:
.pragma "nounroll";
shl.b64 %rd703, %rd2506, 2;
add.s64 %rd705, %rd704, %rd703;
ld.global.nc.u32 %r2969, [%rd705];
mad.wide.u32 %rd706, %r2969, %r123, %rd2507;
shr.u64 %rd2507, %rd706, 32;
add.s64 %rd707, %rd1, %rd703;
st.local.u32 [%rd707], %rd706;
add.s32 %r8242, %r8242, 1;
cvt.s64.s32 %rd2506, %r8242;
setp.ne.s32 %p118, %r8242, 6;
@%p118 bra $L__BB0_114;
add.s32 %r8129, %r2967, -128;
mov.b32 %r8128, %f5346;
shr.u32 %r8127, %r8128, 23;
and.b32 %r8126, %r8127, 255;
add.s32 %r8125, %r8126, -128;
shr.u32 %r8124, %r8125, 5;
st.local.u32 [%rd5], %rd2507;
mov.u32 %r2971, 6;
sub.s32 %r2972, %r2971, %r8124;
mul.wide.s32 %rd708, %r2972, 4;
add.s64 %rd709, %rd1, %rd708;
ld.local.u32 %r8243, [%rd709];
ld.local.u32 %r8244, [%rd709+-4];
and.b32 %r130, %r8125, 31;
setp.eq.s32 %p119, %r130, 0;
@%p119 bra $L__BB0_117;
mov.b32 %r8141, %f5346;
shr.u32 %r8140, %r8141, 23;
and.b32 %r8139, %r8140, 255;
add.s32 %r8138, %r8139, -128;
shr.u32 %r8137, %r8138, 5;
mov.u32 %r8136, 4;
sub.s32 %r8135, %r8136, %r8137;
mov.u32 %r2973, 32;
sub.s32 %r2974, %r2973, %r130;
shr.u32 %r2975, %r8244, %r2974;
shl.b32 %r2976, %r8243, %r130;
add.s32 %r8243, %r2975, %r2976;
mul.wide.s32 %rd710, %r8135, 4;
add.s64 %rd711, %rd1, %rd710;
ld.local.u32 %r2977, [%rd711];
shr.u32 %r2978, %r2977, %r2974;
shl.b32 %r2979, %r8244, %r130;
add.s32 %r8244, %r2978, %r2979;
$L__BB0_117:
mov.b32 %r8130, %f5346;
and.b32 %r2980, %r8130, -2147483648;
shr.u32 %r2981, %r8244, 30;
shl.b32 %r2982, %r8243, 2;
or.b32 %r2983, %r2981, %r2982;
shr.u32 %r2984, %r2983, 31;
shr.u32 %r2985, %r8243, 30;
add.s32 %r2986, %r2984, %r2985;
neg.s32 %r2987, %r2986;
setp.eq.s32 %p120, %r2980, 0;
selp.b32 %r8245, %r2986, %r2987, %p120;
setp.ne.s32 %p121, %r2984, 0;
xor.b32 %r2988, %r2980, -2147483648;
selp.b32 %r2989, %r2988, %r2980, %p121;
selp.b32 %r2990, -1, 0, %p121;
xor.b32 %r2991, %r2983, %r2990;
shl.b32 %r2992, %r8244, 2;
xor.b32 %r2993, %r2992, %r2990;
cvt.u64.u32 %rd712, %r2991;
cvt.u64.u32 %rd713, %r2993;
bfi.b64 %rd714, %rd712, %rd713, 32, 32;
cvt.rn.f64.s64 %fd9, %rd714;
mul.f64 %fd10, %fd9, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2354, %fd10;
setp.eq.s32 %p122, %r2989, 0;
neg.f32 %f2355, %f2354;
selp.f32 %f5204, %f2354, %f2355, %p122;
$L__BB0_119:
and.b32 %r137, %r8245, 1;
setp.eq.s32 %p123, %r137, 0;
mul.rn.f32 %f103, %f5204, %f5204;
mov.f32 %f5205, 0fB94D4153;
@%p123 bra $L__BB0_121;
mov.f32 %f2358, 0fBAB607ED;
mov.f32 %f2359, 0f37CBAC00;
fma.rn.f32 %f5205, %f2359, %f103, %f2358;
$L__BB0_121:
selp.f32 %f5163, %f5204, 0f3F800000, %p123;
selp.f32 %f2360, 0f3C0885E4, 0f3D2AAABB, %p123;
fma.rn.f32 %f2361, %f5205, %f103, %f2360;
selp.f32 %f2362, 0fBE2AAAA8, 0fBEFFFFFF, %p123;
fma.rn.f32 %f2363, %f2361, %f103, %f2362;
mov.f32 %f2364, 0f00000000;
fma.rn.f32 %f2365, %f103, %f5163, %f2364;
fma.rn.f32 %f5213, %f2363, %f2365, %f5163;
and.b32 %r2995, %r8245, 2;
setp.eq.s32 %p125, %r2995, 0;
@%p125 bra $L__BB0_123;
mov.f32 %f2367, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f2367, %f2364;
$L__BB0_123:
shl.b32 %r8198, %r12, 5;
neg.s32 %r8197, %r8198;
setp.ge.s32 %p1822, %r14, %r8197;
@%p1822 bra $L__BB0_136;
mul.f32 %f2368, %f5338, 0f3F22F983;
cvt.rni.s32.f32 %r8249, %f2368;
cvt.rn.f32.s32 %f2369, %r8249;
mov.f32 %f2370, 0fBFC90FDA;
fma.rn.f32 %f2371, %f2369, %f2370, %f5338;
mov.f32 %f2372, 0fB3A22168;
fma.rn.f32 %f2373, %f2369, %f2372, %f2371;
mov.f32 %f2374, 0fA7C234C5;
fma.rn.f32 %f5208, %f2369, %f2374, %f2373;
abs.f32 %f111, %f5338;
setp.ltu.f32 %p127, %f111, 0f47CE4780;
@%p127 bra $L__BB0_132;
setp.eq.f32 %p128, %f111, 0f7F800000;
@%p128 bra $L__BB0_131;
bra.uni $L__BB0_126;
$L__BB0_131:
mov.f32 %f2377, 0f00000000;
mul.rn.f32 %f5208, %f5338, %f2377;
mov.u32 %r8249, 0;
bra.uni $L__BB0_132;
$L__BB0_126:
mov.b32 %r139, %f5338;
shr.u32 %r2997, %r139, 23;
and.b32 %r2998, %r2997, 255;
shl.b32 %r2999, %r139, 8;
or.b32 %r141, %r2999, -2147483648;
mov.u64 %rd2508, 0;
mov.u32 %r8246, 0;
mov.u64 %rd718, __cudart_i2opi_f;
mov.u64 %rd2509, %rd2508;
$L__BB0_127:
.pragma "nounroll";
shl.b64 %rd717, %rd2508, 2;
add.s64 %rd719, %rd718, %rd717;
ld.global.nc.u32 %r3000, [%rd719];
mad.wide.u32 %rd720, %r3000, %r141, %rd2509;
shr.u64 %rd2509, %rd720, 32;
add.s64 %rd721, %rd1, %rd717;
st.local.u32 [%rd721], %rd720;
add.s32 %r8246, %r8246, 1;
cvt.s64.s32 %rd2508, %r8246;
setp.ne.s32 %p129, %r8246, 6;
@%p129 bra $L__BB0_127;
add.s32 %r8147, %r2998, -128;
mov.b32 %r8146, %f5338;
shr.u32 %r8145, %r8146, 23;
and.b32 %r8144, %r8145, 255;
add.s32 %r8143, %r8144, -128;
shr.u32 %r8142, %r8143, 5;
st.local.u32 [%rd5], %rd2509;
mov.u32 %r3002, 6;
sub.s32 %r3003, %r3002, %r8142;
mul.wide.s32 %rd722, %r3003, 4;
add.s64 %rd723, %rd1, %rd722;
ld.local.u32 %r8247, [%rd723];
ld.local.u32 %r8248, [%rd723+-4];
and.b32 %r148, %r8143, 31;
setp.eq.s32 %p130, %r148, 0;
@%p130 bra $L__BB0_130;
mov.b32 %r8159, %f5338;
shr.u32 %r8158, %r8159, 23;
and.b32 %r8157, %r8158, 255;
add.s32 %r8156, %r8157, -128;
shr.u32 %r8155, %r8156, 5;
mov.u32 %r8154, 4;
sub.s32 %r8153, %r8154, %r8155;
mov.u32 %r3004, 32;
sub.s32 %r3005, %r3004, %r148;
shr.u32 %r3006, %r8248, %r3005;
shl.b32 %r3007, %r8247, %r148;
add.s32 %r8247, %r3006, %r3007;
mul.wide.s32 %rd724, %r8153, 4;
add.s64 %rd725, %rd1, %rd724;
ld.local.u32 %r3008, [%rd725];
shr.u32 %r3009, %r3008, %r3005;
shl.b32 %r3010, %r8248, %r148;
add.s32 %r8248, %r3009, %r3010;
$L__BB0_130:
mov.b32 %r8148, %f5338;
and.b32 %r3011, %r8148, -2147483648;
shr.u32 %r3012, %r8248, 30;
shl.b32 %r3013, %r8247, 2;
or.b32 %r3014, %r3012, %r3013;
shr.u32 %r3015, %r3014, 31;
shr.u32 %r3016, %r8247, 30;
add.s32 %r3017, %r3015, %r3016;
neg.s32 %r3018, %r3017;
setp.eq.s32 %p131, %r3011, 0;
selp.b32 %r8249, %r3017, %r3018, %p131;
setp.ne.s32 %p132, %r3015, 0;
xor.b32 %r3019, %r3011, -2147483648;
selp.b32 %r3020, %r3019, %r3011, %p132;
selp.b32 %r3021, -1, 0, %p132;
xor.b32 %r3022, %r3014, %r3021;
shl.b32 %r3023, %r8248, 2;
xor.b32 %r3024, %r3023, %r3021;
cvt.u64.u32 %rd726, %r3022;
cvt.u64.u32 %rd727, %r3024;
bfi.b64 %rd728, %rd726, %rd727, 32, 32;
cvt.rn.f64.s64 %fd11, %rd728;
mul.f64 %fd12, %fd11, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2375, %fd12;
setp.eq.s32 %p133, %r3020, 0;
neg.f32 %f2376, %f2375;
selp.f32 %f5208, %f2375, %f2376, %p133;
$L__BB0_132:
add.s32 %r155, %r8249, 1;
and.b32 %r156, %r155, 1;
setp.eq.s32 %p134, %r156, 0;
selp.f32 %f115, %f5208, 0f3F800000, %p134;
mul.rn.f32 %f116, %f5208, %f5208;
mov.f32 %f5209, 0fB94D4153;
@%p134 bra $L__BB0_134;
mov.f32 %f2379, 0fBAB607ED;
mov.f32 %f2380, 0f37CBAC00;
fma.rn.f32 %f5209, %f2380, %f116, %f2379;
$L__BB0_134:
selp.f32 %f2381, 0f3C0885E4, 0f3D2AAABB, %p134;
fma.rn.f32 %f2382, %f5209, %f116, %f2381;
selp.f32 %f2383, 0fBE2AAAA8, 0fBEFFFFFF, %p134;
fma.rn.f32 %f2384, %f2382, %f116, %f2383;
mov.f32 %f2385, 0f00000000;
fma.rn.f32 %f2386, %f116, %f115, %f2385;
fma.rn.f32 %f5215, %f2384, %f2386, %f115;
and.b32 %r3026, %r155, 2;
setp.eq.s32 %p136, %r3026, 0;
@%p136 bra $L__BB0_136;
mov.f32 %f2388, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f2388, %f2385;
$L__BB0_136:
shl.b32 %r8152, %r12, 5;
neg.s32 %r8151, %r8152;
setp.lt.s32 %p1814, %r14, %r8151;
shl.b32 %r8150, %r12, 5;
neg.s32 %r8149, %r8150;
setp.ge.s32 %p1813, %r14, %r8149;
selp.f32 %f123, %f5215, %f5216, %p1814;
selp.f32 %f124, %f5213, %f5214, %p1814;
@%p1813 bra $L__BB0_138;
add.f32 %f5330, %f124, %f123;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_138:
not.b32 %r8134, %r12;
add.s32 %r8133, %r13, -15;
setp.ge.s32 %p1812, %r8133, %r8134;
@%p1812 bra $L__BB0_167;
shl.b32 %r3029, %r12, 5;
mov.u32 %r3030, -32;
sub.s32 %r157, %r3030, %r3029;
setp.ge.s32 %p140, %r14, %r157;
@%p140 bra $L__BB0_152;
mul.f32 %f2391, %f5345, 0f3F22F983;
cvt.rni.s32.f32 %r8253, %f2391;
cvt.rn.f32.s32 %f2392, %r8253;
mov.f32 %f2393, 0fBFC90FDA;
fma.rn.f32 %f2394, %f2392, %f2393, %f5345;
mov.f32 %f2395, 0fB3A22168;
fma.rn.f32 %f2396, %f2392, %f2395, %f2394;
mov.f32 %f2397, 0fA7C234C5;
fma.rn.f32 %f5217, %f2392, %f2397, %f2396;
abs.f32 %f132, %f5345;
setp.ltu.f32 %p141, %f132, 0f47CE4780;
@%p141 bra $L__BB0_148;
setp.eq.f32 %p142, %f132, 0f7F800000;
@%p142 bra $L__BB0_147;
bra.uni $L__BB0_142;
$L__BB0_147:
mov.f32 %f2400, 0f00000000;
mul.rn.f32 %f5217, %f5345, %f2400;
mov.u32 %r8253, 0;
bra.uni $L__BB0_148;
$L__BB0_142:
mov.b32 %r159, %f5345;
shr.u32 %r3032, %r159, 23;
and.b32 %r3033, %r3032, 255;
shl.b32 %r3034, %r159, 8;
or.b32 %r161, %r3034, -2147483648;
mov.u64 %rd2510, 0;
mov.u32 %r8250, 0;
mov.u64 %rd732, __cudart_i2opi_f;
mov.u64 %rd2511, %rd2510;
$L__BB0_143:
.pragma "nounroll";
shl.b64 %rd731, %rd2510, 2;
add.s64 %rd733, %rd732, %rd731;
ld.global.nc.u32 %r3035, [%rd733];
mad.wide.u32 %rd734, %r3035, %r161, %rd2511;
shr.u64 %rd2511, %rd734, 32;
add.s64 %rd735, %rd1, %rd731;
st.local.u32 [%rd735], %rd734;
add.s32 %r8250, %r8250, 1;
cvt.s64.s32 %rd2510, %r8250;
setp.ne.s32 %p143, %r8250, 6;
@%p143 bra $L__BB0_143;
add.s32 %r8165, %r3033, -128;
mov.b32 %r8164, %f5345;
shr.u32 %r8163, %r8164, 23;
and.b32 %r8162, %r8163, 255;
add.s32 %r8161, %r8162, -128;
shr.u32 %r8160, %r8161, 5;
st.local.u32 [%rd5], %rd2511;
mov.u32 %r3036, 4;
sub.s32 %r165, %r3036, %r8160;
mov.u32 %r3037, 6;
sub.s32 %r3038, %r3037, %r8160;
mul.wide.s32 %rd736, %r3038, 4;
add.s64 %rd737, %rd1, %rd736;
ld.local.u32 %r8251, [%rd737];
ld.local.u32 %r8252, [%rd737+-4];
and.b32 %r168, %r8161, 31;
setp.eq.s32 %p144, %r168, 0;
@%p144 bra $L__BB0_146;
mov.u32 %r3039, 32;
sub.s32 %r3040, %r3039, %r168;
shr.u32 %r3041, %r8252, %r3040;
shl.b32 %r3042, %r8251, %r168;
add.s32 %r8251, %r3041, %r3042;
mul.wide.s32 %rd738, %r165, 4;
add.s64 %rd739, %rd1, %rd738;
ld.local.u32 %r3043, [%rd739];
shr.u32 %r3044, %r3043, %r3040;
shl.b32 %r3045, %r8252, %r168;
add.s32 %r8252, %r3044, %r3045;
$L__BB0_146:
mov.b32 %r8166, %f5345;
and.b32 %r3046, %r8166, -2147483648;
shr.u32 %r3047, %r8252, 30;
shl.b32 %r3048, %r8251, 2;
or.b32 %r3049, %r3047, %r3048;
shr.u32 %r3050, %r3049, 31;
shr.u32 %r3051, %r8251, 30;
add.s32 %r3052, %r3050, %r3051;
neg.s32 %r3053, %r3052;
setp.eq.s32 %p145, %r3046, 0;
selp.b32 %r8253, %r3052, %r3053, %p145;
setp.ne.s32 %p146, %r3050, 0;
xor.b32 %r3054, %r3046, -2147483648;
selp.b32 %r3055, %r3054, %r3046, %p146;
selp.b32 %r3056, -1, 0, %p146;
xor.b32 %r3057, %r3049, %r3056;
shl.b32 %r3058, %r8252, 2;
xor.b32 %r3059, %r3058, %r3056;
cvt.u64.u32 %rd740, %r3057;
cvt.u64.u32 %rd741, %r3059;
bfi.b64 %rd742, %rd740, %rd741, 32, 32;
cvt.rn.f64.s64 %fd13, %rd742;
mul.f64 %fd14, %fd13, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2398, %fd14;
setp.eq.s32 %p147, %r3055, 0;
neg.f32 %f2399, %f2398;
selp.f32 %f5217, %f2398, %f2399, %p147;
$L__BB0_148:
and.b32 %r175, %r8253, 1;
setp.eq.s32 %p148, %r175, 0;
selp.f32 %f136, %f5217, 0f3F800000, %p148;
mul.rn.f32 %f137, %f5217, %f5217;
mov.f32 %f5218, 0fB94D4153;
@%p148 bra $L__BB0_150;
mov.f32 %f2402, 0fBAB607ED;
mov.f32 %f2403, 0f37CBAC00;
fma.rn.f32 %f5218, %f2403, %f137, %f2402;
$L__BB0_150:
selp.f32 %f2404, 0f3C0885E4, 0f3D2AAABB, %p148;
fma.rn.f32 %f2405, %f5218, %f137, %f2404;
selp.f32 %f2406, 0fBE2AAAA8, 0fBEFFFFFF, %p148;
fma.rn.f32 %f2407, %f2405, %f137, %f2406;
mov.f32 %f2408, 0f00000000;
fma.rn.f32 %f2409, %f137, %f136, %f2408;
fma.rn.f32 %f5213, %f2407, %f2409, %f136;
and.b32 %r3061, %r8253, 2;
setp.eq.s32 %p150, %r3061, 0;
@%p150 bra $L__BB0_152;
mov.f32 %f2411, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f2411, %f2408;
$L__BB0_152:
shl.b32 %r8201, %r12, 5;
mov.u32 %r8200, -32;
sub.s32 %r8199, %r8200, %r8201;
setp.ge.s32 %p1823, %r14, %r8199;
@%p1823 bra $L__BB0_165;
mul.f32 %f2412, %f5337, 0f3F22F983;
cvt.rni.s32.f32 %r8257, %f2412;
cvt.rn.f32.s32 %f2413, %r8257;
mov.f32 %f2414, 0fBFC90FDA;
fma.rn.f32 %f2415, %f2413, %f2414, %f5337;
mov.f32 %f2416, 0fB3A22168;
fma.rn.f32 %f2417, %f2413, %f2416, %f2415;
mov.f32 %f2418, 0fA7C234C5;
fma.rn.f32 %f5221, %f2413, %f2418, %f2417;
abs.f32 %f145, %f5337;
setp.ltu.f32 %p152, %f145, 0f47CE4780;
@%p152 bra $L__BB0_161;
setp.eq.f32 %p153, %f145, 0f7F800000;
@%p153 bra $L__BB0_160;
bra.uni $L__BB0_155;
$L__BB0_160:
mov.f32 %f2421, 0f00000000;
mul.rn.f32 %f5221, %f5337, %f2421;
mov.u32 %r8257, 0;
bra.uni $L__BB0_161;
$L__BB0_155:
mov.b32 %r177, %f5337;
shr.u32 %r3063, %r177, 23;
and.b32 %r3064, %r3063, 255;
shl.b32 %r3065, %r177, 8;
or.b32 %r179, %r3065, -2147483648;
mov.u64 %rd2512, 0;
mov.u32 %r8254, 0;
mov.u64 %rd746, __cudart_i2opi_f;
mov.u64 %rd2513, %rd2512;
$L__BB0_156:
.pragma "nounroll";
shl.b64 %rd745, %rd2512, 2;
add.s64 %rd747, %rd746, %rd745;
ld.global.nc.u32 %r3066, [%rd747];
mad.wide.u32 %rd748, %r3066, %r179, %rd2513;
shr.u64 %rd2513, %rd748, 32;
add.s64 %rd749, %rd1, %rd745;
st.local.u32 [%rd749], %rd748;
add.s32 %r8254, %r8254, 1;
cvt.s64.s32 %rd2512, %r8254;
setp.ne.s32 %p154, %r8254, 6;
@%p154 bra $L__BB0_156;
add.s32 %r8175, %r3064, -128;
mov.b32 %r8174, %f5337;
shr.u32 %r8173, %r8174, 23;
and.b32 %r8172, %r8173, 255;
add.s32 %r8171, %r8172, -128;
shr.u32 %r8170, %r8171, 5;
st.local.u32 [%rd5], %rd2513;
mov.u32 %r3067, 4;
sub.s32 %r183, %r3067, %r8170;
mov.u32 %r3068, 6;
sub.s32 %r3069, %r3068, %r8170;
mul.wide.s32 %rd750, %r3069, 4;
add.s64 %rd751, %rd1, %rd750;
ld.local.u32 %r8255, [%rd751];
ld.local.u32 %r8256, [%rd751+-4];
and.b32 %r186, %r8171, 31;
setp.eq.s32 %p155, %r186, 0;
@%p155 bra $L__BB0_159;
mov.u32 %r3070, 32;
sub.s32 %r3071, %r3070, %r186;
shr.u32 %r3072, %r8256, %r3071;
shl.b32 %r3073, %r8255, %r186;
add.s32 %r8255, %r3072, %r3073;
mul.wide.s32 %rd752, %r183, 4;
add.s64 %rd753, %rd1, %rd752;
ld.local.u32 %r3074, [%rd753];
shr.u32 %r3075, %r3074, %r3071;
shl.b32 %r3076, %r8256, %r186;
add.s32 %r8256, %r3075, %r3076;
$L__BB0_159:
mov.b32 %r8176, %f5337;
and.b32 %r3077, %r8176, -2147483648;
shr.u32 %r3078, %r8256, 30;
shl.b32 %r3079, %r8255, 2;
or.b32 %r3080, %r3078, %r3079;
shr.u32 %r3081, %r3080, 31;
shr.u32 %r3082, %r8255, 30;
add.s32 %r3083, %r3081, %r3082;
neg.s32 %r3084, %r3083;
setp.eq.s32 %p156, %r3077, 0;
selp.b32 %r8257, %r3083, %r3084, %p156;
setp.ne.s32 %p157, %r3081, 0;
xor.b32 %r3085, %r3077, -2147483648;
selp.b32 %r3086, %r3085, %r3077, %p157;
selp.b32 %r3087, -1, 0, %p157;
xor.b32 %r3088, %r3080, %r3087;
shl.b32 %r3089, %r8256, 2;
xor.b32 %r3090, %r3089, %r3087;
cvt.u64.u32 %rd754, %r3088;
cvt.u64.u32 %rd755, %r3090;
bfi.b64 %rd756, %rd754, %rd755, 32, 32;
cvt.rn.f64.s64 %fd15, %rd756;
mul.f64 %fd16, %fd15, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2419, %fd16;
setp.eq.s32 %p158, %r3086, 0;
neg.f32 %f2420, %f2419;
selp.f32 %f5221, %f2419, %f2420, %p158;
$L__BB0_161:
add.s32 %r193, %r8257, 1;
and.b32 %r194, %r193, 1;
setp.eq.s32 %p159, %r194, 0;
selp.f32 %f149, %f5221, 0f3F800000, %p159;
mul.rn.f32 %f150, %f5221, %f5221;
mov.f32 %f5222, 0fB94D4153;
@%p159 bra $L__BB0_163;
mov.f32 %f2423, 0fBAB607ED;
mov.f32 %f2424, 0f37CBAC00;
fma.rn.f32 %f5222, %f2424, %f150, %f2423;
$L__BB0_163:
selp.f32 %f2425, 0f3C0885E4, 0f3D2AAABB, %p159;
fma.rn.f32 %f2426, %f5222, %f150, %f2425;
selp.f32 %f2427, 0fBE2AAAA8, 0fBEFFFFFF, %p159;
fma.rn.f32 %f2428, %f2426, %f150, %f2427;
mov.f32 %f2429, 0f00000000;
fma.rn.f32 %f2430, %f150, %f149, %f2429;
fma.rn.f32 %f5215, %f2428, %f2430, %f149;
and.b32 %r3092, %r193, 2;
setp.eq.s32 %p161, %r3092, 0;
@%p161 bra $L__BB0_165;
mov.f32 %f2432, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f2432, %f2429;
$L__BB0_165:
shl.b32 %r8182, %r12, 5;
mov.u32 %r8181, -32;
sub.s32 %r8180, %r8181, %r8182;
setp.lt.s32 %p1816, %r14, %r8180;
shl.b32 %r8179, %r12, 5;
mov.u32 %r8178, -32;
sub.s32 %r8177, %r8178, %r8179;
setp.ge.s32 %p1815, %r14, %r8177;
selp.f32 %f157, %f5215, %f5216, %p1816;
selp.f32 %f158, %f5213, %f5214, %p1816;
@%p1815 bra $L__BB0_167;
add.f32 %f5329, %f158, %f157;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_167:
add.s32 %r8049, %r13, -15;
mov.u32 %r3093, -2;
sub.s32 %r3094, %r3093, %r12;
setp.ge.s32 %p164, %r8049, %r3094;
@%p164 bra $L__BB0_196;
shl.b32 %r3096, %r12, 5;
neg.s32 %r195, %r3096;
setp.ge.s32 %p165, %r14, %r195;
@%p165 bra $L__BB0_181;
mul.f32 %f2435, %f5344, 0f3F22F983;
cvt.rni.s32.f32 %r8261, %f2435;
cvt.rn.f32.s32 %f2436, %r8261;
mov.f32 %f2437, 0fBFC90FDA;
fma.rn.f32 %f2438, %f2436, %f2437, %f5344;
mov.f32 %f2439, 0fB3A22168;
fma.rn.f32 %f2440, %f2436, %f2439, %f2438;
mov.f32 %f2441, 0fA7C234C5;
fma.rn.f32 %f5230, %f2436, %f2441, %f2440;
abs.f32 %f166, %f5344;
setp.ltu.f32 %p166, %f166, 0f47CE4780;
@%p166 bra $L__BB0_177;
setp.eq.f32 %p167, %f166, 0f7F800000;
@%p167 bra $L__BB0_176;
bra.uni $L__BB0_171;
$L__BB0_176:
mov.f32 %f2444, 0f00000000;
mul.rn.f32 %f5230, %f5344, %f2444;
mov.u32 %r8261, 0;
bra.uni $L__BB0_177;
$L__BB0_171:
mov.b32 %r197, %f5344;
shr.u32 %r3098, %r197, 23;
and.b32 %r3099, %r3098, 255;
shl.b32 %r3100, %r197, 8;
or.b32 %r199, %r3100, -2147483648;
mov.u64 %rd2514, 0;
mov.u32 %r8258, 0;
mov.u64 %rd760, __cudart_i2opi_f;
mov.u64 %rd2515, %rd2514;
$L__BB0_172:
.pragma "nounroll";
shl.b64 %rd759, %rd2514, 2;
add.s64 %rd761, %rd760, %rd759;
ld.global.nc.u32 %r3101, [%rd761];
mad.wide.u32 %rd762, %r3101, %r199, %rd2515;
shr.u64 %rd2515, %rd762, 32;
add.s64 %rd763, %rd1, %rd759;
st.local.u32 [%rd763], %rd762;
add.s32 %r8258, %r8258, 1;
cvt.s64.s32 %rd2514, %r8258;
setp.ne.s32 %p168, %r8258, 6;
@%p168 bra $L__BB0_172;
add.s32 %r8207, %r3099, -128;
mov.b32 %r8206, %f5344;
shr.u32 %r8205, %r8206, 23;
and.b32 %r8204, %r8205, 255;
add.s32 %r8203, %r8204, -128;
shr.u32 %r8202, %r8203, 5;
st.local.u32 [%rd5], %rd2515;
mov.u32 %r3102, 4;
sub.s32 %r203, %r3102, %r8202;
mov.u32 %r3103, 6;
sub.s32 %r3104, %r3103, %r8202;
mul.wide.s32 %rd764, %r3104, 4;
add.s64 %rd765, %rd1, %rd764;
ld.local.u32 %r8259, [%rd765];
ld.local.u32 %r8260, [%rd765+-4];
and.b32 %r206, %r8203, 31;
setp.eq.s32 %p169, %r206, 0;
@%p169 bra $L__BB0_175;
mov.u32 %r3105, 32;
sub.s32 %r3106, %r3105, %r206;
shr.u32 %r3107, %r8260, %r3106;
shl.b32 %r3108, %r8259, %r206;
add.s32 %r8259, %r3107, %r3108;
mul.wide.s32 %rd766, %r203, 4;
add.s64 %rd767, %rd1, %rd766;
ld.local.u32 %r3109, [%rd767];
shr.u32 %r3110, %r3109, %r3106;
shl.b32 %r3111, %r8260, %r206;
add.s32 %r8260, %r3110, %r3111;
$L__BB0_175:
mov.b32 %r8208, %f5344;
and.b32 %r3112, %r8208, -2147483648;
shr.u32 %r3113, %r8260, 30;
shl.b32 %r3114, %r8259, 2;
or.b32 %r3115, %r3113, %r3114;
shr.u32 %r3116, %r3115, 31;
shr.u32 %r3117, %r8259, 30;
add.s32 %r3118, %r3116, %r3117;
neg.s32 %r3119, %r3118;
setp.eq.s32 %p170, %r3112, 0;
selp.b32 %r8261, %r3118, %r3119, %p170;
setp.ne.s32 %p171, %r3116, 0;
xor.b32 %r3120, %r3112, -2147483648;
selp.b32 %r3121, %r3120, %r3112, %p171;
selp.b32 %r3122, -1, 0, %p171;
xor.b32 %r3123, %r3115, %r3122;
shl.b32 %r3124, %r8260, 2;
xor.b32 %r3125, %r3124, %r3122;
cvt.u64.u32 %rd768, %r3123;
cvt.u64.u32 %rd769, %r3125;
bfi.b64 %rd770, %rd768, %rd769, 32, 32;
cvt.rn.f64.s64 %fd17, %rd770;
mul.f64 %fd18, %fd17, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2442, %fd18;
setp.eq.s32 %p172, %r3121, 0;
neg.f32 %f2443, %f2442;
selp.f32 %f5230, %f2442, %f2443, %p172;
$L__BB0_177:
and.b32 %r213, %r8261, 1;
setp.eq.s32 %p173, %r213, 0;
selp.f32 %f170, %f5230, 0f3F800000, %p173;
mul.rn.f32 %f171, %f5230, %f5230;
mov.f32 %f5231, 0fB94D4153;
@%p173 bra $L__BB0_179;
mov.f32 %f2446, 0fBAB607ED;
mov.f32 %f2447, 0f37CBAC00;
fma.rn.f32 %f5231, %f2447, %f171, %f2446;
$L__BB0_179:
selp.f32 %f2448, 0f3C0885E4, 0f3D2AAABB, %p173;
fma.rn.f32 %f2449, %f5231, %f171, %f2448;
selp.f32 %f2450, 0fBE2AAAA8, 0fBEFFFFFF, %p173;
fma.rn.f32 %f2451, %f2449, %f171, %f2450;
mov.f32 %f2452, 0f00000000;
fma.rn.f32 %f2453, %f171, %f170, %f2452;
fma.rn.f32 %f5213, %f2451, %f2453, %f170;
and.b32 %r3127, %r8261, 2;
setp.eq.s32 %p175, %r3127, 0;
@%p175 bra $L__BB0_181;
mov.f32 %f2455, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f2455, %f2452;
$L__BB0_181:
shl.b32 %r8210, %r12, 5;
neg.s32 %r8209, %r8210;
setp.lt.s32 %p4, %r14, %r8209;
@%p165 bra $L__BB0_194;
mul.f32 %f2456, %f5336, 0f3F22F983;
cvt.rni.s32.f32 %r8265, %f2456;
cvt.rn.f32.s32 %f2457, %r8265;
mov.f32 %f2458, 0fBFC90FDA;
fma.rn.f32 %f2459, %f2457, %f2458, %f5336;
mov.f32 %f2460, 0fB3A22168;
fma.rn.f32 %f2461, %f2457, %f2460, %f2459;
mov.f32 %f2462, 0fA7C234C5;
fma.rn.f32 %f5234, %f2457, %f2462, %f2461;
abs.f32 %f179, %f5336;
setp.ltu.f32 %p177, %f179, 0f47CE4780;
@%p177 bra $L__BB0_190;
setp.eq.f32 %p178, %f179, 0f7F800000;
@%p178 bra $L__BB0_189;
bra.uni $L__BB0_184;
$L__BB0_189:
mov.f32 %f2465, 0f00000000;
mul.rn.f32 %f5234, %f5336, %f2465;
mov.u32 %r8265, 0;
bra.uni $L__BB0_190;
$L__BB0_184:
mov.b32 %r215, %f5336;
shr.u32 %r3129, %r215, 23;
and.b32 %r3130, %r3129, 255;
add.s32 %r216, %r3130, -128;
shl.b32 %r3131, %r215, 8;
or.b32 %r217, %r3131, -2147483648;
shr.u32 %r218, %r216, 5;
mov.u64 %rd2516, 0;
mov.u32 %r8262, 0;
mov.u64 %rd774, __cudart_i2opi_f;
mov.u64 %rd2517, %rd2516;
$L__BB0_185:
.pragma "nounroll";
shl.b64 %rd773, %rd2516, 2;
add.s64 %rd775, %rd774, %rd773;
ld.global.nc.u32 %r3132, [%rd775];
mad.wide.u32 %rd776, %r3132, %r217, %rd2517;
shr.u64 %rd2517, %rd776, 32;
add.s64 %rd777, %rd1, %rd773;
st.local.u32 [%rd777], %rd776;
add.s32 %r8262, %r8262, 1;
cvt.s64.s32 %rd2516, %r8262;
setp.ne.s32 %p179, %r8262, 6;
@%p179 bra $L__BB0_185;
st.local.u32 [%rd5], %rd2517;
mov.u32 %r3133, 4;
sub.s32 %r221, %r3133, %r218;
mov.u32 %r3134, 6;
sub.s32 %r3135, %r3134, %r218;
mul.wide.s32 %rd778, %r3135, 4;
add.s64 %rd779, %rd1, %rd778;
ld.local.u32 %r8263, [%rd779];
ld.local.u32 %r8264, [%rd779+-4];
and.b32 %r224, %r216, 31;
setp.eq.s32 %p180, %r224, 0;
@%p180 bra $L__BB0_188;
mov.u32 %r3136, 32;
sub.s32 %r3137, %r3136, %r224;
shr.u32 %r3138, %r8264, %r3137;
shl.b32 %r3139, %r8263, %r224;
add.s32 %r8263, %r3138, %r3139;
mul.wide.s32 %rd780, %r221, 4;
add.s64 %rd781, %rd1, %rd780;
ld.local.u32 %r3140, [%rd781];
shr.u32 %r3141, %r3140, %r3137;
shl.b32 %r3142, %r8264, %r224;
add.s32 %r8264, %r3141, %r3142;
$L__BB0_188:
and.b32 %r3143, %r215, -2147483648;
shr.u32 %r3144, %r8264, 30;
shl.b32 %r3145, %r8263, 2;
or.b32 %r3146, %r3144, %r3145;
shr.u32 %r3147, %r3146, 31;
shr.u32 %r3148, %r8263, 30;
add.s32 %r3149, %r3147, %r3148;
neg.s32 %r3150, %r3149;
setp.eq.s32 %p181, %r3143, 0;
selp.b32 %r8265, %r3149, %r3150, %p181;
setp.ne.s32 %p182, %r3147, 0;
xor.b32 %r3151, %r3143, -2147483648;
selp.b32 %r3152, %r3151, %r3143, %p182;
selp.b32 %r3153, -1, 0, %p182;
xor.b32 %r3154, %r3146, %r3153;
shl.b32 %r3155, %r8264, 2;
xor.b32 %r3156, %r3155, %r3153;
cvt.u64.u32 %rd782, %r3154;
cvt.u64.u32 %rd783, %r3156;
bfi.b64 %rd784, %rd782, %rd783, 32, 32;
cvt.rn.f64.s64 %fd19, %rd784;
mul.f64 %fd20, %fd19, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2463, %fd20;
setp.eq.s32 %p183, %r3152, 0;
neg.f32 %f2464, %f2463;
selp.f32 %f5234, %f2463, %f2464, %p183;
$L__BB0_190:
add.s32 %r231, %r8265, 1;
and.b32 %r232, %r231, 1;
setp.eq.s32 %p184, %r232, 0;
selp.f32 %f183, %f5234, 0f3F800000, %p184;
mul.rn.f32 %f184, %f5234, %f5234;
mov.f32 %f5235, 0fB94D4153;
@%p184 bra $L__BB0_192;
mov.f32 %f2467, 0fBAB607ED;
mov.f32 %f2468, 0f37CBAC00;
fma.rn.f32 %f5235, %f2468, %f184, %f2467;
$L__BB0_192:
selp.f32 %f2469, 0f3C0885E4, 0f3D2AAABB, %p184;
fma.rn.f32 %f2470, %f5235, %f184, %f2469;
selp.f32 %f2471, 0fBE2AAAA8, 0fBEFFFFFF, %p184;
fma.rn.f32 %f2472, %f2470, %f184, %f2471;
mov.f32 %f2473, 0f00000000;
fma.rn.f32 %f2474, %f184, %f183, %f2473;
fma.rn.f32 %f5215, %f2472, %f2474, %f183;
and.b32 %r3158, %r231, 2;
setp.eq.s32 %p186, %r3158, 0;
@%p186 bra $L__BB0_194;
mov.f32 %f2476, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f2476, %f2473;
$L__BB0_194:
selp.f32 %f191, %f5215, %f5216, %p4;
selp.f32 %f192, %f5213, %f5214, %p4;
@%p165 bra $L__BB0_196;
add.f32 %f5328, %f192, %f191;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_196:
mov.u32 %r8213, -2;
sub.s32 %r8212, %r8213, %r12;
add.s32 %r8211, %r13, -15;
setp.ge.s32 %p1824, %r8211, %r8212;
@%p1824 bra $L__BB0_225;
shl.b32 %r3162, %r12, 5;
mov.u32 %r3163, -32;
sub.s32 %r233, %r3163, %r3162;
setp.ge.s32 %p190, %r14, %r233;
@%p190 bra $L__BB0_210;
mul.f32 %f2479, %f5343, 0f3F22F983;
cvt.rni.s32.f32 %r8269, %f2479;
cvt.rn.f32.s32 %f2480, %r8269;
mov.f32 %f2481, 0fBFC90FDA;
fma.rn.f32 %f2482, %f2480, %f2481, %f5343;
mov.f32 %f2483, 0fB3A22168;
fma.rn.f32 %f2484, %f2480, %f2483, %f2482;
mov.f32 %f2485, 0fA7C234C5;
fma.rn.f32 %f5243, %f2480, %f2485, %f2484;
abs.f32 %f200, %f5343;
setp.ltu.f32 %p191, %f200, 0f47CE4780;
@%p191 bra $L__BB0_206;
setp.eq.f32 %p192, %f200, 0f7F800000;
@%p192 bra $L__BB0_205;
bra.uni $L__BB0_200;
$L__BB0_205:
mov.f32 %f2488, 0f00000000;
mul.rn.f32 %f5243, %f5343, %f2488;
mov.u32 %r8269, 0;
bra.uni $L__BB0_206;
$L__BB0_200:
mov.b32 %r235, %f5343;
shr.u32 %r3165, %r235, 23;
and.b32 %r3166, %r3165, 255;
add.s32 %r236, %r3166, -128;
shl.b32 %r3167, %r235, 8;
or.b32 %r237, %r3167, -2147483648;
shr.u32 %r238, %r236, 5;
mov.u64 %rd2518, 0;
mov.u32 %r8266, 0;
mov.u64 %rd788, __cudart_i2opi_f;
mov.u64 %rd2519, %rd2518;
$L__BB0_201:
.pragma "nounroll";
shl.b64 %rd787, %rd2518, 2;
add.s64 %rd789, %rd788, %rd787;
ld.global.nc.u32 %r3168, [%rd789];
mad.wide.u32 %rd790, %r3168, %r237, %rd2519;
shr.u64 %rd2519, %rd790, 32;
add.s64 %rd791, %rd1, %rd787;
st.local.u32 [%rd791], %rd790;
add.s32 %r8266, %r8266, 1;
cvt.s64.s32 %rd2518, %r8266;
setp.ne.s32 %p193, %r8266, 6;
@%p193 bra $L__BB0_201;
st.local.u32 [%rd5], %rd2519;
mov.u32 %r3169, 4;
sub.s32 %r241, %r3169, %r238;
mov.u32 %r3170, 6;
sub.s32 %r3171, %r3170, %r238;
mul.wide.s32 %rd792, %r3171, 4;
add.s64 %rd793, %rd1, %rd792;
ld.local.u32 %r8267, [%rd793];
ld.local.u32 %r8268, [%rd793+-4];
and.b32 %r244, %r236, 31;
setp.eq.s32 %p194, %r244, 0;
@%p194 bra $L__BB0_204;
mov.u32 %r3172, 32;
sub.s32 %r3173, %r3172, %r244;
shr.u32 %r3174, %r8268, %r3173;
shl.b32 %r3175, %r8267, %r244;
add.s32 %r8267, %r3174, %r3175;
mul.wide.s32 %rd794, %r241, 4;
add.s64 %rd795, %rd1, %rd794;
ld.local.u32 %r3176, [%rd795];
shr.u32 %r3177, %r3176, %r3173;
shl.b32 %r3178, %r8268, %r244;
add.s32 %r8268, %r3177, %r3178;
$L__BB0_204:
and.b32 %r3179, %r235, -2147483648;
shr.u32 %r3180, %r8268, 30;
shl.b32 %r3181, %r8267, 2;
or.b32 %r3182, %r3180, %r3181;
shr.u32 %r3183, %r3182, 31;
shr.u32 %r3184, %r8267, 30;
add.s32 %r3185, %r3183, %r3184;
neg.s32 %r3186, %r3185;
setp.eq.s32 %p195, %r3179, 0;
selp.b32 %r8269, %r3185, %r3186, %p195;
setp.ne.s32 %p196, %r3183, 0;
xor.b32 %r3187, %r3179, -2147483648;
selp.b32 %r3188, %r3187, %r3179, %p196;
selp.b32 %r3189, -1, 0, %p196;
xor.b32 %r3190, %r3182, %r3189;
shl.b32 %r3191, %r8268, 2;
xor.b32 %r3192, %r3191, %r3189;
cvt.u64.u32 %rd796, %r3190;
cvt.u64.u32 %rd797, %r3192;
bfi.b64 %rd798, %rd796, %rd797, 32, 32;
cvt.rn.f64.s64 %fd21, %rd798;
mul.f64 %fd22, %fd21, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2486, %fd22;
setp.eq.s32 %p197, %r3188, 0;
neg.f32 %f2487, %f2486;
selp.f32 %f5243, %f2486, %f2487, %p197;
$L__BB0_206:
and.b32 %r251, %r8269, 1;
setp.eq.s32 %p198, %r251, 0;
selp.f32 %f204, %f5243, 0f3F800000, %p198;
mul.rn.f32 %f205, %f5243, %f5243;
mov.f32 %f5244, 0fB94D4153;
@%p198 bra $L__BB0_208;
mov.f32 %f2490, 0fBAB607ED;
mov.f32 %f2491, 0f37CBAC00;
fma.rn.f32 %f5244, %f2491, %f205, %f2490;
$L__BB0_208:
selp.f32 %f2492, 0f3C0885E4, 0f3D2AAABB, %p198;
fma.rn.f32 %f2493, %f5244, %f205, %f2492;
selp.f32 %f2494, 0fBE2AAAA8, 0fBEFFFFFF, %p198;
fma.rn.f32 %f2495, %f2493, %f205, %f2494;
mov.f32 %f2496, 0f00000000;
fma.rn.f32 %f2497, %f205, %f204, %f2496;
fma.rn.f32 %f5213, %f2495, %f2497, %f204;
and.b32 %r3194, %r8269, 2;
setp.eq.s32 %p200, %r3194, 0;
@%p200 bra $L__BB0_210;
mov.f32 %f2499, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f2499, %f2496;
$L__BB0_210:
setp.lt.s32 %p5, %r14, %r233;
@%p190 bra $L__BB0_223;
mul.f32 %f2500, %f5335, 0f3F22F983;
cvt.rni.s32.f32 %r8273, %f2500;
cvt.rn.f32.s32 %f2501, %r8273;
mov.f32 %f2502, 0fBFC90FDA;
fma.rn.f32 %f2503, %f2501, %f2502, %f5335;
mov.f32 %f2504, 0fB3A22168;
fma.rn.f32 %f2505, %f2501, %f2504, %f2503;
mov.f32 %f2506, 0fA7C234C5;
fma.rn.f32 %f5247, %f2501, %f2506, %f2505;
abs.f32 %f213, %f5335;
setp.ltu.f32 %p202, %f213, 0f47CE4780;
@%p202 bra $L__BB0_219;
setp.eq.f32 %p203, %f213, 0f7F800000;
@%p203 bra $L__BB0_218;
bra.uni $L__BB0_213;
$L__BB0_218:
mov.f32 %f2509, 0f00000000;
mul.rn.f32 %f5247, %f5335, %f2509;
mov.u32 %r8273, 0;
bra.uni $L__BB0_219;
$L__BB0_213:
mov.b32 %r253, %f5335;
shr.u32 %r3196, %r253, 23;
and.b32 %r3197, %r3196, 255;
add.s32 %r254, %r3197, -128;
shl.b32 %r3198, %r253, 8;
or.b32 %r255, %r3198, -2147483648;
shr.u32 %r256, %r254, 5;
mov.u64 %rd2520, 0;
mov.u32 %r8270, 0;
mov.u64 %rd802, __cudart_i2opi_f;
mov.u64 %rd2521, %rd2520;
$L__BB0_214:
.pragma "nounroll";
shl.b64 %rd801, %rd2520, 2;
add.s64 %rd803, %rd802, %rd801;
ld.global.nc.u32 %r3199, [%rd803];
mad.wide.u32 %rd804, %r3199, %r255, %rd2521;
shr.u64 %rd2521, %rd804, 32;
add.s64 %rd805, %rd1, %rd801;
st.local.u32 [%rd805], %rd804;
add.s32 %r8270, %r8270, 1;
cvt.s64.s32 %rd2520, %r8270;
setp.ne.s32 %p204, %r8270, 6;
@%p204 bra $L__BB0_214;
st.local.u32 [%rd5], %rd2521;
mov.u32 %r3200, 4;
sub.s32 %r259, %r3200, %r256;
mov.u32 %r3201, 6;
sub.s32 %r3202, %r3201, %r256;
mul.wide.s32 %rd806, %r3202, 4;
add.s64 %rd807, %rd1, %rd806;
ld.local.u32 %r8271, [%rd807];
ld.local.u32 %r8272, [%rd807+-4];
and.b32 %r262, %r254, 31;
setp.eq.s32 %p205, %r262, 0;
@%p205 bra $L__BB0_217;
mov.u32 %r3203, 32;
sub.s32 %r3204, %r3203, %r262;
shr.u32 %r3205, %r8272, %r3204;
shl.b32 %r3206, %r8271, %r262;
add.s32 %r8271, %r3205, %r3206;
mul.wide.s32 %rd808, %r259, 4;
add.s64 %rd809, %rd1, %rd808;
ld.local.u32 %r3207, [%rd809];
shr.u32 %r3208, %r3207, %r3204;
shl.b32 %r3209, %r8272, %r262;
add.s32 %r8272, %r3208, %r3209;
$L__BB0_217:
and.b32 %r3210, %r253, -2147483648;
shr.u32 %r3211, %r8272, 30;
shl.b32 %r3212, %r8271, 2;
or.b32 %r3213, %r3211, %r3212;
shr.u32 %r3214, %r3213, 31;
shr.u32 %r3215, %r8271, 30;
add.s32 %r3216, %r3214, %r3215;
neg.s32 %r3217, %r3216;
setp.eq.s32 %p206, %r3210, 0;
selp.b32 %r8273, %r3216, %r3217, %p206;
setp.ne.s32 %p207, %r3214, 0;
xor.b32 %r3218, %r3210, -2147483648;
selp.b32 %r3219, %r3218, %r3210, %p207;
selp.b32 %r3220, -1, 0, %p207;
xor.b32 %r3221, %r3213, %r3220;
shl.b32 %r3222, %r8272, 2;
xor.b32 %r3223, %r3222, %r3220;
cvt.u64.u32 %rd810, %r3221;
cvt.u64.u32 %rd811, %r3223;
bfi.b64 %rd812, %rd810, %rd811, 32, 32;
cvt.rn.f64.s64 %fd23, %rd812;
mul.f64 %fd24, %fd23, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2507, %fd24;
setp.eq.s32 %p208, %r3219, 0;
neg.f32 %f2508, %f2507;
selp.f32 %f5247, %f2507, %f2508, %p208;
$L__BB0_219:
add.s32 %r269, %r8273, 1;
and.b32 %r270, %r269, 1;
setp.eq.s32 %p209, %r270, 0;
selp.f32 %f217, %f5247, 0f3F800000, %p209;
mul.rn.f32 %f218, %f5247, %f5247;
mov.f32 %f5248, 0fB94D4153;
@%p209 bra $L__BB0_221;
mov.f32 %f2511, 0fBAB607ED;
mov.f32 %f2512, 0f37CBAC00;
fma.rn.f32 %f5248, %f2512, %f218, %f2511;
$L__BB0_221:
selp.f32 %f2513, 0f3C0885E4, 0f3D2AAABB, %p209;
fma.rn.f32 %f2514, %f5248, %f218, %f2513;
selp.f32 %f2515, 0fBE2AAAA8, 0fBEFFFFFF, %p209;
fma.rn.f32 %f2516, %f2514, %f218, %f2515;
mov.f32 %f2517, 0f00000000;
fma.rn.f32 %f2518, %f218, %f217, %f2517;
fma.rn.f32 %f5215, %f2516, %f2518, %f217;
and.b32 %r3225, %r269, 2;
setp.eq.s32 %p211, %r3225, 0;
@%p211 bra $L__BB0_223;
mov.f32 %f2520, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f2520, %f2517;
$L__BB0_223:
selp.f32 %f225, %f5215, %f5216, %p5;
selp.f32 %f226, %f5213, %f5214, %p5;
@%p190 bra $L__BB0_225;
add.f32 %f5327, %f226, %f225;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_225:
add.s32 %r8050, %r13, -15;
mov.u32 %r3226, -3;
sub.s32 %r3227, %r3226, %r12;
setp.ge.s32 %p214, %r8050, %r3227;
@%p214 bra $L__BB0_254;
shl.b32 %r3229, %r12, 5;
neg.s32 %r271, %r3229;
setp.ge.s32 %p215, %r14, %r271;
@%p215 bra $L__BB0_239;
mul.f32 %f2523, %f5342, 0f3F22F983;
cvt.rni.s32.f32 %r8277, %f2523;
cvt.rn.f32.s32 %f2524, %r8277;
mov.f32 %f2525, 0fBFC90FDA;
fma.rn.f32 %f2526, %f2524, %f2525, %f5342;
mov.f32 %f2527, 0fB3A22168;
fma.rn.f32 %f2528, %f2524, %f2527, %f2526;
mov.f32 %f2529, 0fA7C234C5;
fma.rn.f32 %f5256, %f2524, %f2529, %f2528;
abs.f32 %f234, %f5342;
setp.ltu.f32 %p216, %f234, 0f47CE4780;
@%p216 bra $L__BB0_235;
setp.eq.f32 %p217, %f234, 0f7F800000;
@%p217 bra $L__BB0_234;
bra.uni $L__BB0_229;
$L__BB0_234:
mov.f32 %f2532, 0f00000000;
mul.rn.f32 %f5256, %f5342, %f2532;
mov.u32 %r8277, 0;
bra.uni $L__BB0_235;
$L__BB0_229:
mov.b32 %r273, %f5342;
shr.u32 %r3231, %r273, 23;
and.b32 %r3232, %r3231, 255;
add.s32 %r274, %r3232, -128;
shl.b32 %r3233, %r273, 8;
or.b32 %r275, %r3233, -2147483648;
shr.u32 %r276, %r274, 5;
mov.u64 %rd2522, 0;
mov.u32 %r8274, 0;
mov.u64 %rd816, __cudart_i2opi_f;
mov.u64 %rd2523, %rd2522;
$L__BB0_230:
.pragma "nounroll";
shl.b64 %rd815, %rd2522, 2;
add.s64 %rd817, %rd816, %rd815;
ld.global.nc.u32 %r3234, [%rd817];
mad.wide.u32 %rd818, %r3234, %r275, %rd2523;
shr.u64 %rd2523, %rd818, 32;
add.s64 %rd819, %rd1, %rd815;
st.local.u32 [%rd819], %rd818;
add.s32 %r8274, %r8274, 1;
cvt.s64.s32 %rd2522, %r8274;
setp.ne.s32 %p218, %r8274, 6;
@%p218 bra $L__BB0_230;
st.local.u32 [%rd5], %rd2523;
mov.u32 %r3235, 4;
sub.s32 %r279, %r3235, %r276;
mov.u32 %r3236, 6;
sub.s32 %r3237, %r3236, %r276;
mul.wide.s32 %rd820, %r3237, 4;
add.s64 %rd821, %rd1, %rd820;
ld.local.u32 %r8275, [%rd821];
ld.local.u32 %r8276, [%rd821+-4];
and.b32 %r282, %r274, 31;
setp.eq.s32 %p219, %r282, 0;
@%p219 bra $L__BB0_233;
mov.u32 %r3238, 32;
sub.s32 %r3239, %r3238, %r282;
shr.u32 %r3240, %r8276, %r3239;
shl.b32 %r3241, %r8275, %r282;
add.s32 %r8275, %r3240, %r3241;
mul.wide.s32 %rd822, %r279, 4;
add.s64 %rd823, %rd1, %rd822;
ld.local.u32 %r3242, [%rd823];
shr.u32 %r3243, %r3242, %r3239;
shl.b32 %r3244, %r8276, %r282;
add.s32 %r8276, %r3243, %r3244;
$L__BB0_233:
and.b32 %r3245, %r273, -2147483648;
shr.u32 %r3246, %r8276, 30;
shl.b32 %r3247, %r8275, 2;
or.b32 %r3248, %r3246, %r3247;
shr.u32 %r3249, %r3248, 31;
shr.u32 %r3250, %r8275, 30;
add.s32 %r3251, %r3249, %r3250;
neg.s32 %r3252, %r3251;
setp.eq.s32 %p220, %r3245, 0;
selp.b32 %r8277, %r3251, %r3252, %p220;
setp.ne.s32 %p221, %r3249, 0;
xor.b32 %r3253, %r3245, -2147483648;
selp.b32 %r3254, %r3253, %r3245, %p221;
selp.b32 %r3255, -1, 0, %p221;
xor.b32 %r3256, %r3248, %r3255;
shl.b32 %r3257, %r8276, 2;
xor.b32 %r3258, %r3257, %r3255;
cvt.u64.u32 %rd824, %r3256;
cvt.u64.u32 %rd825, %r3258;
bfi.b64 %rd826, %rd824, %rd825, 32, 32;
cvt.rn.f64.s64 %fd25, %rd826;
mul.f64 %fd26, %fd25, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2530, %fd26;
setp.eq.s32 %p222, %r3254, 0;
neg.f32 %f2531, %f2530;
selp.f32 %f5256, %f2530, %f2531, %p222;
$L__BB0_235:
and.b32 %r289, %r8277, 1;
setp.eq.s32 %p223, %r289, 0;
selp.f32 %f238, %f5256, 0f3F800000, %p223;
mul.rn.f32 %f239, %f5256, %f5256;
mov.f32 %f5257, 0fB94D4153;
@%p223 bra $L__BB0_237;
mov.f32 %f2534, 0fBAB607ED;
mov.f32 %f2535, 0f37CBAC00;
fma.rn.f32 %f5257, %f2535, %f239, %f2534;
$L__BB0_237:
selp.f32 %f2536, 0f3C0885E4, 0f3D2AAABB, %p223;
fma.rn.f32 %f2537, %f5257, %f239, %f2536;
selp.f32 %f2538, 0fBE2AAAA8, 0fBEFFFFFF, %p223;
fma.rn.f32 %f2539, %f2537, %f239, %f2538;
mov.f32 %f2540, 0f00000000;
fma.rn.f32 %f2541, %f239, %f238, %f2540;
fma.rn.f32 %f5213, %f2539, %f2541, %f238;
and.b32 %r3260, %r8277, 2;
setp.eq.s32 %p225, %r3260, 0;
@%p225 bra $L__BB0_239;
mov.f32 %f2543, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f2543, %f2540;
$L__BB0_239:
setp.lt.s32 %p6, %r14, %r271;
@%p215 bra $L__BB0_252;
mul.f32 %f2544, %f5334, 0f3F22F983;
cvt.rni.s32.f32 %r8281, %f2544;
cvt.rn.f32.s32 %f2545, %r8281;
mov.f32 %f2546, 0fBFC90FDA;
fma.rn.f32 %f2547, %f2545, %f2546, %f5334;
mov.f32 %f2548, 0fB3A22168;
fma.rn.f32 %f2549, %f2545, %f2548, %f2547;
mov.f32 %f2550, 0fA7C234C5;
fma.rn.f32 %f5260, %f2545, %f2550, %f2549;
abs.f32 %f247, %f5334;
setp.ltu.f32 %p227, %f247, 0f47CE4780;
@%p227 bra $L__BB0_248;
setp.eq.f32 %p228, %f247, 0f7F800000;
@%p228 bra $L__BB0_247;
bra.uni $L__BB0_242;
$L__BB0_247:
mov.f32 %f2553, 0f00000000;
mul.rn.f32 %f5260, %f5334, %f2553;
mov.u32 %r8281, 0;
bra.uni $L__BB0_248;
$L__BB0_242:
mov.b32 %r291, %f5334;
shr.u32 %r3262, %r291, 23;
and.b32 %r3263, %r3262, 255;
add.s32 %r292, %r3263, -128;
shl.b32 %r3264, %r291, 8;
or.b32 %r293, %r3264, -2147483648;
shr.u32 %r294, %r292, 5;
mov.u64 %rd2524, 0;
mov.u32 %r8278, 0;
mov.u64 %rd830, __cudart_i2opi_f;
mov.u64 %rd2525, %rd2524;
$L__BB0_243:
.pragma "nounroll";
shl.b64 %rd829, %rd2524, 2;
add.s64 %rd831, %rd830, %rd829;
ld.global.nc.u32 %r3265, [%rd831];
mad.wide.u32 %rd832, %r3265, %r293, %rd2525;
shr.u64 %rd2525, %rd832, 32;
add.s64 %rd833, %rd1, %rd829;
st.local.u32 [%rd833], %rd832;
add.s32 %r8278, %r8278, 1;
cvt.s64.s32 %rd2524, %r8278;
setp.ne.s32 %p229, %r8278, 6;
@%p229 bra $L__BB0_243;
st.local.u32 [%rd5], %rd2525;
mov.u32 %r3266, 4;
sub.s32 %r297, %r3266, %r294;
mov.u32 %r3267, 6;
sub.s32 %r3268, %r3267, %r294;
mul.wide.s32 %rd834, %r3268, 4;
add.s64 %rd835, %rd1, %rd834;
ld.local.u32 %r8279, [%rd835];
ld.local.u32 %r8280, [%rd835+-4];
and.b32 %r300, %r292, 31;
setp.eq.s32 %p230, %r300, 0;
@%p230 bra $L__BB0_246;
mov.u32 %r3269, 32;
sub.s32 %r3270, %r3269, %r300;
shr.u32 %r3271, %r8280, %r3270;
shl.b32 %r3272, %r8279, %r300;
add.s32 %r8279, %r3271, %r3272;
mul.wide.s32 %rd836, %r297, 4;
add.s64 %rd837, %rd1, %rd836;
ld.local.u32 %r3273, [%rd837];
shr.u32 %r3274, %r3273, %r3270;
shl.b32 %r3275, %r8280, %r300;
add.s32 %r8280, %r3274, %r3275;
$L__BB0_246:
and.b32 %r3276, %r291, -2147483648;
shr.u32 %r3277, %r8280, 30;
shl.b32 %r3278, %r8279, 2;
or.b32 %r3279, %r3277, %r3278;
shr.u32 %r3280, %r3279, 31;
shr.u32 %r3281, %r8279, 30;
add.s32 %r3282, %r3280, %r3281;
neg.s32 %r3283, %r3282;
setp.eq.s32 %p231, %r3276, 0;
selp.b32 %r8281, %r3282, %r3283, %p231;
setp.ne.s32 %p232, %r3280, 0;
xor.b32 %r3284, %r3276, -2147483648;
selp.b32 %r3285, %r3284, %r3276, %p232;
selp.b32 %r3286, -1, 0, %p232;
xor.b32 %r3287, %r3279, %r3286;
shl.b32 %r3288, %r8280, 2;
xor.b32 %r3289, %r3288, %r3286;
cvt.u64.u32 %rd838, %r3287;
cvt.u64.u32 %rd839, %r3289;
bfi.b64 %rd840, %rd838, %rd839, 32, 32;
cvt.rn.f64.s64 %fd27, %rd840;
mul.f64 %fd28, %fd27, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2551, %fd28;
setp.eq.s32 %p233, %r3285, 0;
neg.f32 %f2552, %f2551;
selp.f32 %f5260, %f2551, %f2552, %p233;
$L__BB0_248:
add.s32 %r307, %r8281, 1;
and.b32 %r308, %r307, 1;
setp.eq.s32 %p234, %r308, 0;
selp.f32 %f251, %f5260, 0f3F800000, %p234;
mul.rn.f32 %f252, %f5260, %f5260;
mov.f32 %f5261, 0fB94D4153;
@%p234 bra $L__BB0_250;
mov.f32 %f2555, 0fBAB607ED;
mov.f32 %f2556, 0f37CBAC00;
fma.rn.f32 %f5261, %f2556, %f252, %f2555;
$L__BB0_250:
selp.f32 %f2557, 0f3C0885E4, 0f3D2AAABB, %p234;
fma.rn.f32 %f2558, %f5261, %f252, %f2557;
selp.f32 %f2559, 0fBE2AAAA8, 0fBEFFFFFF, %p234;
fma.rn.f32 %f2560, %f2558, %f252, %f2559;
mov.f32 %f2561, 0f00000000;
fma.rn.f32 %f2562, %f252, %f251, %f2561;
fma.rn.f32 %f5215, %f2560, %f2562, %f251;
and.b32 %r3291, %r307, 2;
setp.eq.s32 %p236, %r3291, 0;
@%p236 bra $L__BB0_252;
mov.f32 %f2564, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f2564, %f2561;
$L__BB0_252:
selp.f32 %f259, %f5215, %f5216, %p6;
selp.f32 %f260, %f5213, %f5214, %p6;
@%p215 bra $L__BB0_254;
add.f32 %f5326, %f260, %f259;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_254:
@%p214 bra $L__BB0_476;
shl.b32 %r3295, %r12, 5;
mov.u32 %r3296, -32;
sub.s32 %r309, %r3296, %r3295;
setp.ge.s32 %p240, %r14, %r309;
@%p240 bra $L__BB0_268;
mul.f32 %f2567, %f5341, 0f3F22F983;
cvt.rni.s32.f32 %r8285, %f2567;
cvt.rn.f32.s32 %f2568, %r8285;
mov.f32 %f2569, 0fBFC90FDA;
fma.rn.f32 %f2570, %f2568, %f2569, %f5341;
mov.f32 %f2571, 0fB3A22168;
fma.rn.f32 %f2572, %f2568, %f2571, %f2570;
mov.f32 %f2573, 0fA7C234C5;
fma.rn.f32 %f5269, %f2568, %f2573, %f2572;
abs.f32 %f268, %f5341;
setp.ltu.f32 %p241, %f268, 0f47CE4780;
@%p241 bra $L__BB0_264;
setp.eq.f32 %p242, %f268, 0f7F800000;
@%p242 bra $L__BB0_263;
bra.uni $L__BB0_258;
$L__BB0_263:
mov.f32 %f2576, 0f00000000;
mul.rn.f32 %f5269, %f5341, %f2576;
mov.u32 %r8285, 0;
bra.uni $L__BB0_264;
$L__BB0_258:
mov.b32 %r311, %f5341;
shr.u32 %r3298, %r311, 23;
and.b32 %r3299, %r3298, 255;
add.s32 %r312, %r3299, -128;
shl.b32 %r3300, %r311, 8;
or.b32 %r313, %r3300, -2147483648;
shr.u32 %r314, %r312, 5;
mov.u64 %rd2526, 0;
mov.u32 %r8282, 0;
mov.u64 %rd844, __cudart_i2opi_f;
mov.u64 %rd2527, %rd2526;
$L__BB0_259:
.pragma "nounroll";
shl.b64 %rd843, %rd2526, 2;
add.s64 %rd845, %rd844, %rd843;
ld.global.nc.u32 %r3301, [%rd845];
mad.wide.u32 %rd846, %r3301, %r313, %rd2527;
shr.u64 %rd2527, %rd846, 32;
add.s64 %rd847, %rd1, %rd843;
st.local.u32 [%rd847], %rd846;
add.s32 %r8282, %r8282, 1;
cvt.s64.s32 %rd2526, %r8282;
setp.ne.s32 %p243, %r8282, 6;
@%p243 bra $L__BB0_259;
st.local.u32 [%rd5], %rd2527;
mov.u32 %r3302, 4;
sub.s32 %r317, %r3302, %r314;
mov.u32 %r3303, 6;
sub.s32 %r3304, %r3303, %r314;
mul.wide.s32 %rd848, %r3304, 4;
add.s64 %rd849, %rd1, %rd848;
ld.local.u32 %r8283, [%rd849];
ld.local.u32 %r8284, [%rd849+-4];
and.b32 %r320, %r312, 31;
setp.eq.s32 %p244, %r320, 0;
@%p244 bra $L__BB0_262;
mov.u32 %r3305, 32;
sub.s32 %r3306, %r3305, %r320;
shr.u32 %r3307, %r8284, %r3306;
shl.b32 %r3308, %r8283, %r320;
add.s32 %r8283, %r3307, %r3308;
mul.wide.s32 %rd850, %r317, 4;
add.s64 %rd851, %rd1, %rd850;
ld.local.u32 %r3309, [%rd851];
shr.u32 %r3310, %r3309, %r3306;
shl.b32 %r3311, %r8284, %r320;
add.s32 %r8284, %r3310, %r3311;
$L__BB0_262:
and.b32 %r3312, %r311, -2147483648;
shr.u32 %r3313, %r8284, 30;
shl.b32 %r3314, %r8283, 2;
or.b32 %r3315, %r3313, %r3314;
shr.u32 %r3316, %r3315, 31;
shr.u32 %r3317, %r8283, 30;
add.s32 %r3318, %r3316, %r3317;
neg.s32 %r3319, %r3318;
setp.eq.s32 %p245, %r3312, 0;
selp.b32 %r8285, %r3318, %r3319, %p245;
setp.ne.s32 %p246, %r3316, 0;
xor.b32 %r3320, %r3312, -2147483648;
selp.b32 %r3321, %r3320, %r3312, %p246;
selp.b32 %r3322, -1, 0, %p246;
xor.b32 %r3323, %r3315, %r3322;
shl.b32 %r3324, %r8284, 2;
xor.b32 %r3325, %r3324, %r3322;
cvt.u64.u32 %rd852, %r3323;
cvt.u64.u32 %rd853, %r3325;
bfi.b64 %rd854, %rd852, %rd853, 32, 32;
cvt.rn.f64.s64 %fd29, %rd854;
mul.f64 %fd30, %fd29, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2574, %fd30;
setp.eq.s32 %p247, %r3321, 0;
neg.f32 %f2575, %f2574;
selp.f32 %f5269, %f2574, %f2575, %p247;
$L__BB0_264:
and.b32 %r327, %r8285, 1;
setp.eq.s32 %p248, %r327, 0;
selp.f32 %f272, %f5269, 0f3F800000, %p248;
mul.rn.f32 %f273, %f5269, %f5269;
mov.f32 %f5270, 0fB94D4153;
@%p248 bra $L__BB0_266;
mov.f32 %f2578, 0fBAB607ED;
mov.f32 %f2579, 0f37CBAC00;
fma.rn.f32 %f5270, %f2579, %f273, %f2578;
$L__BB0_266:
selp.f32 %f2580, 0f3C0885E4, 0f3D2AAABB, %p248;
fma.rn.f32 %f2581, %f5270, %f273, %f2580;
selp.f32 %f2582, 0fBE2AAAA8, 0fBEFFFFFF, %p248;
fma.rn.f32 %f2583, %f2581, %f273, %f2582;
mov.f32 %f2584, 0f00000000;
fma.rn.f32 %f2585, %f273, %f272, %f2584;
fma.rn.f32 %f5213, %f2583, %f2585, %f272;
and.b32 %r3327, %r8285, 2;
setp.eq.s32 %p250, %r3327, 0;
@%p250 bra $L__BB0_268;
mov.f32 %f2587, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f2587, %f2584;
$L__BB0_268:
setp.lt.s32 %p7, %r14, %r309;
@%p240 bra $L__BB0_281;
mul.f32 %f2588, %f5333, 0f3F22F983;
cvt.rni.s32.f32 %r8289, %f2588;
cvt.rn.f32.s32 %f2589, %r8289;
mov.f32 %f2590, 0fBFC90FDA;
fma.rn.f32 %f2591, %f2589, %f2590, %f5333;
mov.f32 %f2592, 0fB3A22168;
fma.rn.f32 %f2593, %f2589, %f2592, %f2591;
mov.f32 %f2594, 0fA7C234C5;
fma.rn.f32 %f5273, %f2589, %f2594, %f2593;
abs.f32 %f281, %f5333;
setp.ltu.f32 %p252, %f281, 0f47CE4780;
@%p252 bra $L__BB0_277;
setp.eq.f32 %p253, %f281, 0f7F800000;
@%p253 bra $L__BB0_276;
bra.uni $L__BB0_271;
$L__BB0_276:
mov.f32 %f2597, 0f00000000;
mul.rn.f32 %f5273, %f5333, %f2597;
mov.u32 %r8289, 0;
bra.uni $L__BB0_277;
$L__BB0_271:
mov.b32 %r329, %f5333;
shr.u32 %r3329, %r329, 23;
and.b32 %r3330, %r3329, 255;
add.s32 %r330, %r3330, -128;
shl.b32 %r3331, %r329, 8;
or.b32 %r331, %r3331, -2147483648;
shr.u32 %r332, %r330, 5;
mov.u64 %rd2528, 0;
mov.u32 %r8286, 0;
mov.u64 %rd858, __cudart_i2opi_f;
mov.u64 %rd2529, %rd2528;
$L__BB0_272:
.pragma "nounroll";
shl.b64 %rd857, %rd2528, 2;
add.s64 %rd859, %rd858, %rd857;
ld.global.nc.u32 %r3332, [%rd859];
mad.wide.u32 %rd860, %r3332, %r331, %rd2529;
shr.u64 %rd2529, %rd860, 32;
add.s64 %rd861, %rd1, %rd857;
st.local.u32 [%rd861], %rd860;
add.s32 %r8286, %r8286, 1;
cvt.s64.s32 %rd2528, %r8286;
setp.ne.s32 %p254, %r8286, 6;
@%p254 bra $L__BB0_272;
st.local.u32 [%rd5], %rd2529;
mov.u32 %r3333, 4;
sub.s32 %r335, %r3333, %r332;
mov.u32 %r3334, 6;
sub.s32 %r3335, %r3334, %r332;
mul.wide.s32 %rd862, %r3335, 4;
add.s64 %rd863, %rd1, %rd862;
ld.local.u32 %r8287, [%rd863];
ld.local.u32 %r8288, [%rd863+-4];
and.b32 %r338, %r330, 31;
setp.eq.s32 %p255, %r338, 0;
@%p255 bra $L__BB0_275;
mov.u32 %r3336, 32;
sub.s32 %r3337, %r3336, %r338;
shr.u32 %r3338, %r8288, %r3337;
shl.b32 %r3339, %r8287, %r338;
add.s32 %r8287, %r3338, %r3339;
mul.wide.s32 %rd864, %r335, 4;
add.s64 %rd865, %rd1, %rd864;
ld.local.u32 %r3340, [%rd865];
shr.u32 %r3341, %r3340, %r3337;
shl.b32 %r3342, %r8288, %r338;
add.s32 %r8288, %r3341, %r3342;
$L__BB0_275:
and.b32 %r3343, %r329, -2147483648;
shr.u32 %r3344, %r8288, 30;
shl.b32 %r3345, %r8287, 2;
or.b32 %r3346, %r3344, %r3345;
shr.u32 %r3347, %r3346, 31;
shr.u32 %r3348, %r8287, 30;
add.s32 %r3349, %r3347, %r3348;
neg.s32 %r3350, %r3349;
setp.eq.s32 %p256, %r3343, 0;
selp.b32 %r8289, %r3349, %r3350, %p256;
setp.ne.s32 %p257, %r3347, 0;
xor.b32 %r3351, %r3343, -2147483648;
selp.b32 %r3352, %r3351, %r3343, %p257;
selp.b32 %r3353, -1, 0, %p257;
xor.b32 %r3354, %r3346, %r3353;
shl.b32 %r3355, %r8288, 2;
xor.b32 %r3356, %r3355, %r3353;
cvt.u64.u32 %rd866, %r3354;
cvt.u64.u32 %rd867, %r3356;
bfi.b64 %rd868, %rd866, %rd867, 32, 32;
cvt.rn.f64.s64 %fd31, %rd868;
mul.f64 %fd32, %fd31, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2595, %fd32;
setp.eq.s32 %p258, %r3352, 0;
neg.f32 %f2596, %f2595;
selp.f32 %f5273, %f2595, %f2596, %p258;
$L__BB0_277:
add.s32 %r345, %r8289, 1;
and.b32 %r346, %r345, 1;
setp.eq.s32 %p259, %r346, 0;
selp.f32 %f285, %f5273, 0f3F800000, %p259;
mul.rn.f32 %f286, %f5273, %f5273;
mov.f32 %f5274, 0fB94D4153;
@%p259 bra $L__BB0_279;
mov.f32 %f2599, 0fBAB607ED;
mov.f32 %f2600, 0f37CBAC00;
fma.rn.f32 %f5274, %f2600, %f286, %f2599;
$L__BB0_279:
selp.f32 %f2601, 0f3C0885E4, 0f3D2AAABB, %p259;
fma.rn.f32 %f2602, %f5274, %f286, %f2601;
selp.f32 %f2603, 0fBE2AAAA8, 0fBEFFFFFF, %p259;
fma.rn.f32 %f2604, %f2602, %f286, %f2603;
mov.f32 %f2605, 0f00000000;
fma.rn.f32 %f2606, %f286, %f285, %f2605;
fma.rn.f32 %f5215, %f2604, %f2606, %f285;
and.b32 %r3358, %r345, 2;
setp.eq.s32 %p261, %r3358, 0;
@%p261 bra $L__BB0_281;
mov.f32 %f2608, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f2608, %f2605;
$L__BB0_281:
selp.f32 %f293, %f5215, %f5216, %p7;
selp.f32 %f294, %f5213, %f5214, %p7;
@%p240 bra $L__BB0_476;
add.f32 %f5325, %f294, %f293;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_476:
@%p32 bra $L__BB0_478;
shl.b32 %r3908, %r12, 2;
mov.u32 %r3909, -4;
sub.s32 %r3910, %r3909, %r3908;
add.s32 %r3911, %r13, -12;
setp.lt.s32 %p425, %r3911, %r3910;
@%p425 bra $L__BB0_758;
bra.uni $L__BB0_478;
$L__BB0_758:
mov.u32 %r4637, %ctaid.x;
mul.lo.s32 %r4638, %r2614, %r4637;
shl.b32 %r4639, %r12, 5;
add.s32 %r4640, %r4639, %r1;
mul.hi.s32 %r4641, %r4640, -1840700269;
add.s32 %r4642, %r4641, %r4640;
shr.u32 %r4643, %r4642, 31;
shr.s32 %r4644, %r4642, 2;
add.s32 %r4645, %r4644, %r4643;
mul.lo.s32 %r4646, %r4645, %r2615;
mul.lo.s32 %r4647, %r4645, 7;
sub.s32 %r4648, %r4640, %r4647;
mul.lo.s32 %r4649, %r4648, %r2616;
add.s32 %r4650, %r13, 4;
mad.lo.s32 %r4651, %r4650, %r2613, %r4638;
add.s32 %r4652, %r4651, %r4646;
add.s32 %r4653, %r4652, %r4649;
mul.wide.s32 %rd1373, %r4653, 4;
add.s64 %rd1374, %rd3, %rd1373;
ld.global.f32 %f839, [%rd1374];
add.s32 %r4654, %r4640, 32;
mul.hi.s32 %r4655, %r4654, -1840700269;
add.s32 %r4656, %r4655, %r4654;
shr.u32 %r4657, %r4656, 31;
shr.s32 %r4658, %r4656, 2;
add.s32 %r4659, %r4658, %r4657;
mul.lo.s32 %r4660, %r4659, %r2615;
mul.lo.s32 %r4661, %r4659, 7;
sub.s32 %r4662, %r4654, %r4661;
mul.lo.s32 %r4663, %r4662, %r2616;
add.s32 %r4664, %r4651, %r4660;
add.s32 %r4665, %r4664, %r4663;
mul.wide.s32 %rd1375, %r4665, 4;
add.s64 %rd1376, %rd3, %rd1375;
ld.global.f32 %f840, [%rd1376];
add.s32 %r4666, %r2612, %r4638;
mad.lo.s32 %r4667, %r13, %r2613, %r4666;
add.s32 %r4668, %r4667, %r4646;
add.s32 %r4669, %r4668, %r4649;
mul.wide.s32 %rd1377, %r4669, 4;
add.s64 %rd1378, %rd3, %rd1377;
ld.global.f32 %f841, [%rd1378];
add.s32 %r4670, %r4667, %r4660;
add.s32 %r4671, %r4670, %r4663;
mul.wide.s32 %rd1379, %r4671, 4;
add.s64 %rd1380, %rd3, %rd1379;
ld.global.f32 %f842, [%rd1380];
mul.wide.s32 %rd1381, %r2613, 4;
add.s64 %rd1382, %rd1378, %rd1381;
ld.global.f32 %f843, [%rd1382];
add.s64 %rd1383, %rd1380, %rd1381;
ld.global.f32 %f844, [%rd1383];
add.s64 %rd1384, %rd1382, %rd1381;
ld.global.f32 %f845, [%rd1384];
add.s64 %rd1385, %rd1383, %rd1381;
ld.global.f32 %f846, [%rd1385];
mul.hi.s32 %r4672, %r4640, 954437177;
shr.u32 %r4673, %r4672, 31;
shr.s32 %r4674, %r4672, 1;
add.s32 %r4675, %r4674, %r4673;
mul.lo.s32 %r4676, %r4675, %r2605;
mul.lo.s32 %r4677, %r4675, 9;
sub.s32 %r4678, %r4640, %r4677;
mul.lo.s32 %r4679, %r4678, %r2606;
add.s32 %r4680, %r13, 1;
mul.lo.s32 %r4681, %r4680, %r2603;
mad.lo.s32 %r4682, %r2604, %r4637, %r2602;
add.s32 %r4683, %r4682, %r4681;
add.s32 %r4684, %r4683, %r4676;
add.s32 %r4685, %r4684, %r4679;
mul.wide.s32 %rd1386, %r4685, 4;
add.s64 %rd1387, %rd2, %rd1386;
ld.global.f32 %f847, [%rd1387];
mul.hi.s32 %r4686, %r4654, 954437177;
shr.u32 %r4687, %r4686, 31;
shr.s32 %r4688, %r4686, 1;
add.s32 %r4689, %r4688, %r4687;
mul.lo.s32 %r4690, %r4689, %r2605;
mul.lo.s32 %r4691, %r4689, 9;
sub.s32 %r4692, %r4654, %r4691;
mul.lo.s32 %r4693, %r4692, %r2606;
add.s32 %r4694, %r4683, %r4690;
add.s32 %r4695, %r4694, %r4693;
mul.wide.s32 %rd1388, %r4695, 4;
add.s64 %rd1389, %rd2, %rd1388;
ld.global.f32 %f848, [%rd1389];
add.s32 %r4696, %r4683, %r2603;
add.s32 %r4697, %r4696, %r4676;
add.s32 %r4698, %r4697, %r4679;
mul.wide.s32 %rd1390, %r4698, 4;
add.s64 %rd1391, %rd2, %rd1390;
ld.global.f32 %f849, [%rd1391];
add.s32 %r4699, %r4696, %r4690;
add.s32 %r4700, %r4699, %r4693;
mul.wide.s32 %rd1392, %r4700, 4;
add.s64 %rd1393, %rd2, %rd1392;
ld.global.f32 %f850, [%rd1393];
add.s32 %r4701, %r4682, %r2602;
mad.lo.s32 %r4702, %r13, %r2603, %r4701;
add.s32 %r4703, %r4702, %r4676;
add.s32 %r4704, %r4703, %r4679;
mul.wide.s32 %rd1394, %r4704, 4;
add.s64 %rd1395, %rd2, %rd1394;
ld.global.f32 %f851, [%rd1395];
add.s32 %r4705, %r4702, %r4690;
add.s32 %r4706, %r4705, %r4693;
mul.wide.s32 %rd1396, %r4706, 4;
add.s64 %rd1397, %rd2, %rd1396;
ld.global.f32 %f852, [%rd1397];
add.s32 %r4707, %r4701, %r4681;
add.s32 %r4708, %r4707, %r4676;
add.s32 %r4709, %r4708, %r4679;
mul.wide.s32 %rd1398, %r4709, 4;
add.s64 %rd1399, %rd2, %rd1398;
ld.global.f32 %f853, [%rd1399];
add.s32 %r4710, %r4707, %r4690;
add.s32 %r4711, %r4710, %r4693;
mul.wide.s32 %rd1400, %r4711, 4;
add.s64 %rd1401, %rd2, %rd1400;
ld.global.f32 %f854, [%rd1401];
mul.f32 %f3299, %f847, 0f3F22F983;
cvt.rni.s32.f32 %r8421, %f3299;
cvt.rn.f32.s32 %f3300, %r8421;
mov.f32 %f3301, 0fBFC90FDA;
fma.rn.f32 %f3302, %f3300, %f3301, %f847;
mov.f32 %f3303, 0fB3A22168;
fma.rn.f32 %f3304, %f3300, %f3303, %f3302;
mov.f32 %f3305, 0fA7C234C5;
fma.rn.f32 %f5468, %f3300, %f3305, %f3304;
abs.f32 %f856, %f847;
setp.ltu.f32 %p658, %f856, 0f47CE4780;
@%p658 bra $L__BB0_766;
setp.eq.f32 %p659, %f856, 0f7F800000;
@%p659 bra $L__BB0_765;
bra.uni $L__BB0_760;
$L__BB0_765:
mov.f32 %f3308, 0f00000000;
mul.rn.f32 %f5468, %f847, %f3308;
mov.u32 %r8421, 0;
bra.uni $L__BB0_766;
$L__BB0_478:
mov.u32 %r643, %ctaid.x;
mul.lo.s32 %r644, %r2614, %r643;
add.s32 %r3912, %r13, -15;
mov.u32 %r3913, -4;
sub.s32 %r645, %r3913, %r12;
setp.ge.s32 %p426, %r3912, %r645;
add.s32 %r3914, %r13, 4;
mad.lo.s32 %r646, %r3914, %r2613, %r644;
@%p426 bra $L__BB0_481;
shl.b32 %r647, %r12, 5;
neg.s32 %r3915, %r647;
setp.ge.s32 %p427, %r14, %r3915;
@%p427 bra $L__BB0_481;
add.s32 %r3916, %r647, %r1;
mul.hi.s32 %r3917, %r3916, -1840700269;
add.s32 %r3918, %r3917, %r3916;
shr.u32 %r3919, %r3918, 31;
shr.s32 %r3920, %r3918, 2;
add.s32 %r3921, %r3920, %r3919;
mul.lo.s32 %r3922, %r3921, 7;
sub.s32 %r3923, %r3916, %r3922;
mad.lo.s32 %r3924, %r3921, %r2615, %r646;
mad.lo.s32 %r3925, %r3923, %r2616, %r3924;
mul.wide.s32 %rd1117, %r3925, 4;
add.s64 %rd1118, %rd3, %rd1117;
ld.global.f32 %f5531, [%rd1118];
$L__BB0_481:
@%p426 bra $L__BB0_484;
shl.b32 %r648, %r12, 5;
mov.u32 %r3927, -32;
sub.s32 %r3928, %r3927, %r648;
setp.ge.s32 %p429, %r14, %r3928;
@%p429 bra $L__BB0_484;
add.s32 %r3929, %r648, %r1;
add.s32 %r3930, %r3929, 32;
mul.hi.s32 %r3931, %r3930, -1840700269;
add.s32 %r3932, %r3931, %r3930;
shr.u32 %r3933, %r3932, 31;
shr.s32 %r3934, %r3932, 2;
add.s32 %r3935, %r3934, %r3933;
mul.lo.s32 %r3936, %r3935, 7;
sub.s32 %r3937, %r3930, %r3936;
mad.lo.s32 %r3938, %r3935, %r2615, %r646;
mad.lo.s32 %r3939, %r3937, %r2616, %r3938;
mul.wide.s32 %rd1119, %r3939, 4;
add.s64 %rd1120, %rd3, %rd1119;
ld.global.f32 %f5339, [%rd1120];
$L__BB0_484:
mov.u32 %r3941, -5;
sub.s32 %r649, %r3941, %r12;
setp.ge.s32 %p430, %r3912, %r649;
add.s32 %r3942, %r2612, %r644;
mad.lo.s32 %r650, %r13, %r2613, %r3942;
@%p430 bra $L__BB0_487;
shl.b32 %r651, %r12, 5;
neg.s32 %r3943, %r651;
setp.ge.s32 %p431, %r14, %r3943;
@%p431 bra $L__BB0_487;
add.s32 %r3944, %r651, %r1;
mul.hi.s32 %r3945, %r3944, -1840700269;
add.s32 %r3946, %r3945, %r3944;
shr.u32 %r3947, %r3946, 31;
shr.s32 %r3948, %r3946, 2;
add.s32 %r3949, %r3948, %r3947;
mul.lo.s32 %r3950, %r3949, 7;
sub.s32 %r3951, %r3944, %r3950;
mad.lo.s32 %r3952, %r3949, %r2615, %r650;
mad.lo.s32 %r3953, %r3951, %r2616, %r3952;
mul.wide.s32 %rd1121, %r3953, 4;
add.s64 %rd1122, %rd3, %rd1121;
ld.global.f32 %f5338, [%rd1122];
$L__BB0_487:
@%p430 bra $L__BB0_490;
shl.b32 %r652, %r12, 5;
mov.u32 %r3955, -32;
sub.s32 %r3956, %r3955, %r652;
setp.ge.s32 %p433, %r14, %r3956;
@%p433 bra $L__BB0_490;
add.s32 %r3957, %r652, %r1;
add.s32 %r3958, %r3957, 32;
mul.hi.s32 %r3959, %r3958, -1840700269;
add.s32 %r3960, %r3959, %r3958;
shr.u32 %r3961, %r3960, 31;
shr.s32 %r3962, %r3960, 2;
add.s32 %r3963, %r3962, %r3961;
mul.lo.s32 %r3964, %r3963, 7;
sub.s32 %r3965, %r3958, %r3964;
mad.lo.s32 %r3966, %r3963, %r2615, %r650;
mad.lo.s32 %r3967, %r3965, %r2616, %r3966;
mul.wide.s32 %rd1123, %r3967, 4;
add.s64 %rd1124, %rd3, %rd1123;
ld.global.f32 %f5337, [%rd1124];
$L__BB0_490:
mov.u32 %r3969, -6;
sub.s32 %r653, %r3969, %r12;
setp.ge.s32 %p434, %r3912, %r653;
add.s32 %r654, %r650, %r2613;
@%p434 bra $L__BB0_493;
shl.b32 %r655, %r12, 5;
neg.s32 %r3970, %r655;
setp.ge.s32 %p435, %r14, %r3970;
@%p435 bra $L__BB0_493;
add.s32 %r3971, %r655, %r1;
mul.hi.s32 %r3972, %r3971, -1840700269;
add.s32 %r3973, %r3972, %r3971;
shr.u32 %r3974, %r3973, 31;
shr.s32 %r3975, %r3973, 2;
add.s32 %r3976, %r3975, %r3974;
mul.lo.s32 %r3977, %r3976, 7;
sub.s32 %r3978, %r3971, %r3977;
mad.lo.s32 %r3979, %r3976, %r2615, %r654;
mad.lo.s32 %r3980, %r3978, %r2616, %r3979;
mul.wide.s32 %rd1125, %r3980, 4;
add.s64 %rd1126, %rd3, %rd1125;
ld.global.f32 %f5336, [%rd1126];
$L__BB0_493:
@%p434 bra $L__BB0_496;
shl.b32 %r656, %r12, 5;
mov.u32 %r3982, -32;
sub.s32 %r3983, %r3982, %r656;
setp.ge.s32 %p437, %r14, %r3983;
@%p437 bra $L__BB0_496;
add.s32 %r3984, %r656, %r1;
add.s32 %r3985, %r3984, 32;
mul.hi.s32 %r3986, %r3985, -1840700269;
add.s32 %r3987, %r3986, %r3985;
shr.u32 %r3988, %r3987, 31;
shr.s32 %r3989, %r3987, 2;
add.s32 %r3990, %r3989, %r3988;
mul.lo.s32 %r3991, %r3990, 7;
sub.s32 %r3992, %r3985, %r3991;
mad.lo.s32 %r3993, %r3990, %r2615, %r654;
mad.lo.s32 %r3994, %r3992, %r2616, %r3993;
mul.wide.s32 %rd1127, %r3994, 4;
add.s64 %rd1128, %rd3, %rd1127;
ld.global.f32 %f5335, [%rd1128];
$L__BB0_496:
mov.u32 %r3996, -7;
sub.s32 %r657, %r3996, %r12;
setp.ge.s32 %p438, %r3912, %r657;
add.s32 %r658, %r654, %r2613;
@%p438 bra $L__BB0_499;
shl.b32 %r659, %r12, 5;
neg.s32 %r3997, %r659;
setp.ge.s32 %p439, %r14, %r3997;
@%p439 bra $L__BB0_499;
add.s32 %r3998, %r659, %r1;
mul.hi.s32 %r3999, %r3998, -1840700269;
add.s32 %r4000, %r3999, %r3998;
shr.u32 %r4001, %r4000, 31;
shr.s32 %r4002, %r4000, 2;
add.s32 %r4003, %r4002, %r4001;
mul.lo.s32 %r4004, %r4003, 7;
sub.s32 %r4005, %r3998, %r4004;
mad.lo.s32 %r4006, %r4003, %r2615, %r658;
mad.lo.s32 %r4007, %r4005, %r2616, %r4006;
mul.wide.s32 %rd1129, %r4007, 4;
add.s64 %rd1130, %rd3, %rd1129;
ld.global.f32 %f5334, [%rd1130];
$L__BB0_499:
@%p438 bra $L__BB0_502;
shl.b32 %r660, %r12, 5;
mov.u32 %r4009, -32;
sub.s32 %r4010, %r4009, %r660;
setp.ge.s32 %p441, %r14, %r4010;
@%p441 bra $L__BB0_502;
add.s32 %r4011, %r660, %r1;
add.s32 %r4012, %r4011, 32;
mul.hi.s32 %r4013, %r4012, -1840700269;
add.s32 %r4014, %r4013, %r4012;
shr.u32 %r4015, %r4014, 31;
shr.s32 %r4016, %r4014, 2;
add.s32 %r4017, %r4016, %r4015;
mul.lo.s32 %r4018, %r4017, 7;
sub.s32 %r4019, %r4012, %r4018;
mad.lo.s32 %r4020, %r4017, %r2615, %r658;
mad.lo.s32 %r4021, %r4019, %r2616, %r4020;
mul.wide.s32 %rd1131, %r4021, 4;
add.s64 %rd1132, %rd3, %rd1131;
ld.global.f32 %f5333, [%rd1132];
$L__BB0_502:
mul.lo.s32 %r661, %r2604, %r643;
add.s32 %r4023, %r2602, %r661;
add.s32 %r4024, %r13, 1;
mul.lo.s32 %r662, %r4024, %r2603;
add.s32 %r663, %r4023, %r662;
@%p426 bra $L__BB0_505;
shl.b32 %r664, %r12, 5;
neg.s32 %r4025, %r664;
setp.ge.s32 %p443, %r14, %r4025;
@%p443 bra $L__BB0_505;
add.s32 %r4026, %r664, %r1;
mul.hi.s32 %r4027, %r4026, 954437177;
shr.u32 %r4028, %r4027, 31;
shr.s32 %r4029, %r4027, 1;
add.s32 %r4030, %r4029, %r4028;
mul.lo.s32 %r4031, %r4030, 9;
sub.s32 %r4032, %r4026, %r4031;
mad.lo.s32 %r4033, %r4030, %r2605, %r663;
mad.lo.s32 %r4034, %r4032, %r2606, %r4033;
mul.wide.s32 %rd1133, %r4034, 4;
add.s64 %rd1134, %rd2, %rd1133;
ld.global.f32 %f5348, [%rd1134];
$L__BB0_505:
@%p426 bra $L__BB0_508;
shl.b32 %r665, %r12, 5;
mov.u32 %r4036, -32;
sub.s32 %r4037, %r4036, %r665;
setp.ge.s32 %p445, %r14, %r4037;
@%p445 bra $L__BB0_508;
add.s32 %r4038, %r665, %r1;
add.s32 %r4039, %r4038, 32;
mul.hi.s32 %r4040, %r4039, 954437177;
shr.u32 %r4041, %r4040, 31;
shr.s32 %r4042, %r4040, 1;
add.s32 %r4043, %r4042, %r4041;
mul.lo.s32 %r4044, %r4043, 9;
sub.s32 %r4045, %r4039, %r4044;
mad.lo.s32 %r4046, %r4043, %r2605, %r663;
mad.lo.s32 %r4047, %r4045, %r2606, %r4046;
mul.wide.s32 %rd1135, %r4047, 4;
add.s64 %rd1136, %rd2, %rd1135;
ld.global.f32 %f5347, [%rd1136];
$L__BB0_508:
add.s32 %r666, %r663, %r2603;
@%p430 bra $L__BB0_511;
shl.b32 %r667, %r12, 5;
neg.s32 %r4049, %r667;
setp.ge.s32 %p447, %r14, %r4049;
@%p447 bra $L__BB0_511;
add.s32 %r4050, %r667, %r1;
mul.hi.s32 %r4051, %r4050, 954437177;
shr.u32 %r4052, %r4051, 31;
shr.s32 %r4053, %r4051, 1;
add.s32 %r4054, %r4053, %r4052;
mul.lo.s32 %r4055, %r4054, 9;
sub.s32 %r4056, %r4050, %r4055;
mad.lo.s32 %r4057, %r4054, %r2605, %r666;
mad.lo.s32 %r4058, %r4056, %r2606, %r4057;
mul.wide.s32 %rd1137, %r4058, 4;
add.s64 %rd1138, %rd2, %rd1137;
ld.global.f32 %f5346, [%rd1138];
$L__BB0_511:
@%p430 bra $L__BB0_514;
shl.b32 %r668, %r12, 5;
mov.u32 %r4060, -32;
sub.s32 %r4061, %r4060, %r668;
setp.ge.s32 %p449, %r14, %r4061;
@%p449 bra $L__BB0_514;
add.s32 %r4062, %r668, %r1;
add.s32 %r4063, %r4062, 32;
mul.hi.s32 %r4064, %r4063, 954437177;
shr.u32 %r4065, %r4064, 31;
shr.s32 %r4066, %r4064, 1;
add.s32 %r4067, %r4066, %r4065;
mul.lo.s32 %r4068, %r4067, 9;
sub.s32 %r4069, %r4063, %r4068;
mad.lo.s32 %r4070, %r4067, %r2605, %r666;
mad.lo.s32 %r4071, %r4069, %r2606, %r4070;
mul.wide.s32 %rd1139, %r4071, 4;
add.s64 %rd1140, %rd2, %rd1139;
ld.global.f32 %f5345, [%rd1140];
$L__BB0_514:
shl.b32 %r4073, %r2602, 1;
add.s32 %r669, %r4073, %r661;
mad.lo.s32 %r670, %r13, %r2603, %r669;
@%p434 bra $L__BB0_517;
shl.b32 %r671, %r12, 5;
neg.s32 %r4074, %r671;
setp.ge.s32 %p451, %r14, %r4074;
@%p451 bra $L__BB0_517;
add.s32 %r4075, %r671, %r1;
mul.hi.s32 %r4076, %r4075, 954437177;
shr.u32 %r4077, %r4076, 31;
shr.s32 %r4078, %r4076, 1;
add.s32 %r4079, %r4078, %r4077;
mul.lo.s32 %r4080, %r4079, 9;
sub.s32 %r4081, %r4075, %r4080;
mad.lo.s32 %r4082, %r4079, %r2605, %r670;
mad.lo.s32 %r4083, %r4081, %r2606, %r4082;
mul.wide.s32 %rd1141, %r4083, 4;
add.s64 %rd1142, %rd2, %rd1141;
ld.global.f32 %f5344, [%rd1142];
$L__BB0_517:
@%p434 bra $L__BB0_520;
shl.b32 %r672, %r12, 5;
mov.u32 %r4085, -32;
sub.s32 %r4086, %r4085, %r672;
setp.ge.s32 %p453, %r14, %r4086;
@%p453 bra $L__BB0_520;
add.s32 %r4087, %r672, %r1;
add.s32 %r4088, %r4087, 32;
mul.hi.s32 %r4089, %r4088, 954437177;
shr.u32 %r4090, %r4089, 31;
shr.s32 %r4091, %r4089, 1;
add.s32 %r4092, %r4091, %r4090;
mul.lo.s32 %r4093, %r4092, 9;
sub.s32 %r4094, %r4088, %r4093;
mad.lo.s32 %r4095, %r4092, %r2605, %r670;
mad.lo.s32 %r4096, %r4094, %r2606, %r4095;
mul.wide.s32 %rd1143, %r4096, 4;
add.s64 %rd1144, %rd2, %rd1143;
ld.global.f32 %f5343, [%rd1144];
$L__BB0_520:
add.s32 %r673, %r669, %r662;
@%p438 bra $L__BB0_523;
shl.b32 %r674, %r12, 5;
neg.s32 %r4098, %r674;
setp.ge.s32 %p455, %r14, %r4098;
@%p455 bra $L__BB0_523;
add.s32 %r4099, %r674, %r1;
mul.hi.s32 %r4100, %r4099, 954437177;
shr.u32 %r4101, %r4100, 31;
shr.s32 %r4102, %r4100, 1;
add.s32 %r4103, %r4102, %r4101;
mul.lo.s32 %r4104, %r4103, 9;
sub.s32 %r4105, %r4099, %r4104;
mad.lo.s32 %r4106, %r4103, %r2605, %r673;
mad.lo.s32 %r4107, %r4105, %r2606, %r4106;
mul.wide.s32 %rd1145, %r4107, 4;
add.s64 %rd1146, %rd2, %rd1145;
ld.global.f32 %f5342, [%rd1146];
$L__BB0_523:
@%p438 bra $L__BB0_526;
shl.b32 %r675, %r12, 5;
mov.u32 %r4109, -32;
sub.s32 %r4110, %r4109, %r675;
setp.ge.s32 %p457, %r14, %r4110;
@%p457 bra $L__BB0_526;
add.s32 %r4111, %r675, %r1;
add.s32 %r4112, %r4111, 32;
mul.hi.s32 %r4113, %r4112, 954437177;
shr.u32 %r4114, %r4113, 31;
shr.s32 %r4115, %r4113, 1;
add.s32 %r4116, %r4115, %r4114;
mul.lo.s32 %r4117, %r4116, 9;
sub.s32 %r4118, %r4112, %r4117;
mad.lo.s32 %r4119, %r4116, %r2605, %r673;
mad.lo.s32 %r4120, %r4118, %r2606, %r4119;
mul.wide.s32 %rd1147, %r4120, 4;
add.s64 %rd1148, %rd2, %rd1147;
ld.global.f32 %f5341, [%rd1148];
$L__BB0_526:
@%p426 bra $L__BB0_555;
shl.b32 %r4122, %r12, 5;
neg.s32 %r676, %r4122;
setp.ge.s32 %p459, %r14, %r676;
@%p459 bra $L__BB0_540;
mul.f32 %f2948, %f5348, 0f3F22F983;
cvt.rni.s32.f32 %r8357, %f2948;
cvt.rn.f32.s32 %f2949, %r8357;
mov.f32 %f2950, 0fBFC90FDA;
fma.rn.f32 %f2951, %f2949, %f2950, %f5348;
mov.f32 %f2952, 0fB3A22168;
fma.rn.f32 %f2953, %f2949, %f2952, %f2951;
mov.f32 %f2954, 0fA7C234C5;
fma.rn.f32 %f5369, %f2949, %f2954, %f2953;
abs.f32 %f573, %f5348;
setp.ltu.f32 %p460, %f573, 0f47CE4780;
@%p460 bra $L__BB0_536;
setp.eq.f32 %p461, %f573, 0f7F800000;
@%p461 bra $L__BB0_535;
bra.uni $L__BB0_530;
$L__BB0_535:
mov.f32 %f2957, 0f00000000;
mul.rn.f32 %f5369, %f5348, %f2957;
mov.u32 %r8357, 0;
bra.uni $L__BB0_536;
$L__BB0_760:
mov.b32 %r981, %f847;
shr.u32 %r4713, %r981, 23;
and.b32 %r4714, %r4713, 255;
add.s32 %r982, %r4714, -128;
shl.b32 %r4715, %r981, 8;
or.b32 %r983, %r4715, -2147483648;
shr.u32 %r984, %r982, 5;
mov.u64 %rd2594, 0;
mov.u32 %r8418, 0;
mov.u64 %rd1405, __cudart_i2opi_f;
mov.u64 %rd2595, %rd2594;
$L__BB0_761:
.pragma "nounroll";
shl.b64 %rd1404, %rd2594, 2;
add.s64 %rd1406, %rd1405, %rd1404;
ld.global.nc.u32 %r4716, [%rd1406];
mad.wide.u32 %rd1407, %r4716, %r983, %rd2595;
shr.u64 %rd2595, %rd1407, 32;
add.s64 %rd1408, %rd1, %rd1404;
st.local.u32 [%rd1408], %rd1407;
add.s32 %r8418, %r8418, 1;
cvt.s64.s32 %rd2594, %r8418;
setp.ne.s32 %p660, %r8418, 6;
@%p660 bra $L__BB0_761;
st.local.u32 [%rd5], %rd2595;
mov.u32 %r4717, 4;
sub.s32 %r987, %r4717, %r984;
mov.u32 %r4718, 6;
sub.s32 %r4719, %r4718, %r984;
mul.wide.s32 %rd1409, %r4719, 4;
add.s64 %rd1410, %rd1, %rd1409;
ld.local.u32 %r8419, [%rd1410];
ld.local.u32 %r8420, [%rd1410+-4];
and.b32 %r990, %r982, 31;
setp.eq.s32 %p661, %r990, 0;
@%p661 bra $L__BB0_764;
mov.u32 %r4720, 32;
sub.s32 %r4721, %r4720, %r990;
shr.u32 %r4722, %r8420, %r4721;
shl.b32 %r4723, %r8419, %r990;
add.s32 %r8419, %r4722, %r4723;
mul.wide.s32 %rd1411, %r987, 4;
add.s64 %rd1412, %rd1, %rd1411;
ld.local.u32 %r4724, [%rd1412];
shr.u32 %r4725, %r4724, %r4721;
shl.b32 %r4726, %r8420, %r990;
add.s32 %r8420, %r4725, %r4726;
$L__BB0_764:
and.b32 %r4727, %r981, -2147483648;
shr.u32 %r4728, %r8420, 30;
shl.b32 %r4729, %r8419, 2;
or.b32 %r4730, %r4728, %r4729;
shr.u32 %r4731, %r4730, 31;
shr.u32 %r4732, %r8419, 30;
add.s32 %r4733, %r4731, %r4732;
neg.s32 %r4734, %r4733;
setp.eq.s32 %p662, %r4727, 0;
selp.b32 %r8421, %r4733, %r4734, %p662;
setp.ne.s32 %p663, %r4731, 0;
xor.b32 %r4735, %r4727, -2147483648;
selp.b32 %r4736, %r4735, %r4727, %p663;
selp.b32 %r4737, -1, 0, %p663;
xor.b32 %r4738, %r4730, %r4737;
shl.b32 %r4739, %r8420, 2;
xor.b32 %r4740, %r4739, %r4737;
cvt.u64.u32 %rd1413, %r4738;
cvt.u64.u32 %rd1414, %r4740;
bfi.b64 %rd1415, %rd1413, %rd1414, 32, 32;
cvt.rn.f64.s64 %fd97, %rd1415;
mul.f64 %fd98, %fd97, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3306, %fd98;
setp.eq.s32 %p664, %r4736, 0;
neg.f32 %f3307, %f3306;
selp.f32 %f5468, %f3306, %f3307, %p664;
$L__BB0_766:
and.b32 %r997, %r8421, 1;
setp.eq.s32 %p665, %r997, 0;
selp.f32 %f860, %f5468, 0f3F800000, %p665;
mul.rn.f32 %f861, %f5468, %f5468;
mov.f32 %f5469, 0fB94D4153;
@%p665 bra $L__BB0_768;
mov.f32 %f3310, 0fBAB607ED;
mov.f32 %f3311, 0f37CBAC00;
fma.rn.f32 %f5469, %f3311, %f861, %f3310;
$L__BB0_768:
selp.f32 %f3312, 0f3C0885E4, 0f3D2AAABB, %p665;
fma.rn.f32 %f3313, %f5469, %f861, %f3312;
selp.f32 %f3314, 0fBE2AAAA8, 0fBEFFFFFF, %p665;
fma.rn.f32 %f3315, %f3313, %f861, %f3314;
mov.f32 %f3316, 0f00000000;
fma.rn.f32 %f3317, %f861, %f860, %f3316;
fma.rn.f32 %f5470, %f3315, %f3317, %f860;
and.b32 %r4742, %r8421, 2;
setp.eq.s32 %p667, %r4742, 0;
@%p667 bra $L__BB0_770;
mov.f32 %f3319, 0fBF800000;
fma.rn.f32 %f5470, %f5470, %f3319, %f3316;
$L__BB0_770:
mul.f32 %f3320, %f839, 0f3F22F983;
cvt.rni.s32.f32 %r8425, %f3320;
cvt.rn.f32.s32 %f3321, %r8425;
mov.f32 %f3322, 0fBFC90FDA;
fma.rn.f32 %f3323, %f3321, %f3322, %f839;
mov.f32 %f3324, 0fB3A22168;
fma.rn.f32 %f3325, %f3321, %f3324, %f3323;
mov.f32 %f3326, 0fA7C234C5;
fma.rn.f32 %f5471, %f3321, %f3326, %f3325;
abs.f32 %f868, %f839;
setp.ltu.f32 %p668, %f868, 0f47CE4780;
@%p668 bra $L__BB0_778;
setp.eq.f32 %p669, %f868, 0f7F800000;
@%p669 bra $L__BB0_777;
bra.uni $L__BB0_772;
$L__BB0_777:
mov.f32 %f3329, 0f00000000;
mul.rn.f32 %f5471, %f839, %f3329;
mov.u32 %r8425, 0;
bra.uni $L__BB0_778;
$L__BB0_772:
mov.b32 %r999, %f839;
shr.u32 %r4744, %r999, 23;
and.b32 %r4745, %r4744, 255;
add.s32 %r1000, %r4745, -128;
shl.b32 %r4746, %r999, 8;
or.b32 %r1001, %r4746, -2147483648;
shr.u32 %r1002, %r1000, 5;
mov.u64 %rd2596, 0;
mov.u32 %r8422, 0;
mov.u64 %rd1419, __cudart_i2opi_f;
mov.u64 %rd2597, %rd2596;
$L__BB0_773:
.pragma "nounroll";
shl.b64 %rd1418, %rd2596, 2;
add.s64 %rd1420, %rd1419, %rd1418;
ld.global.nc.u32 %r4747, [%rd1420];
mad.wide.u32 %rd1421, %r4747, %r1001, %rd2597;
shr.u64 %rd2597, %rd1421, 32;
add.s64 %rd1422, %rd1, %rd1418;
st.local.u32 [%rd1422], %rd1421;
add.s32 %r8422, %r8422, 1;
cvt.s64.s32 %rd2596, %r8422;
setp.ne.s32 %p670, %r8422, 6;
@%p670 bra $L__BB0_773;
st.local.u32 [%rd5], %rd2597;
mov.u32 %r4748, 4;
sub.s32 %r1005, %r4748, %r1002;
mov.u32 %r4749, 6;
sub.s32 %r4750, %r4749, %r1002;
mul.wide.s32 %rd1423, %r4750, 4;
add.s64 %rd1424, %rd1, %rd1423;
ld.local.u32 %r8423, [%rd1424];
ld.local.u32 %r8424, [%rd1424+-4];
and.b32 %r1008, %r1000, 31;
setp.eq.s32 %p671, %r1008, 0;
@%p671 bra $L__BB0_776;
mov.u32 %r4751, 32;
sub.s32 %r4752, %r4751, %r1008;
shr.u32 %r4753, %r8424, %r4752;
shl.b32 %r4754, %r8423, %r1008;
add.s32 %r8423, %r4753, %r4754;
mul.wide.s32 %rd1425, %r1005, 4;
add.s64 %rd1426, %rd1, %rd1425;
ld.local.u32 %r4755, [%rd1426];
shr.u32 %r4756, %r4755, %r4752;
shl.b32 %r4757, %r8424, %r1008;
add.s32 %r8424, %r4756, %r4757;
$L__BB0_776:
and.b32 %r4758, %r999, -2147483648;
shr.u32 %r4759, %r8424, 30;
shl.b32 %r4760, %r8423, 2;
or.b32 %r4761, %r4759, %r4760;
shr.u32 %r4762, %r4761, 31;
shr.u32 %r4763, %r8423, 30;
add.s32 %r4764, %r4762, %r4763;
neg.s32 %r4765, %r4764;
setp.eq.s32 %p672, %r4758, 0;
selp.b32 %r8425, %r4764, %r4765, %p672;
setp.ne.s32 %p673, %r4762, 0;
xor.b32 %r4766, %r4758, -2147483648;
selp.b32 %r4767, %r4766, %r4758, %p673;
selp.b32 %r4768, -1, 0, %p673;
xor.b32 %r4769, %r4761, %r4768;
shl.b32 %r4770, %r8424, 2;
xor.b32 %r4771, %r4770, %r4768;
cvt.u64.u32 %rd1427, %r4769;
cvt.u64.u32 %rd1428, %r4771;
bfi.b64 %rd1429, %rd1427, %rd1428, 32, 32;
cvt.rn.f64.s64 %fd99, %rd1429;
mul.f64 %fd100, %fd99, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3327, %fd100;
setp.eq.s32 %p674, %r4767, 0;
neg.f32 %f3328, %f3327;
selp.f32 %f5471, %f3327, %f3328, %p674;
$L__BB0_778:
add.s32 %r1015, %r8425, 1;
and.b32 %r1016, %r1015, 1;
setp.eq.s32 %p675, %r1016, 0;
selp.f32 %f872, %f5471, 0f3F800000, %p675;
mul.rn.f32 %f873, %f5471, %f5471;
mov.f32 %f5472, 0fB94D4153;
@%p675 bra $L__BB0_780;
mov.f32 %f3331, 0fBAB607ED;
mov.f32 %f3332, 0f37CBAC00;
fma.rn.f32 %f5472, %f3332, %f873, %f3331;
$L__BB0_780:
selp.f32 %f3333, 0f3C0885E4, 0f3D2AAABB, %p675;
fma.rn.f32 %f3334, %f5472, %f873, %f3333;
selp.f32 %f3335, 0fBE2AAAA8, 0fBEFFFFFF, %p675;
fma.rn.f32 %f3336, %f3334, %f873, %f3335;
mov.f32 %f3337, 0f00000000;
fma.rn.f32 %f3338, %f873, %f872, %f3337;
fma.rn.f32 %f5473, %f3336, %f3338, %f872;
and.b32 %r4773, %r1015, 2;
setp.eq.s32 %p677, %r4773, 0;
@%p677 bra $L__BB0_782;
mov.f32 %f3340, 0fBF800000;
fma.rn.f32 %f5473, %f5473, %f3340, %f3337;
$L__BB0_782:
add.f32 %f5523, %f5470, %f5473;
mul.f32 %f3341, %f848, 0f3F22F983;
cvt.rni.s32.f32 %r8429, %f3341;
cvt.rn.f32.s32 %f3342, %r8429;
mov.f32 %f3343, 0fBFC90FDA;
fma.rn.f32 %f3344, %f3342, %f3343, %f848;
mov.f32 %f3345, 0fB3A22168;
fma.rn.f32 %f3346, %f3342, %f3345, %f3344;
mov.f32 %f3347, 0fA7C234C5;
fma.rn.f32 %f5474, %f3342, %f3347, %f3346;
abs.f32 %f881, %f848;
setp.ltu.f32 %p678, %f881, 0f47CE4780;
@%p678 bra $L__BB0_790;
setp.eq.f32 %p679, %f881, 0f7F800000;
@%p679 bra $L__BB0_789;
bra.uni $L__BB0_784;
$L__BB0_789:
mov.f32 %f3350, 0f00000000;
mul.rn.f32 %f5474, %f848, %f3350;
mov.u32 %r8429, 0;
bra.uni $L__BB0_790;
$L__BB0_784:
mov.b32 %r1018, %f848;
shr.u32 %r4775, %r1018, 23;
and.b32 %r4776, %r4775, 255;
add.s32 %r1019, %r4776, -128;
shl.b32 %r4777, %r1018, 8;
or.b32 %r1020, %r4777, -2147483648;
shr.u32 %r1021, %r1019, 5;
mov.u64 %rd2598, 0;
mov.u32 %r8426, 0;
mov.u64 %rd1433, __cudart_i2opi_f;
mov.u64 %rd2599, %rd2598;
$L__BB0_785:
.pragma "nounroll";
shl.b64 %rd1432, %rd2598, 2;
add.s64 %rd1434, %rd1433, %rd1432;
ld.global.nc.u32 %r4778, [%rd1434];
mad.wide.u32 %rd1435, %r4778, %r1020, %rd2599;
shr.u64 %rd2599, %rd1435, 32;
add.s64 %rd1436, %rd1, %rd1432;
st.local.u32 [%rd1436], %rd1435;
add.s32 %r8426, %r8426, 1;
cvt.s64.s32 %rd2598, %r8426;
setp.ne.s32 %p680, %r8426, 6;
@%p680 bra $L__BB0_785;
st.local.u32 [%rd5], %rd2599;
mov.u32 %r4779, 4;
sub.s32 %r1024, %r4779, %r1021;
mov.u32 %r4780, 6;
sub.s32 %r4781, %r4780, %r1021;
mul.wide.s32 %rd1437, %r4781, 4;
add.s64 %rd1438, %rd1, %rd1437;
ld.local.u32 %r8427, [%rd1438];
ld.local.u32 %r8428, [%rd1438+-4];
and.b32 %r1027, %r1019, 31;
setp.eq.s32 %p681, %r1027, 0;
@%p681 bra $L__BB0_788;
mov.u32 %r4782, 32;
sub.s32 %r4783, %r4782, %r1027;
shr.u32 %r4784, %r8428, %r4783;
shl.b32 %r4785, %r8427, %r1027;
add.s32 %r8427, %r4784, %r4785;
mul.wide.s32 %rd1439, %r1024, 4;
add.s64 %rd1440, %rd1, %rd1439;
ld.local.u32 %r4786, [%rd1440];
shr.u32 %r4787, %r4786, %r4783;
shl.b32 %r4788, %r8428, %r1027;
add.s32 %r8428, %r4787, %r4788;
$L__BB0_788:
and.b32 %r4789, %r1018, -2147483648;
shr.u32 %r4790, %r8428, 30;
shl.b32 %r4791, %r8427, 2;
or.b32 %r4792, %r4790, %r4791;
shr.u32 %r4793, %r4792, 31;
shr.u32 %r4794, %r8427, 30;
add.s32 %r4795, %r4793, %r4794;
neg.s32 %r4796, %r4795;
setp.eq.s32 %p682, %r4789, 0;
selp.b32 %r8429, %r4795, %r4796, %p682;
setp.ne.s32 %p683, %r4793, 0;
xor.b32 %r4797, %r4789, -2147483648;
selp.b32 %r4798, %r4797, %r4789, %p683;
selp.b32 %r4799, -1, 0, %p683;
xor.b32 %r4800, %r4792, %r4799;
shl.b32 %r4801, %r8428, 2;
xor.b32 %r4802, %r4801, %r4799;
cvt.u64.u32 %rd1441, %r4800;
cvt.u64.u32 %rd1442, %r4802;
bfi.b64 %rd1443, %rd1441, %rd1442, 32, 32;
cvt.rn.f64.s64 %fd101, %rd1443;
mul.f64 %fd102, %fd101, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3348, %fd102;
setp.eq.s32 %p684, %r4798, 0;
neg.f32 %f3349, %f3348;
selp.f32 %f5474, %f3348, %f3349, %p684;
$L__BB0_790:
and.b32 %r1034, %r8429, 1;
setp.eq.s32 %p685, %r1034, 0;
selp.f32 %f885, %f5474, 0f3F800000, %p685;
mul.rn.f32 %f886, %f5474, %f5474;
mov.f32 %f5475, 0fB94D4153;
@%p685 bra $L__BB0_792;
mov.f32 %f3352, 0fBAB607ED;
mov.f32 %f3353, 0f37CBAC00;
fma.rn.f32 %f5475, %f3353, %f886, %f3352;
$L__BB0_792:
selp.f32 %f3354, 0f3C0885E4, 0f3D2AAABB, %p685;
fma.rn.f32 %f3355, %f5475, %f886, %f3354;
selp.f32 %f3356, 0fBE2AAAA8, 0fBEFFFFFF, %p685;
fma.rn.f32 %f3357, %f3355, %f886, %f3356;
mov.f32 %f3358, 0f00000000;
fma.rn.f32 %f3359, %f886, %f885, %f3358;
fma.rn.f32 %f5476, %f3357, %f3359, %f885;
and.b32 %r4804, %r8429, 2;
setp.eq.s32 %p687, %r4804, 0;
@%p687 bra $L__BB0_794;
mov.f32 %f3361, 0fBF800000;
fma.rn.f32 %f5476, %f5476, %f3361, %f3358;
$L__BB0_794:
mul.f32 %f3362, %f840, 0f3F22F983;
cvt.rni.s32.f32 %r8433, %f3362;
cvt.rn.f32.s32 %f3363, %r8433;
mov.f32 %f3364, 0fBFC90FDA;
fma.rn.f32 %f3365, %f3363, %f3364, %f840;
mov.f32 %f3366, 0fB3A22168;
fma.rn.f32 %f3367, %f3363, %f3366, %f3365;
mov.f32 %f3368, 0fA7C234C5;
fma.rn.f32 %f5477, %f3363, %f3368, %f3367;
abs.f32 %f893, %f840;
setp.ltu.f32 %p688, %f893, 0f47CE4780;
@%p688 bra $L__BB0_802;
setp.eq.f32 %p689, %f893, 0f7F800000;
@%p689 bra $L__BB0_801;
bra.uni $L__BB0_796;
$L__BB0_801:
mov.f32 %f3371, 0f00000000;
mul.rn.f32 %f5477, %f840, %f3371;
mov.u32 %r8433, 0;
bra.uni $L__BB0_802;
$L__BB0_796:
mov.b32 %r1036, %f840;
shr.u32 %r4806, %r1036, 23;
and.b32 %r4807, %r4806, 255;
add.s32 %r1037, %r4807, -128;
shl.b32 %r4808, %r1036, 8;
or.b32 %r1038, %r4808, -2147483648;
shr.u32 %r1039, %r1037, 5;
mov.u64 %rd2600, 0;
mov.u32 %r8430, 0;
mov.u64 %rd1447, __cudart_i2opi_f;
mov.u64 %rd2601, %rd2600;
$L__BB0_797:
.pragma "nounroll";
shl.b64 %rd1446, %rd2600, 2;
add.s64 %rd1448, %rd1447, %rd1446;
ld.global.nc.u32 %r4809, [%rd1448];
mad.wide.u32 %rd1449, %r4809, %r1038, %rd2601;
shr.u64 %rd2601, %rd1449, 32;
add.s64 %rd1450, %rd1, %rd1446;
st.local.u32 [%rd1450], %rd1449;
add.s32 %r8430, %r8430, 1;
cvt.s64.s32 %rd2600, %r8430;
setp.ne.s32 %p690, %r8430, 6;
@%p690 bra $L__BB0_797;
st.local.u32 [%rd5], %rd2601;
mov.u32 %r4810, 4;
sub.s32 %r1042, %r4810, %r1039;
mov.u32 %r4811, 6;
sub.s32 %r4812, %r4811, %r1039;
mul.wide.s32 %rd1451, %r4812, 4;
add.s64 %rd1452, %rd1, %rd1451;
ld.local.u32 %r8431, [%rd1452];
ld.local.u32 %r8432, [%rd1452+-4];
and.b32 %r1045, %r1037, 31;
setp.eq.s32 %p691, %r1045, 0;
@%p691 bra $L__BB0_800;
mov.u32 %r4813, 32;
sub.s32 %r4814, %r4813, %r1045;
shr.u32 %r4815, %r8432, %r4814;
shl.b32 %r4816, %r8431, %r1045;
add.s32 %r8431, %r4815, %r4816;
mul.wide.s32 %rd1453, %r1042, 4;
add.s64 %rd1454, %rd1, %rd1453;
ld.local.u32 %r4817, [%rd1454];
shr.u32 %r4818, %r4817, %r4814;
shl.b32 %r4819, %r8432, %r1045;
add.s32 %r8432, %r4818, %r4819;
$L__BB0_800:
and.b32 %r4820, %r1036, -2147483648;
shr.u32 %r4821, %r8432, 30;
shl.b32 %r4822, %r8431, 2;
or.b32 %r4823, %r4821, %r4822;
shr.u32 %r4824, %r4823, 31;
shr.u32 %r4825, %r8431, 30;
add.s32 %r4826, %r4824, %r4825;
neg.s32 %r4827, %r4826;
setp.eq.s32 %p692, %r4820, 0;
selp.b32 %r8433, %r4826, %r4827, %p692;
setp.ne.s32 %p693, %r4824, 0;
xor.b32 %r4828, %r4820, -2147483648;
selp.b32 %r4829, %r4828, %r4820, %p693;
selp.b32 %r4830, -1, 0, %p693;
xor.b32 %r4831, %r4823, %r4830;
shl.b32 %r4832, %r8432, 2;
xor.b32 %r4833, %r4832, %r4830;
cvt.u64.u32 %rd1455, %r4831;
cvt.u64.u32 %rd1456, %r4833;
bfi.b64 %rd1457, %rd1455, %rd1456, 32, 32;
cvt.rn.f64.s64 %fd103, %rd1457;
mul.f64 %fd104, %fd103, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3369, %fd104;
setp.eq.s32 %p694, %r4829, 0;
neg.f32 %f3370, %f3369;
selp.f32 %f5477, %f3369, %f3370, %p694;
$L__BB0_802:
add.s32 %r1052, %r8433, 1;
and.b32 %r1053, %r1052, 1;
setp.eq.s32 %p695, %r1053, 0;
selp.f32 %f897, %f5477, 0f3F800000, %p695;
mul.rn.f32 %f898, %f5477, %f5477;
mov.f32 %f5478, 0fB94D4153;
@%p695 bra $L__BB0_804;
mov.f32 %f3373, 0fBAB607ED;
mov.f32 %f3374, 0f37CBAC00;
fma.rn.f32 %f5478, %f3374, %f898, %f3373;
$L__BB0_804:
selp.f32 %f3375, 0f3C0885E4, 0f3D2AAABB, %p695;
fma.rn.f32 %f3376, %f5478, %f898, %f3375;
selp.f32 %f3377, 0fBE2AAAA8, 0fBEFFFFFF, %p695;
fma.rn.f32 %f3378, %f3376, %f898, %f3377;
mov.f32 %f3379, 0f00000000;
fma.rn.f32 %f3380, %f898, %f897, %f3379;
fma.rn.f32 %f5479, %f3378, %f3380, %f897;
and.b32 %r4835, %r1052, 2;
setp.eq.s32 %p697, %r4835, 0;
@%p697 bra $L__BB0_806;
mov.f32 %f3382, 0fBF800000;
fma.rn.f32 %f5479, %f5479, %f3382, %f3379;
$L__BB0_806:
add.f32 %f5522, %f5476, %f5479;
mul.f32 %f3383, %f849, 0f3F22F983;
cvt.rni.s32.f32 %r8437, %f3383;
cvt.rn.f32.s32 %f3384, %r8437;
mov.f32 %f3385, 0fBFC90FDA;
fma.rn.f32 %f3386, %f3384, %f3385, %f849;
mov.f32 %f3387, 0fB3A22168;
fma.rn.f32 %f3388, %f3384, %f3387, %f3386;
mov.f32 %f3389, 0fA7C234C5;
fma.rn.f32 %f5480, %f3384, %f3389, %f3388;
abs.f32 %f906, %f849;
setp.ltu.f32 %p698, %f906, 0f47CE4780;
@%p698 bra $L__BB0_814;
setp.eq.f32 %p699, %f906, 0f7F800000;
@%p699 bra $L__BB0_813;
bra.uni $L__BB0_808;
$L__BB0_813:
mov.f32 %f3392, 0f00000000;
mul.rn.f32 %f5480, %f849, %f3392;
mov.u32 %r8437, 0;
bra.uni $L__BB0_814;
$L__BB0_808:
mov.b32 %r1055, %f849;
shr.u32 %r4837, %r1055, 23;
and.b32 %r4838, %r4837, 255;
add.s32 %r1056, %r4838, -128;
shl.b32 %r4839, %r1055, 8;
or.b32 %r1057, %r4839, -2147483648;
shr.u32 %r1058, %r1056, 5;
mov.u64 %rd2602, 0;
mov.u32 %r8434, 0;
mov.u64 %rd1461, __cudart_i2opi_f;
mov.u64 %rd2603, %rd2602;
$L__BB0_809:
.pragma "nounroll";
shl.b64 %rd1460, %rd2602, 2;
add.s64 %rd1462, %rd1461, %rd1460;
ld.global.nc.u32 %r4840, [%rd1462];
mad.wide.u32 %rd1463, %r4840, %r1057, %rd2603;
shr.u64 %rd2603, %rd1463, 32;
add.s64 %rd1464, %rd1, %rd1460;
st.local.u32 [%rd1464], %rd1463;
add.s32 %r8434, %r8434, 1;
cvt.s64.s32 %rd2602, %r8434;
setp.ne.s32 %p700, %r8434, 6;
@%p700 bra $L__BB0_809;
st.local.u32 [%rd5], %rd2603;
mov.u32 %r4841, 4;
sub.s32 %r1061, %r4841, %r1058;
mov.u32 %r4842, 6;
sub.s32 %r4843, %r4842, %r1058;
mul.wide.s32 %rd1465, %r4843, 4;
add.s64 %rd1466, %rd1, %rd1465;
ld.local.u32 %r8435, [%rd1466];
ld.local.u32 %r8436, [%rd1466+-4];
and.b32 %r1064, %r1056, 31;
setp.eq.s32 %p701, %r1064, 0;
@%p701 bra $L__BB0_812;
mov.u32 %r4844, 32;
sub.s32 %r4845, %r4844, %r1064;
shr.u32 %r4846, %r8436, %r4845;
shl.b32 %r4847, %r8435, %r1064;
add.s32 %r8435, %r4846, %r4847;
mul.wide.s32 %rd1467, %r1061, 4;
add.s64 %rd1468, %rd1, %rd1467;
ld.local.u32 %r4848, [%rd1468];
shr.u32 %r4849, %r4848, %r4845;
shl.b32 %r4850, %r8436, %r1064;
add.s32 %r8436, %r4849, %r4850;
$L__BB0_812:
and.b32 %r4851, %r1055, -2147483648;
shr.u32 %r4852, %r8436, 30;
shl.b32 %r4853, %r8435, 2;
or.b32 %r4854, %r4852, %r4853;
shr.u32 %r4855, %r4854, 31;
shr.u32 %r4856, %r8435, 30;
add.s32 %r4857, %r4855, %r4856;
neg.s32 %r4858, %r4857;
setp.eq.s32 %p702, %r4851, 0;
selp.b32 %r8437, %r4857, %r4858, %p702;
setp.ne.s32 %p703, %r4855, 0;
xor.b32 %r4859, %r4851, -2147483648;
selp.b32 %r4860, %r4859, %r4851, %p703;
selp.b32 %r4861, -1, 0, %p703;
xor.b32 %r4862, %r4854, %r4861;
shl.b32 %r4863, %r8436, 2;
xor.b32 %r4864, %r4863, %r4861;
cvt.u64.u32 %rd1469, %r4862;
cvt.u64.u32 %rd1470, %r4864;
bfi.b64 %rd1471, %rd1469, %rd1470, 32, 32;
cvt.rn.f64.s64 %fd105, %rd1471;
mul.f64 %fd106, %fd105, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3390, %fd106;
setp.eq.s32 %p704, %r4860, 0;
neg.f32 %f3391, %f3390;
selp.f32 %f5480, %f3390, %f3391, %p704;
$L__BB0_814:
and.b32 %r1071, %r8437, 1;
setp.eq.s32 %p705, %r1071, 0;
selp.f32 %f910, %f5480, 0f3F800000, %p705;
mul.rn.f32 %f911, %f5480, %f5480;
mov.f32 %f5481, 0fB94D4153;
@%p705 bra $L__BB0_816;
mov.f32 %f3394, 0fBAB607ED;
mov.f32 %f3395, 0f37CBAC00;
fma.rn.f32 %f5481, %f3395, %f911, %f3394;
$L__BB0_816:
selp.f32 %f3396, 0f3C0885E4, 0f3D2AAABB, %p705;
fma.rn.f32 %f3397, %f5481, %f911, %f3396;
selp.f32 %f3398, 0fBE2AAAA8, 0fBEFFFFFF, %p705;
fma.rn.f32 %f3399, %f3397, %f911, %f3398;
mov.f32 %f3400, 0f00000000;
fma.rn.f32 %f3401, %f911, %f910, %f3400;
fma.rn.f32 %f5482, %f3399, %f3401, %f910;
and.b32 %r4866, %r8437, 2;
setp.eq.s32 %p707, %r4866, 0;
@%p707 bra $L__BB0_818;
mov.f32 %f3403, 0fBF800000;
fma.rn.f32 %f5482, %f5482, %f3403, %f3400;
$L__BB0_818:
mul.f32 %f3404, %f841, 0f3F22F983;
cvt.rni.s32.f32 %r8441, %f3404;
cvt.rn.f32.s32 %f3405, %r8441;
mov.f32 %f3406, 0fBFC90FDA;
fma.rn.f32 %f3407, %f3405, %f3406, %f841;
mov.f32 %f3408, 0fB3A22168;
fma.rn.f32 %f3409, %f3405, %f3408, %f3407;
mov.f32 %f3410, 0fA7C234C5;
fma.rn.f32 %f5483, %f3405, %f3410, %f3409;
abs.f32 %f918, %f841;
setp.ltu.f32 %p708, %f918, 0f47CE4780;
@%p708 bra $L__BB0_826;
setp.eq.f32 %p709, %f918, 0f7F800000;
@%p709 bra $L__BB0_825;
bra.uni $L__BB0_820;
$L__BB0_825:
mov.f32 %f3413, 0f00000000;
mul.rn.f32 %f5483, %f841, %f3413;
mov.u32 %r8441, 0;
bra.uni $L__BB0_826;
$L__BB0_820:
mov.b32 %r1073, %f841;
shr.u32 %r4868, %r1073, 23;
and.b32 %r4869, %r4868, 255;
add.s32 %r1074, %r4869, -128;
shl.b32 %r4870, %r1073, 8;
or.b32 %r1075, %r4870, -2147483648;
shr.u32 %r1076, %r1074, 5;
mov.u64 %rd2604, 0;
mov.u32 %r8438, 0;
mov.u64 %rd1475, __cudart_i2opi_f;
mov.u64 %rd2605, %rd2604;
$L__BB0_821:
.pragma "nounroll";
shl.b64 %rd1474, %rd2604, 2;
add.s64 %rd1476, %rd1475, %rd1474;
ld.global.nc.u32 %r4871, [%rd1476];
mad.wide.u32 %rd1477, %r4871, %r1075, %rd2605;
shr.u64 %rd2605, %rd1477, 32;
add.s64 %rd1478, %rd1, %rd1474;
st.local.u32 [%rd1478], %rd1477;
add.s32 %r8438, %r8438, 1;
cvt.s64.s32 %rd2604, %r8438;
setp.ne.s32 %p710, %r8438, 6;
@%p710 bra $L__BB0_821;
st.local.u32 [%rd5], %rd2605;
mov.u32 %r4872, 4;
sub.s32 %r1079, %r4872, %r1076;
mov.u32 %r4873, 6;
sub.s32 %r4874, %r4873, %r1076;
mul.wide.s32 %rd1479, %r4874, 4;
add.s64 %rd1480, %rd1, %rd1479;
ld.local.u32 %r8439, [%rd1480];
ld.local.u32 %r8440, [%rd1480+-4];
and.b32 %r1082, %r1074, 31;
setp.eq.s32 %p711, %r1082, 0;
@%p711 bra $L__BB0_824;
mov.u32 %r4875, 32;
sub.s32 %r4876, %r4875, %r1082;
shr.u32 %r4877, %r8440, %r4876;
shl.b32 %r4878, %r8439, %r1082;
add.s32 %r8439, %r4877, %r4878;
mul.wide.s32 %rd1481, %r1079, 4;
add.s64 %rd1482, %rd1, %rd1481;
ld.local.u32 %r4879, [%rd1482];
shr.u32 %r4880, %r4879, %r4876;
shl.b32 %r4881, %r8440, %r1082;
add.s32 %r8440, %r4880, %r4881;
$L__BB0_824:
and.b32 %r4882, %r1073, -2147483648;
shr.u32 %r4883, %r8440, 30;
shl.b32 %r4884, %r8439, 2;
or.b32 %r4885, %r4883, %r4884;
shr.u32 %r4886, %r4885, 31;
shr.u32 %r4887, %r8439, 30;
add.s32 %r4888, %r4886, %r4887;
neg.s32 %r4889, %r4888;
setp.eq.s32 %p712, %r4882, 0;
selp.b32 %r8441, %r4888, %r4889, %p712;
setp.ne.s32 %p713, %r4886, 0;
xor.b32 %r4890, %r4882, -2147483648;
selp.b32 %r4891, %r4890, %r4882, %p713;
selp.b32 %r4892, -1, 0, %p713;
xor.b32 %r4893, %r4885, %r4892;
shl.b32 %r4894, %r8440, 2;
xor.b32 %r4895, %r4894, %r4892;
cvt.u64.u32 %rd1483, %r4893;
cvt.u64.u32 %rd1484, %r4895;
bfi.b64 %rd1485, %rd1483, %rd1484, 32, 32;
cvt.rn.f64.s64 %fd107, %rd1485;
mul.f64 %fd108, %fd107, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3411, %fd108;
setp.eq.s32 %p714, %r4891, 0;
neg.f32 %f3412, %f3411;
selp.f32 %f5483, %f3411, %f3412, %p714;
$L__BB0_826:
add.s32 %r1089, %r8441, 1;
and.b32 %r1090, %r1089, 1;
setp.eq.s32 %p715, %r1090, 0;
selp.f32 %f922, %f5483, 0f3F800000, %p715;
mul.rn.f32 %f923, %f5483, %f5483;
mov.f32 %f5484, 0fB94D4153;
@%p715 bra $L__BB0_828;
mov.f32 %f3415, 0fBAB607ED;
mov.f32 %f3416, 0f37CBAC00;
fma.rn.f32 %f5484, %f3416, %f923, %f3415;
$L__BB0_828:
selp.f32 %f3417, 0f3C0885E4, 0f3D2AAABB, %p715;
fma.rn.f32 %f3418, %f5484, %f923, %f3417;
selp.f32 %f3419, 0fBE2AAAA8, 0fBEFFFFFF, %p715;
fma.rn.f32 %f3420, %f3418, %f923, %f3419;
mov.f32 %f3421, 0f00000000;
fma.rn.f32 %f3422, %f923, %f922, %f3421;
fma.rn.f32 %f5485, %f3420, %f3422, %f922;
and.b32 %r4897, %r1089, 2;
setp.eq.s32 %p717, %r4897, 0;
@%p717 bra $L__BB0_830;
mov.f32 %f3424, 0fBF800000;
fma.rn.f32 %f5485, %f5485, %f3424, %f3421;
$L__BB0_830:
add.f32 %f5521, %f5482, %f5485;
mul.f32 %f3425, %f850, 0f3F22F983;
cvt.rni.s32.f32 %r8445, %f3425;
cvt.rn.f32.s32 %f3426, %r8445;
mov.f32 %f3427, 0fBFC90FDA;
fma.rn.f32 %f3428, %f3426, %f3427, %f850;
mov.f32 %f3429, 0fB3A22168;
fma.rn.f32 %f3430, %f3426, %f3429, %f3428;
mov.f32 %f3431, 0fA7C234C5;
fma.rn.f32 %f5486, %f3426, %f3431, %f3430;
abs.f32 %f931, %f850;
setp.ltu.f32 %p718, %f931, 0f47CE4780;
@%p718 bra $L__BB0_838;
setp.eq.f32 %p719, %f931, 0f7F800000;
@%p719 bra $L__BB0_837;
bra.uni $L__BB0_832;
$L__BB0_837:
mov.f32 %f3434, 0f00000000;
mul.rn.f32 %f5486, %f850, %f3434;
mov.u32 %r8445, 0;
bra.uni $L__BB0_838;
$L__BB0_832:
mov.b32 %r1092, %f850;
shr.u32 %r4899, %r1092, 23;
and.b32 %r4900, %r4899, 255;
add.s32 %r1093, %r4900, -128;
shl.b32 %r4901, %r1092, 8;
or.b32 %r1094, %r4901, -2147483648;
shr.u32 %r1095, %r1093, 5;
mov.u64 %rd2606, 0;
mov.u32 %r8442, 0;
mov.u64 %rd1489, __cudart_i2opi_f;
mov.u64 %rd2607, %rd2606;
$L__BB0_833:
.pragma "nounroll";
shl.b64 %rd1488, %rd2606, 2;
add.s64 %rd1490, %rd1489, %rd1488;
ld.global.nc.u32 %r4902, [%rd1490];
mad.wide.u32 %rd1491, %r4902, %r1094, %rd2607;
shr.u64 %rd2607, %rd1491, 32;
add.s64 %rd1492, %rd1, %rd1488;
st.local.u32 [%rd1492], %rd1491;
add.s32 %r8442, %r8442, 1;
cvt.s64.s32 %rd2606, %r8442;
setp.ne.s32 %p720, %r8442, 6;
@%p720 bra $L__BB0_833;
st.local.u32 [%rd5], %rd2607;
mov.u32 %r4903, 4;
sub.s32 %r1098, %r4903, %r1095;
mov.u32 %r4904, 6;
sub.s32 %r4905, %r4904, %r1095;
mul.wide.s32 %rd1493, %r4905, 4;
add.s64 %rd1494, %rd1, %rd1493;
ld.local.u32 %r8443, [%rd1494];
ld.local.u32 %r8444, [%rd1494+-4];
and.b32 %r1101, %r1093, 31;
setp.eq.s32 %p721, %r1101, 0;
@%p721 bra $L__BB0_836;
mov.u32 %r4906, 32;
sub.s32 %r4907, %r4906, %r1101;
shr.u32 %r4908, %r8444, %r4907;
shl.b32 %r4909, %r8443, %r1101;
add.s32 %r8443, %r4908, %r4909;
mul.wide.s32 %rd1495, %r1098, 4;
add.s64 %rd1496, %rd1, %rd1495;
ld.local.u32 %r4910, [%rd1496];
shr.u32 %r4911, %r4910, %r4907;
shl.b32 %r4912, %r8444, %r1101;
add.s32 %r8444, %r4911, %r4912;
$L__BB0_836:
and.b32 %r4913, %r1092, -2147483648;
shr.u32 %r4914, %r8444, 30;
shl.b32 %r4915, %r8443, 2;
or.b32 %r4916, %r4914, %r4915;
shr.u32 %r4917, %r4916, 31;
shr.u32 %r4918, %r8443, 30;
add.s32 %r4919, %r4917, %r4918;
neg.s32 %r4920, %r4919;
setp.eq.s32 %p722, %r4913, 0;
selp.b32 %r8445, %r4919, %r4920, %p722;
setp.ne.s32 %p723, %r4917, 0;
xor.b32 %r4921, %r4913, -2147483648;
selp.b32 %r4922, %r4921, %r4913, %p723;
selp.b32 %r4923, -1, 0, %p723;
xor.b32 %r4924, %r4916, %r4923;
shl.b32 %r4925, %r8444, 2;
xor.b32 %r4926, %r4925, %r4923;
cvt.u64.u32 %rd1497, %r4924;
cvt.u64.u32 %rd1498, %r4926;
bfi.b64 %rd1499, %rd1497, %rd1498, 32, 32;
cvt.rn.f64.s64 %fd109, %rd1499;
mul.f64 %fd110, %fd109, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3432, %fd110;
setp.eq.s32 %p724, %r4922, 0;
neg.f32 %f3433, %f3432;
selp.f32 %f5486, %f3432, %f3433, %p724;
$L__BB0_838:
and.b32 %r1108, %r8445, 1;
setp.eq.s32 %p725, %r1108, 0;
selp.f32 %f935, %f5486, 0f3F800000, %p725;
mul.rn.f32 %f936, %f5486, %f5486;
mov.f32 %f5487, 0fB94D4153;
@%p725 bra $L__BB0_840;
mov.f32 %f3436, 0fBAB607ED;
mov.f32 %f3437, 0f37CBAC00;
fma.rn.f32 %f5487, %f3437, %f936, %f3436;
$L__BB0_840:
selp.f32 %f3438, 0f3C0885E4, 0f3D2AAABB, %p725;
fma.rn.f32 %f3439, %f5487, %f936, %f3438;
selp.f32 %f3440, 0fBE2AAAA8, 0fBEFFFFFF, %p725;
fma.rn.f32 %f3441, %f3439, %f936, %f3440;
mov.f32 %f3442, 0f00000000;
fma.rn.f32 %f3443, %f936, %f935, %f3442;
fma.rn.f32 %f5488, %f3441, %f3443, %f935;
and.b32 %r4928, %r8445, 2;
setp.eq.s32 %p727, %r4928, 0;
@%p727 bra $L__BB0_842;
mov.f32 %f3445, 0fBF800000;
fma.rn.f32 %f5488, %f5488, %f3445, %f3442;
$L__BB0_842:
mul.f32 %f3446, %f842, 0f3F22F983;
cvt.rni.s32.f32 %r8449, %f3446;
cvt.rn.f32.s32 %f3447, %r8449;
mov.f32 %f3448, 0fBFC90FDA;
fma.rn.f32 %f3449, %f3447, %f3448, %f842;
mov.f32 %f3450, 0fB3A22168;
fma.rn.f32 %f3451, %f3447, %f3450, %f3449;
mov.f32 %f3452, 0fA7C234C5;
fma.rn.f32 %f5489, %f3447, %f3452, %f3451;
abs.f32 %f943, %f842;
setp.ltu.f32 %p728, %f943, 0f47CE4780;
@%p728 bra $L__BB0_850;
setp.eq.f32 %p729, %f943, 0f7F800000;
@%p729 bra $L__BB0_849;
bra.uni $L__BB0_844;
$L__BB0_849:
mov.f32 %f3455, 0f00000000;
mul.rn.f32 %f5489, %f842, %f3455;
mov.u32 %r8449, 0;
bra.uni $L__BB0_850;
$L__BB0_844:
mov.b32 %r1110, %f842;
shr.u32 %r4930, %r1110, 23;
and.b32 %r4931, %r4930, 255;
add.s32 %r1111, %r4931, -128;
shl.b32 %r4932, %r1110, 8;
or.b32 %r1112, %r4932, -2147483648;
shr.u32 %r1113, %r1111, 5;
mov.u64 %rd2608, 0;
mov.u32 %r8446, 0;
mov.u64 %rd1503, __cudart_i2opi_f;
mov.u64 %rd2609, %rd2608;
$L__BB0_845:
.pragma "nounroll";
shl.b64 %rd1502, %rd2608, 2;
add.s64 %rd1504, %rd1503, %rd1502;
ld.global.nc.u32 %r4933, [%rd1504];
mad.wide.u32 %rd1505, %r4933, %r1112, %rd2609;
shr.u64 %rd2609, %rd1505, 32;
add.s64 %rd1506, %rd1, %rd1502;
st.local.u32 [%rd1506], %rd1505;
add.s32 %r8446, %r8446, 1;
cvt.s64.s32 %rd2608, %r8446;
setp.ne.s32 %p730, %r8446, 6;
@%p730 bra $L__BB0_845;
st.local.u32 [%rd5], %rd2609;
mov.u32 %r4934, 4;
sub.s32 %r1116, %r4934, %r1113;
mov.u32 %r4935, 6;
sub.s32 %r4936, %r4935, %r1113;
mul.wide.s32 %rd1507, %r4936, 4;
add.s64 %rd1508, %rd1, %rd1507;
ld.local.u32 %r8447, [%rd1508];
ld.local.u32 %r8448, [%rd1508+-4];
and.b32 %r1119, %r1111, 31;
setp.eq.s32 %p731, %r1119, 0;
@%p731 bra $L__BB0_848;
mov.u32 %r4937, 32;
sub.s32 %r4938, %r4937, %r1119;
shr.u32 %r4939, %r8448, %r4938;
shl.b32 %r4940, %r8447, %r1119;
add.s32 %r8447, %r4939, %r4940;
mul.wide.s32 %rd1509, %r1116, 4;
add.s64 %rd1510, %rd1, %rd1509;
ld.local.u32 %r4941, [%rd1510];
shr.u32 %r4942, %r4941, %r4938;
shl.b32 %r4943, %r8448, %r1119;
add.s32 %r8448, %r4942, %r4943;
$L__BB0_848:
and.b32 %r4944, %r1110, -2147483648;
shr.u32 %r4945, %r8448, 30;
shl.b32 %r4946, %r8447, 2;
or.b32 %r4947, %r4945, %r4946;
shr.u32 %r4948, %r4947, 31;
shr.u32 %r4949, %r8447, 30;
add.s32 %r4950, %r4948, %r4949;
neg.s32 %r4951, %r4950;
setp.eq.s32 %p732, %r4944, 0;
selp.b32 %r8449, %r4950, %r4951, %p732;
setp.ne.s32 %p733, %r4948, 0;
xor.b32 %r4952, %r4944, -2147483648;
selp.b32 %r4953, %r4952, %r4944, %p733;
selp.b32 %r4954, -1, 0, %p733;
xor.b32 %r4955, %r4947, %r4954;
shl.b32 %r4956, %r8448, 2;
xor.b32 %r4957, %r4956, %r4954;
cvt.u64.u32 %rd1511, %r4955;
cvt.u64.u32 %rd1512, %r4957;
bfi.b64 %rd1513, %rd1511, %rd1512, 32, 32;
cvt.rn.f64.s64 %fd111, %rd1513;
mul.f64 %fd112, %fd111, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3453, %fd112;
setp.eq.s32 %p734, %r4953, 0;
neg.f32 %f3454, %f3453;
selp.f32 %f5489, %f3453, %f3454, %p734;
$L__BB0_850:
add.s32 %r1126, %r8449, 1;
and.b32 %r1127, %r1126, 1;
setp.eq.s32 %p735, %r1127, 0;
selp.f32 %f947, %f5489, 0f3F800000, %p735;
mul.rn.f32 %f948, %f5489, %f5489;
mov.f32 %f5490, 0fB94D4153;
@%p735 bra $L__BB0_852;
mov.f32 %f3457, 0fBAB607ED;
mov.f32 %f3458, 0f37CBAC00;
fma.rn.f32 %f5490, %f3458, %f948, %f3457;
$L__BB0_852:
selp.f32 %f3459, 0f3C0885E4, 0f3D2AAABB, %p735;
fma.rn.f32 %f3460, %f5490, %f948, %f3459;
selp.f32 %f3461, 0fBE2AAAA8, 0fBEFFFFFF, %p735;
fma.rn.f32 %f3462, %f3460, %f948, %f3461;
mov.f32 %f3463, 0f00000000;
fma.rn.f32 %f3464, %f948, %f947, %f3463;
fma.rn.f32 %f5491, %f3462, %f3464, %f947;
and.b32 %r4959, %r1126, 2;
setp.eq.s32 %p737, %r4959, 0;
@%p737 bra $L__BB0_854;
mov.f32 %f3466, 0fBF800000;
fma.rn.f32 %f5491, %f5491, %f3466, %f3463;
$L__BB0_854:
add.f32 %f5520, %f5488, %f5491;
mul.f32 %f3467, %f851, 0f3F22F983;
cvt.rni.s32.f32 %r8453, %f3467;
cvt.rn.f32.s32 %f3468, %r8453;
mov.f32 %f3469, 0fBFC90FDA;
fma.rn.f32 %f3470, %f3468, %f3469, %f851;
mov.f32 %f3471, 0fB3A22168;
fma.rn.f32 %f3472, %f3468, %f3471, %f3470;
mov.f32 %f3473, 0fA7C234C5;
fma.rn.f32 %f5492, %f3468, %f3473, %f3472;
abs.f32 %f956, %f851;
setp.ltu.f32 %p738, %f956, 0f47CE4780;
@%p738 bra $L__BB0_862;
setp.eq.f32 %p739, %f956, 0f7F800000;
@%p739 bra $L__BB0_861;
bra.uni $L__BB0_856;
$L__BB0_861:
mov.f32 %f3476, 0f00000000;
mul.rn.f32 %f5492, %f851, %f3476;
mov.u32 %r8453, 0;
bra.uni $L__BB0_862;
$L__BB0_856:
mov.b32 %r1129, %f851;
shr.u32 %r4961, %r1129, 23;
and.b32 %r4962, %r4961, 255;
add.s32 %r1130, %r4962, -128;
shl.b32 %r4963, %r1129, 8;
or.b32 %r1131, %r4963, -2147483648;
shr.u32 %r1132, %r1130, 5;
mov.u64 %rd2610, 0;
mov.u32 %r8450, 0;
mov.u64 %rd1517, __cudart_i2opi_f;
mov.u64 %rd2611, %rd2610;
$L__BB0_857:
.pragma "nounroll";
shl.b64 %rd1516, %rd2610, 2;
add.s64 %rd1518, %rd1517, %rd1516;
ld.global.nc.u32 %r4964, [%rd1518];
mad.wide.u32 %rd1519, %r4964, %r1131, %rd2611;
shr.u64 %rd2611, %rd1519, 32;
add.s64 %rd1520, %rd1, %rd1516;
st.local.u32 [%rd1520], %rd1519;
add.s32 %r8450, %r8450, 1;
cvt.s64.s32 %rd2610, %r8450;
setp.ne.s32 %p740, %r8450, 6;
@%p740 bra $L__BB0_857;
st.local.u32 [%rd5], %rd2611;
mov.u32 %r4965, 4;
sub.s32 %r1135, %r4965, %r1132;
mov.u32 %r4966, 6;
sub.s32 %r4967, %r4966, %r1132;
mul.wide.s32 %rd1521, %r4967, 4;
add.s64 %rd1522, %rd1, %rd1521;
ld.local.u32 %r8451, [%rd1522];
ld.local.u32 %r8452, [%rd1522+-4];
and.b32 %r1138, %r1130, 31;
setp.eq.s32 %p741, %r1138, 0;
@%p741 bra $L__BB0_860;
mov.u32 %r4968, 32;
sub.s32 %r4969, %r4968, %r1138;
shr.u32 %r4970, %r8452, %r4969;
shl.b32 %r4971, %r8451, %r1138;
add.s32 %r8451, %r4970, %r4971;
mul.wide.s32 %rd1523, %r1135, 4;
add.s64 %rd1524, %rd1, %rd1523;
ld.local.u32 %r4972, [%rd1524];
shr.u32 %r4973, %r4972, %r4969;
shl.b32 %r4974, %r8452, %r1138;
add.s32 %r8452, %r4973, %r4974;
$L__BB0_860:
and.b32 %r4975, %r1129, -2147483648;
shr.u32 %r4976, %r8452, 30;
shl.b32 %r4977, %r8451, 2;
or.b32 %r4978, %r4976, %r4977;
shr.u32 %r4979, %r4978, 31;
shr.u32 %r4980, %r8451, 30;
add.s32 %r4981, %r4979, %r4980;
neg.s32 %r4982, %r4981;
setp.eq.s32 %p742, %r4975, 0;
selp.b32 %r8453, %r4981, %r4982, %p742;
setp.ne.s32 %p743, %r4979, 0;
xor.b32 %r4983, %r4975, -2147483648;
selp.b32 %r4984, %r4983, %r4975, %p743;
selp.b32 %r4985, -1, 0, %p743;
xor.b32 %r4986, %r4978, %r4985;
shl.b32 %r4987, %r8452, 2;
xor.b32 %r4988, %r4987, %r4985;
cvt.u64.u32 %rd1525, %r4986;
cvt.u64.u32 %rd1526, %r4988;
bfi.b64 %rd1527, %rd1525, %rd1526, 32, 32;
cvt.rn.f64.s64 %fd113, %rd1527;
mul.f64 %fd114, %fd113, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3474, %fd114;
setp.eq.s32 %p744, %r4984, 0;
neg.f32 %f3475, %f3474;
selp.f32 %f5492, %f3474, %f3475, %p744;
$L__BB0_862:
and.b32 %r1145, %r8453, 1;
setp.eq.s32 %p745, %r1145, 0;
selp.f32 %f960, %f5492, 0f3F800000, %p745;
mul.rn.f32 %f961, %f5492, %f5492;
mov.f32 %f5493, 0fB94D4153;
@%p745 bra $L__BB0_864;
mov.f32 %f3478, 0fBAB607ED;
mov.f32 %f3479, 0f37CBAC00;
fma.rn.f32 %f5493, %f3479, %f961, %f3478;
$L__BB0_864:
selp.f32 %f3480, 0f3C0885E4, 0f3D2AAABB, %p745;
fma.rn.f32 %f3481, %f5493, %f961, %f3480;
selp.f32 %f3482, 0fBE2AAAA8, 0fBEFFFFFF, %p745;
fma.rn.f32 %f3483, %f3481, %f961, %f3482;
mov.f32 %f3484, 0f00000000;
fma.rn.f32 %f3485, %f961, %f960, %f3484;
fma.rn.f32 %f5494, %f3483, %f3485, %f960;
and.b32 %r4990, %r8453, 2;
setp.eq.s32 %p747, %r4990, 0;
@%p747 bra $L__BB0_866;
mov.f32 %f3487, 0fBF800000;
fma.rn.f32 %f5494, %f5494, %f3487, %f3484;
$L__BB0_866:
mul.f32 %f3488, %f843, 0f3F22F983;
cvt.rni.s32.f32 %r8457, %f3488;
cvt.rn.f32.s32 %f3489, %r8457;
mov.f32 %f3490, 0fBFC90FDA;
fma.rn.f32 %f3491, %f3489, %f3490, %f843;
mov.f32 %f3492, 0fB3A22168;
fma.rn.f32 %f3493, %f3489, %f3492, %f3491;
mov.f32 %f3494, 0fA7C234C5;
fma.rn.f32 %f5495, %f3489, %f3494, %f3493;
abs.f32 %f968, %f843;
setp.ltu.f32 %p748, %f968, 0f47CE4780;
@%p748 bra $L__BB0_874;
setp.eq.f32 %p749, %f968, 0f7F800000;
@%p749 bra $L__BB0_873;
bra.uni $L__BB0_868;
$L__BB0_873:
mov.f32 %f3497, 0f00000000;
mul.rn.f32 %f5495, %f843, %f3497;
mov.u32 %r8457, 0;
bra.uni $L__BB0_874;
$L__BB0_868:
mov.b32 %r1147, %f843;
shr.u32 %r4992, %r1147, 23;
and.b32 %r4993, %r4992, 255;
add.s32 %r1148, %r4993, -128;
shl.b32 %r4994, %r1147, 8;
or.b32 %r1149, %r4994, -2147483648;
shr.u32 %r1150, %r1148, 5;
mov.u64 %rd2612, 0;
mov.u32 %r8454, 0;
mov.u64 %rd1531, __cudart_i2opi_f;
mov.u64 %rd2613, %rd2612;
$L__BB0_869:
.pragma "nounroll";
shl.b64 %rd1530, %rd2612, 2;
add.s64 %rd1532, %rd1531, %rd1530;
ld.global.nc.u32 %r4995, [%rd1532];
mad.wide.u32 %rd1533, %r4995, %r1149, %rd2613;
shr.u64 %rd2613, %rd1533, 32;
add.s64 %rd1534, %rd1, %rd1530;
st.local.u32 [%rd1534], %rd1533;
add.s32 %r8454, %r8454, 1;
cvt.s64.s32 %rd2612, %r8454;
setp.ne.s32 %p750, %r8454, 6;
@%p750 bra $L__BB0_869;
st.local.u32 [%rd5], %rd2613;
mov.u32 %r4996, 4;
sub.s32 %r1153, %r4996, %r1150;
mov.u32 %r4997, 6;
sub.s32 %r4998, %r4997, %r1150;
mul.wide.s32 %rd1535, %r4998, 4;
add.s64 %rd1536, %rd1, %rd1535;
ld.local.u32 %r8455, [%rd1536];
ld.local.u32 %r8456, [%rd1536+-4];
and.b32 %r1156, %r1148, 31;
setp.eq.s32 %p751, %r1156, 0;
@%p751 bra $L__BB0_872;
mov.u32 %r4999, 32;
sub.s32 %r5000, %r4999, %r1156;
shr.u32 %r5001, %r8456, %r5000;
shl.b32 %r5002, %r8455, %r1156;
add.s32 %r8455, %r5001, %r5002;
mul.wide.s32 %rd1537, %r1153, 4;
add.s64 %rd1538, %rd1, %rd1537;
ld.local.u32 %r5003, [%rd1538];
shr.u32 %r5004, %r5003, %r5000;
shl.b32 %r5005, %r8456, %r1156;
add.s32 %r8456, %r5004, %r5005;
$L__BB0_872:
and.b32 %r5006, %r1147, -2147483648;
shr.u32 %r5007, %r8456, 30;
shl.b32 %r5008, %r8455, 2;
or.b32 %r5009, %r5007, %r5008;
shr.u32 %r5010, %r5009, 31;
shr.u32 %r5011, %r8455, 30;
add.s32 %r5012, %r5010, %r5011;
neg.s32 %r5013, %r5012;
setp.eq.s32 %p752, %r5006, 0;
selp.b32 %r8457, %r5012, %r5013, %p752;
setp.ne.s32 %p753, %r5010, 0;
xor.b32 %r5014, %r5006, -2147483648;
selp.b32 %r5015, %r5014, %r5006, %p753;
selp.b32 %r5016, -1, 0, %p753;
xor.b32 %r5017, %r5009, %r5016;
shl.b32 %r5018, %r8456, 2;
xor.b32 %r5019, %r5018, %r5016;
cvt.u64.u32 %rd1539, %r5017;
cvt.u64.u32 %rd1540, %r5019;
bfi.b64 %rd1541, %rd1539, %rd1540, 32, 32;
cvt.rn.f64.s64 %fd115, %rd1541;
mul.f64 %fd116, %fd115, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3495, %fd116;
setp.eq.s32 %p754, %r5015, 0;
neg.f32 %f3496, %f3495;
selp.f32 %f5495, %f3495, %f3496, %p754;
$L__BB0_874:
add.s32 %r1163, %r8457, 1;
and.b32 %r1164, %r1163, 1;
setp.eq.s32 %p755, %r1164, 0;
selp.f32 %f972, %f5495, 0f3F800000, %p755;
mul.rn.f32 %f973, %f5495, %f5495;
mov.f32 %f5496, 0fB94D4153;
@%p755 bra $L__BB0_876;
mov.f32 %f3499, 0fBAB607ED;
mov.f32 %f3500, 0f37CBAC00;
fma.rn.f32 %f5496, %f3500, %f973, %f3499;
$L__BB0_876:
selp.f32 %f3501, 0f3C0885E4, 0f3D2AAABB, %p755;
fma.rn.f32 %f3502, %f5496, %f973, %f3501;
selp.f32 %f3503, 0fBE2AAAA8, 0fBEFFFFFF, %p755;
fma.rn.f32 %f3504, %f3502, %f973, %f3503;
mov.f32 %f3505, 0f00000000;
fma.rn.f32 %f3506, %f973, %f972, %f3505;
fma.rn.f32 %f5497, %f3504, %f3506, %f972;
and.b32 %r5021, %r1163, 2;
setp.eq.s32 %p757, %r5021, 0;
@%p757 bra $L__BB0_878;
mov.f32 %f3508, 0fBF800000;
fma.rn.f32 %f5497, %f5497, %f3508, %f3505;
$L__BB0_878:
add.f32 %f5519, %f5494, %f5497;
mul.f32 %f3509, %f852, 0f3F22F983;
cvt.rni.s32.f32 %r8461, %f3509;
cvt.rn.f32.s32 %f3510, %r8461;
mov.f32 %f3511, 0fBFC90FDA;
fma.rn.f32 %f3512, %f3510, %f3511, %f852;
mov.f32 %f3513, 0fB3A22168;
fma.rn.f32 %f3514, %f3510, %f3513, %f3512;
mov.f32 %f3515, 0fA7C234C5;
fma.rn.f32 %f5498, %f3510, %f3515, %f3514;
abs.f32 %f981, %f852;
setp.ltu.f32 %p758, %f981, 0f47CE4780;
@%p758 bra $L__BB0_886;
setp.eq.f32 %p759, %f981, 0f7F800000;
@%p759 bra $L__BB0_885;
bra.uni $L__BB0_880;
$L__BB0_885:
mov.f32 %f3518, 0f00000000;
mul.rn.f32 %f5498, %f852, %f3518;
mov.u32 %r8461, 0;
bra.uni $L__BB0_886;
$L__BB0_880:
mov.b32 %r1166, %f852;
shr.u32 %r5023, %r1166, 23;
and.b32 %r5024, %r5023, 255;
add.s32 %r1167, %r5024, -128;
shl.b32 %r5025, %r1166, 8;
or.b32 %r1168, %r5025, -2147483648;
shr.u32 %r1169, %r1167, 5;
mov.u64 %rd2614, 0;
mov.u32 %r8458, 0;
mov.u64 %rd1545, __cudart_i2opi_f;
mov.u64 %rd2615, %rd2614;
$L__BB0_881:
.pragma "nounroll";
shl.b64 %rd1544, %rd2614, 2;
add.s64 %rd1546, %rd1545, %rd1544;
ld.global.nc.u32 %r5026, [%rd1546];
mad.wide.u32 %rd1547, %r5026, %r1168, %rd2615;
shr.u64 %rd2615, %rd1547, 32;
add.s64 %rd1548, %rd1, %rd1544;
st.local.u32 [%rd1548], %rd1547;
add.s32 %r8458, %r8458, 1;
cvt.s64.s32 %rd2614, %r8458;
setp.ne.s32 %p760, %r8458, 6;
@%p760 bra $L__BB0_881;
st.local.u32 [%rd5], %rd2615;
mov.u32 %r5027, 4;
sub.s32 %r1172, %r5027, %r1169;
mov.u32 %r5028, 6;
sub.s32 %r5029, %r5028, %r1169;
mul.wide.s32 %rd1549, %r5029, 4;
add.s64 %rd1550, %rd1, %rd1549;
ld.local.u32 %r8459, [%rd1550];
ld.local.u32 %r8460, [%rd1550+-4];
and.b32 %r1175, %r1167, 31;
setp.eq.s32 %p761, %r1175, 0;
@%p761 bra $L__BB0_884;
mov.u32 %r5030, 32;
sub.s32 %r5031, %r5030, %r1175;
shr.u32 %r5032, %r8460, %r5031;
shl.b32 %r5033, %r8459, %r1175;
add.s32 %r8459, %r5032, %r5033;
mul.wide.s32 %rd1551, %r1172, 4;
add.s64 %rd1552, %rd1, %rd1551;
ld.local.u32 %r5034, [%rd1552];
shr.u32 %r5035, %r5034, %r5031;
shl.b32 %r5036, %r8460, %r1175;
add.s32 %r8460, %r5035, %r5036;
$L__BB0_884:
and.b32 %r5037, %r1166, -2147483648;
shr.u32 %r5038, %r8460, 30;
shl.b32 %r5039, %r8459, 2;
or.b32 %r5040, %r5038, %r5039;
shr.u32 %r5041, %r5040, 31;
shr.u32 %r5042, %r8459, 30;
add.s32 %r5043, %r5041, %r5042;
neg.s32 %r5044, %r5043;
setp.eq.s32 %p762, %r5037, 0;
selp.b32 %r8461, %r5043, %r5044, %p762;
setp.ne.s32 %p763, %r5041, 0;
xor.b32 %r5045, %r5037, -2147483648;
selp.b32 %r5046, %r5045, %r5037, %p763;
selp.b32 %r5047, -1, 0, %p763;
xor.b32 %r5048, %r5040, %r5047;
shl.b32 %r5049, %r8460, 2;
xor.b32 %r5050, %r5049, %r5047;
cvt.u64.u32 %rd1553, %r5048;
cvt.u64.u32 %rd1554, %r5050;
bfi.b64 %rd1555, %rd1553, %rd1554, 32, 32;
cvt.rn.f64.s64 %fd117, %rd1555;
mul.f64 %fd118, %fd117, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3516, %fd118;
setp.eq.s32 %p764, %r5046, 0;
neg.f32 %f3517, %f3516;
selp.f32 %f5498, %f3516, %f3517, %p764;
$L__BB0_886:
and.b32 %r1182, %r8461, 1;
setp.eq.s32 %p765, %r1182, 0;
selp.f32 %f985, %f5498, 0f3F800000, %p765;
mul.rn.f32 %f986, %f5498, %f5498;
mov.f32 %f5499, 0fB94D4153;
@%p765 bra $L__BB0_888;
mov.f32 %f3520, 0fBAB607ED;
mov.f32 %f3521, 0f37CBAC00;
fma.rn.f32 %f5499, %f3521, %f986, %f3520;
$L__BB0_888:
selp.f32 %f3522, 0f3C0885E4, 0f3D2AAABB, %p765;
fma.rn.f32 %f3523, %f5499, %f986, %f3522;
selp.f32 %f3524, 0fBE2AAAA8, 0fBEFFFFFF, %p765;
fma.rn.f32 %f3525, %f3523, %f986, %f3524;
mov.f32 %f3526, 0f00000000;
fma.rn.f32 %f3527, %f986, %f985, %f3526;
fma.rn.f32 %f5500, %f3525, %f3527, %f985;
and.b32 %r5052, %r8461, 2;
setp.eq.s32 %p767, %r5052, 0;
@%p767 bra $L__BB0_890;
mov.f32 %f3529, 0fBF800000;
fma.rn.f32 %f5500, %f5500, %f3529, %f3526;
$L__BB0_890:
mul.f32 %f3530, %f844, 0f3F22F983;
cvt.rni.s32.f32 %r8465, %f3530;
cvt.rn.f32.s32 %f3531, %r8465;
mov.f32 %f3532, 0fBFC90FDA;
fma.rn.f32 %f3533, %f3531, %f3532, %f844;
mov.f32 %f3534, 0fB3A22168;
fma.rn.f32 %f3535, %f3531, %f3534, %f3533;
mov.f32 %f3536, 0fA7C234C5;
fma.rn.f32 %f5501, %f3531, %f3536, %f3535;
abs.f32 %f993, %f844;
setp.ltu.f32 %p768, %f993, 0f47CE4780;
@%p768 bra $L__BB0_898;
setp.eq.f32 %p769, %f993, 0f7F800000;
@%p769 bra $L__BB0_897;
bra.uni $L__BB0_892;
$L__BB0_897:
mov.f32 %f3539, 0f00000000;
mul.rn.f32 %f5501, %f844, %f3539;
mov.u32 %r8465, 0;
bra.uni $L__BB0_898;
$L__BB0_892:
mov.b32 %r1184, %f844;
shr.u32 %r5054, %r1184, 23;
and.b32 %r5055, %r5054, 255;
add.s32 %r1185, %r5055, -128;
shl.b32 %r5056, %r1184, 8;
or.b32 %r1186, %r5056, -2147483648;
shr.u32 %r1187, %r1185, 5;
mov.u64 %rd2616, 0;
mov.u32 %r8462, 0;
mov.u64 %rd1559, __cudart_i2opi_f;
mov.u64 %rd2617, %rd2616;
$L__BB0_893:
.pragma "nounroll";
shl.b64 %rd1558, %rd2616, 2;
add.s64 %rd1560, %rd1559, %rd1558;
ld.global.nc.u32 %r5057, [%rd1560];
mad.wide.u32 %rd1561, %r5057, %r1186, %rd2617;
shr.u64 %rd2617, %rd1561, 32;
add.s64 %rd1562, %rd1, %rd1558;
st.local.u32 [%rd1562], %rd1561;
add.s32 %r8462, %r8462, 1;
cvt.s64.s32 %rd2616, %r8462;
setp.ne.s32 %p770, %r8462, 6;
@%p770 bra $L__BB0_893;
st.local.u32 [%rd5], %rd2617;
mov.u32 %r5058, 4;
sub.s32 %r1190, %r5058, %r1187;
mov.u32 %r5059, 6;
sub.s32 %r5060, %r5059, %r1187;
mul.wide.s32 %rd1563, %r5060, 4;
add.s64 %rd1564, %rd1, %rd1563;
ld.local.u32 %r8463, [%rd1564];
ld.local.u32 %r8464, [%rd1564+-4];
and.b32 %r1193, %r1185, 31;
setp.eq.s32 %p771, %r1193, 0;
@%p771 bra $L__BB0_896;
mov.u32 %r5061, 32;
sub.s32 %r5062, %r5061, %r1193;
shr.u32 %r5063, %r8464, %r5062;
shl.b32 %r5064, %r8463, %r1193;
add.s32 %r8463, %r5063, %r5064;
mul.wide.s32 %rd1565, %r1190, 4;
add.s64 %rd1566, %rd1, %rd1565;
ld.local.u32 %r5065, [%rd1566];
shr.u32 %r5066, %r5065, %r5062;
shl.b32 %r5067, %r8464, %r1193;
add.s32 %r8464, %r5066, %r5067;
$L__BB0_896:
and.b32 %r5068, %r1184, -2147483648;
shr.u32 %r5069, %r8464, 30;
shl.b32 %r5070, %r8463, 2;
or.b32 %r5071, %r5069, %r5070;
shr.u32 %r5072, %r5071, 31;
shr.u32 %r5073, %r8463, 30;
add.s32 %r5074, %r5072, %r5073;
neg.s32 %r5075, %r5074;
setp.eq.s32 %p772, %r5068, 0;
selp.b32 %r8465, %r5074, %r5075, %p772;
setp.ne.s32 %p773, %r5072, 0;
xor.b32 %r5076, %r5068, -2147483648;
selp.b32 %r5077, %r5076, %r5068, %p773;
selp.b32 %r5078, -1, 0, %p773;
xor.b32 %r5079, %r5071, %r5078;
shl.b32 %r5080, %r8464, 2;
xor.b32 %r5081, %r5080, %r5078;
cvt.u64.u32 %rd1567, %r5079;
cvt.u64.u32 %rd1568, %r5081;
bfi.b64 %rd1569, %rd1567, %rd1568, 32, 32;
cvt.rn.f64.s64 %fd119, %rd1569;
mul.f64 %fd120, %fd119, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3537, %fd120;
setp.eq.s32 %p774, %r5077, 0;
neg.f32 %f3538, %f3537;
selp.f32 %f5501, %f3537, %f3538, %p774;
$L__BB0_898:
add.s32 %r1200, %r8465, 1;
and.b32 %r1201, %r1200, 1;
setp.eq.s32 %p775, %r1201, 0;
selp.f32 %f997, %f5501, 0f3F800000, %p775;
mul.rn.f32 %f998, %f5501, %f5501;
mov.f32 %f5502, 0fB94D4153;
@%p775 bra $L__BB0_900;
mov.f32 %f3541, 0fBAB607ED;
mov.f32 %f3542, 0f37CBAC00;
fma.rn.f32 %f5502, %f3542, %f998, %f3541;
$L__BB0_900:
selp.f32 %f3543, 0f3C0885E4, 0f3D2AAABB, %p775;
fma.rn.f32 %f3544, %f5502, %f998, %f3543;
selp.f32 %f3545, 0fBE2AAAA8, 0fBEFFFFFF, %p775;
fma.rn.f32 %f3546, %f3544, %f998, %f3545;
mov.f32 %f3547, 0f00000000;
fma.rn.f32 %f3548, %f998, %f997, %f3547;
fma.rn.f32 %f5503, %f3546, %f3548, %f997;
and.b32 %r5083, %r1200, 2;
setp.eq.s32 %p777, %r5083, 0;
@%p777 bra $L__BB0_902;
mov.f32 %f3550, 0fBF800000;
fma.rn.f32 %f5503, %f5503, %f3550, %f3547;
$L__BB0_902:
add.f32 %f5518, %f5500, %f5503;
mul.f32 %f3551, %f853, 0f3F22F983;
cvt.rni.s32.f32 %r8469, %f3551;
cvt.rn.f32.s32 %f3552, %r8469;
mov.f32 %f3553, 0fBFC90FDA;
fma.rn.f32 %f3554, %f3552, %f3553, %f853;
mov.f32 %f3555, 0fB3A22168;
fma.rn.f32 %f3556, %f3552, %f3555, %f3554;
mov.f32 %f3557, 0fA7C234C5;
fma.rn.f32 %f5504, %f3552, %f3557, %f3556;
abs.f32 %f1006, %f853;
setp.ltu.f32 %p778, %f1006, 0f47CE4780;
@%p778 bra $L__BB0_910;
setp.eq.f32 %p779, %f1006, 0f7F800000;
@%p779 bra $L__BB0_909;
bra.uni $L__BB0_904;
$L__BB0_909:
mov.f32 %f3560, 0f00000000;
mul.rn.f32 %f5504, %f853, %f3560;
mov.u32 %r8469, 0;
bra.uni $L__BB0_910;
$L__BB0_904:
mov.b32 %r1203, %f853;
shr.u32 %r5085, %r1203, 23;
and.b32 %r5086, %r5085, 255;
add.s32 %r1204, %r5086, -128;
shl.b32 %r5087, %r1203, 8;
or.b32 %r1205, %r5087, -2147483648;
shr.u32 %r1206, %r1204, 5;
mov.u64 %rd2618, 0;
mov.u32 %r8466, 0;
mov.u64 %rd1573, __cudart_i2opi_f;
mov.u64 %rd2619, %rd2618;
$L__BB0_905:
.pragma "nounroll";
shl.b64 %rd1572, %rd2618, 2;
add.s64 %rd1574, %rd1573, %rd1572;
ld.global.nc.u32 %r5088, [%rd1574];
mad.wide.u32 %rd1575, %r5088, %r1205, %rd2619;
shr.u64 %rd2619, %rd1575, 32;
add.s64 %rd1576, %rd1, %rd1572;
st.local.u32 [%rd1576], %rd1575;
add.s32 %r8466, %r8466, 1;
cvt.s64.s32 %rd2618, %r8466;
setp.ne.s32 %p780, %r8466, 6;
@%p780 bra $L__BB0_905;
st.local.u32 [%rd5], %rd2619;
mov.u32 %r5089, 4;
sub.s32 %r1209, %r5089, %r1206;
mov.u32 %r5090, 6;
sub.s32 %r5091, %r5090, %r1206;
mul.wide.s32 %rd1577, %r5091, 4;
add.s64 %rd1578, %rd1, %rd1577;
ld.local.u32 %r8467, [%rd1578];
ld.local.u32 %r8468, [%rd1578+-4];
and.b32 %r1212, %r1204, 31;
setp.eq.s32 %p781, %r1212, 0;
@%p781 bra $L__BB0_908;
mov.u32 %r5092, 32;
sub.s32 %r5093, %r5092, %r1212;
shr.u32 %r5094, %r8468, %r5093;
shl.b32 %r5095, %r8467, %r1212;
add.s32 %r8467, %r5094, %r5095;
mul.wide.s32 %rd1579, %r1209, 4;
add.s64 %rd1580, %rd1, %rd1579;
ld.local.u32 %r5096, [%rd1580];
shr.u32 %r5097, %r5096, %r5093;
shl.b32 %r5098, %r8468, %r1212;
add.s32 %r8468, %r5097, %r5098;
$L__BB0_908:
and.b32 %r5099, %r1203, -2147483648;
shr.u32 %r5100, %r8468, 30;
shl.b32 %r5101, %r8467, 2;
or.b32 %r5102, %r5100, %r5101;
shr.u32 %r5103, %r5102, 31;
shr.u32 %r5104, %r8467, 30;
add.s32 %r5105, %r5103, %r5104;
neg.s32 %r5106, %r5105;
setp.eq.s32 %p782, %r5099, 0;
selp.b32 %r8469, %r5105, %r5106, %p782;
setp.ne.s32 %p783, %r5103, 0;
xor.b32 %r5107, %r5099, -2147483648;
selp.b32 %r5108, %r5107, %r5099, %p783;
selp.b32 %r5109, -1, 0, %p783;
xor.b32 %r5110, %r5102, %r5109;
shl.b32 %r5111, %r8468, 2;
xor.b32 %r5112, %r5111, %r5109;
cvt.u64.u32 %rd1581, %r5110;
cvt.u64.u32 %rd1582, %r5112;
bfi.b64 %rd1583, %rd1581, %rd1582, 32, 32;
cvt.rn.f64.s64 %fd121, %rd1583;
mul.f64 %fd122, %fd121, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3558, %fd122;
setp.eq.s32 %p784, %r5108, 0;
neg.f32 %f3559, %f3558;
selp.f32 %f5504, %f3558, %f3559, %p784;
$L__BB0_910:
and.b32 %r1219, %r8469, 1;
setp.eq.s32 %p785, %r1219, 0;
selp.f32 %f1010, %f5504, 0f3F800000, %p785;
mul.rn.f32 %f1011, %f5504, %f5504;
mov.f32 %f5505, 0fB94D4153;
@%p785 bra $L__BB0_912;
mov.f32 %f3562, 0fBAB607ED;
mov.f32 %f3563, 0f37CBAC00;
fma.rn.f32 %f5505, %f3563, %f1011, %f3562;
$L__BB0_912:
selp.f32 %f3564, 0f3C0885E4, 0f3D2AAABB, %p785;
fma.rn.f32 %f3565, %f5505, %f1011, %f3564;
selp.f32 %f3566, 0fBE2AAAA8, 0fBEFFFFFF, %p785;
fma.rn.f32 %f3567, %f3565, %f1011, %f3566;
mov.f32 %f3568, 0f00000000;
fma.rn.f32 %f3569, %f1011, %f1010, %f3568;
fma.rn.f32 %f5506, %f3567, %f3569, %f1010;
and.b32 %r5114, %r8469, 2;
setp.eq.s32 %p787, %r5114, 0;
@%p787 bra $L__BB0_914;
mov.f32 %f3571, 0fBF800000;
fma.rn.f32 %f5506, %f5506, %f3571, %f3568;
$L__BB0_914:
mul.f32 %f3572, %f845, 0f3F22F983;
cvt.rni.s32.f32 %r8473, %f3572;
cvt.rn.f32.s32 %f3573, %r8473;
mov.f32 %f3574, 0fBFC90FDA;
fma.rn.f32 %f3575, %f3573, %f3574, %f845;
mov.f32 %f3576, 0fB3A22168;
fma.rn.f32 %f3577, %f3573, %f3576, %f3575;
mov.f32 %f3578, 0fA7C234C5;
fma.rn.f32 %f5507, %f3573, %f3578, %f3577;
abs.f32 %f1018, %f845;
setp.ltu.f32 %p788, %f1018, 0f47CE4780;
@%p788 bra $L__BB0_922;
setp.eq.f32 %p789, %f1018, 0f7F800000;
@%p789 bra $L__BB0_921;
bra.uni $L__BB0_916;
$L__BB0_921:
mov.f32 %f3581, 0f00000000;
mul.rn.f32 %f5507, %f845, %f3581;
mov.u32 %r8473, 0;
bra.uni $L__BB0_922;
$L__BB0_916:
mov.b32 %r1221, %f845;
shr.u32 %r5116, %r1221, 23;
and.b32 %r5117, %r5116, 255;
add.s32 %r1222, %r5117, -128;
shl.b32 %r5118, %r1221, 8;
or.b32 %r1223, %r5118, -2147483648;
shr.u32 %r1224, %r1222, 5;
mov.u64 %rd2620, 0;
mov.u32 %r8470, 0;
mov.u64 %rd1587, __cudart_i2opi_f;
mov.u64 %rd2621, %rd2620;
$L__BB0_917:
.pragma "nounroll";
shl.b64 %rd1586, %rd2620, 2;
add.s64 %rd1588, %rd1587, %rd1586;
ld.global.nc.u32 %r5119, [%rd1588];
mad.wide.u32 %rd1589, %r5119, %r1223, %rd2621;
shr.u64 %rd2621, %rd1589, 32;
add.s64 %rd1590, %rd1, %rd1586;
st.local.u32 [%rd1590], %rd1589;
add.s32 %r8470, %r8470, 1;
cvt.s64.s32 %rd2620, %r8470;
setp.ne.s32 %p790, %r8470, 6;
@%p790 bra $L__BB0_917;
st.local.u32 [%rd5], %rd2621;
mov.u32 %r5120, 4;
sub.s32 %r1227, %r5120, %r1224;
mov.u32 %r5121, 6;
sub.s32 %r5122, %r5121, %r1224;
mul.wide.s32 %rd1591, %r5122, 4;
add.s64 %rd1592, %rd1, %rd1591;
ld.local.u32 %r8471, [%rd1592];
ld.local.u32 %r8472, [%rd1592+-4];
and.b32 %r1230, %r1222, 31;
setp.eq.s32 %p791, %r1230, 0;
@%p791 bra $L__BB0_920;
mov.u32 %r5123, 32;
sub.s32 %r5124, %r5123, %r1230;
shr.u32 %r5125, %r8472, %r5124;
shl.b32 %r5126, %r8471, %r1230;
add.s32 %r8471, %r5125, %r5126;
mul.wide.s32 %rd1593, %r1227, 4;
add.s64 %rd1594, %rd1, %rd1593;
ld.local.u32 %r5127, [%rd1594];
shr.u32 %r5128, %r5127, %r5124;
shl.b32 %r5129, %r8472, %r1230;
add.s32 %r8472, %r5128, %r5129;
$L__BB0_920:
and.b32 %r5130, %r1221, -2147483648;
shr.u32 %r5131, %r8472, 30;
shl.b32 %r5132, %r8471, 2;
or.b32 %r5133, %r5131, %r5132;
shr.u32 %r5134, %r5133, 31;
shr.u32 %r5135, %r8471, 30;
add.s32 %r5136, %r5134, %r5135;
neg.s32 %r5137, %r5136;
setp.eq.s32 %p792, %r5130, 0;
selp.b32 %r8473, %r5136, %r5137, %p792;
setp.ne.s32 %p793, %r5134, 0;
xor.b32 %r5138, %r5130, -2147483648;
selp.b32 %r5139, %r5138, %r5130, %p793;
selp.b32 %r5140, -1, 0, %p793;
xor.b32 %r5141, %r5133, %r5140;
shl.b32 %r5142, %r8472, 2;
xor.b32 %r5143, %r5142, %r5140;
cvt.u64.u32 %rd1595, %r5141;
cvt.u64.u32 %rd1596, %r5143;
bfi.b64 %rd1597, %rd1595, %rd1596, 32, 32;
cvt.rn.f64.s64 %fd123, %rd1597;
mul.f64 %fd124, %fd123, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3579, %fd124;
setp.eq.s32 %p794, %r5139, 0;
neg.f32 %f3580, %f3579;
selp.f32 %f5507, %f3579, %f3580, %p794;
$L__BB0_922:
add.s32 %r1237, %r8473, 1;
and.b32 %r1238, %r1237, 1;
setp.eq.s32 %p795, %r1238, 0;
selp.f32 %f1022, %f5507, 0f3F800000, %p795;
mul.rn.f32 %f1023, %f5507, %f5507;
mov.f32 %f5508, 0fB94D4153;
@%p795 bra $L__BB0_924;
mov.f32 %f3583, 0fBAB607ED;
mov.f32 %f3584, 0f37CBAC00;
fma.rn.f32 %f5508, %f3584, %f1023, %f3583;
$L__BB0_924:
selp.f32 %f3585, 0f3C0885E4, 0f3D2AAABB, %p795;
fma.rn.f32 %f3586, %f5508, %f1023, %f3585;
selp.f32 %f3587, 0fBE2AAAA8, 0fBEFFFFFF, %p795;
fma.rn.f32 %f3588, %f3586, %f1023, %f3587;
mov.f32 %f3589, 0f00000000;
fma.rn.f32 %f3590, %f1023, %f1022, %f3589;
fma.rn.f32 %f5509, %f3588, %f3590, %f1022;
and.b32 %r5145, %r1237, 2;
setp.eq.s32 %p797, %r5145, 0;
@%p797 bra $L__BB0_926;
mov.f32 %f3592, 0fBF800000;
fma.rn.f32 %f5509, %f5509, %f3592, %f3589;
$L__BB0_926:
add.f32 %f5517, %f5506, %f5509;
mul.f32 %f3593, %f854, 0f3F22F983;
cvt.rni.s32.f32 %r8477, %f3593;
cvt.rn.f32.s32 %f3594, %r8477;
mov.f32 %f3595, 0fBFC90FDA;
fma.rn.f32 %f3596, %f3594, %f3595, %f854;
mov.f32 %f3597, 0fB3A22168;
fma.rn.f32 %f3598, %f3594, %f3597, %f3596;
mov.f32 %f3599, 0fA7C234C5;
fma.rn.f32 %f5510, %f3594, %f3599, %f3598;
abs.f32 %f1031, %f854;
setp.ltu.f32 %p798, %f1031, 0f47CE4780;
@%p798 bra $L__BB0_934;
setp.eq.f32 %p799, %f1031, 0f7F800000;
@%p799 bra $L__BB0_933;
bra.uni $L__BB0_928;
$L__BB0_933:
mov.f32 %f3602, 0f00000000;
mul.rn.f32 %f5510, %f854, %f3602;
mov.u32 %r8477, 0;
bra.uni $L__BB0_934;
$L__BB0_928:
mov.b32 %r1240, %f854;
shr.u32 %r5147, %r1240, 23;
and.b32 %r5148, %r5147, 255;
add.s32 %r1241, %r5148, -128;
shl.b32 %r5149, %r1240, 8;
or.b32 %r1242, %r5149, -2147483648;
shr.u32 %r1243, %r1241, 5;
mov.u64 %rd2622, 0;
mov.u32 %r8474, 0;
mov.u64 %rd1601, __cudart_i2opi_f;
mov.u64 %rd2623, %rd2622;
$L__BB0_929:
.pragma "nounroll";
shl.b64 %rd1600, %rd2622, 2;
add.s64 %rd1602, %rd1601, %rd1600;
ld.global.nc.u32 %r5150, [%rd1602];
mad.wide.u32 %rd1603, %r5150, %r1242, %rd2623;
shr.u64 %rd2623, %rd1603, 32;
add.s64 %rd1604, %rd1, %rd1600;
st.local.u32 [%rd1604], %rd1603;
add.s32 %r8474, %r8474, 1;
cvt.s64.s32 %rd2622, %r8474;
setp.ne.s32 %p800, %r8474, 6;
@%p800 bra $L__BB0_929;
st.local.u32 [%rd5], %rd2623;
mov.u32 %r5151, 4;
sub.s32 %r1246, %r5151, %r1243;
mov.u32 %r5152, 6;
sub.s32 %r5153, %r5152, %r1243;
mul.wide.s32 %rd1605, %r5153, 4;
add.s64 %rd1606, %rd1, %rd1605;
ld.local.u32 %r8475, [%rd1606];
ld.local.u32 %r8476, [%rd1606+-4];
and.b32 %r1249, %r1241, 31;
setp.eq.s32 %p801, %r1249, 0;
@%p801 bra $L__BB0_932;
mov.u32 %r5154, 32;
sub.s32 %r5155, %r5154, %r1249;
shr.u32 %r5156, %r8476, %r5155;
shl.b32 %r5157, %r8475, %r1249;
add.s32 %r8475, %r5156, %r5157;
mul.wide.s32 %rd1607, %r1246, 4;
add.s64 %rd1608, %rd1, %rd1607;
ld.local.u32 %r5158, [%rd1608];
shr.u32 %r5159, %r5158, %r5155;
shl.b32 %r5160, %r8476, %r1249;
add.s32 %r8476, %r5159, %r5160;
$L__BB0_932:
and.b32 %r5161, %r1240, -2147483648;
shr.u32 %r5162, %r8476, 30;
shl.b32 %r5163, %r8475, 2;
or.b32 %r5164, %r5162, %r5163;
shr.u32 %r5165, %r5164, 31;
shr.u32 %r5166, %r8475, 30;
add.s32 %r5167, %r5165, %r5166;
neg.s32 %r5168, %r5167;
setp.eq.s32 %p802, %r5161, 0;
selp.b32 %r8477, %r5167, %r5168, %p802;
setp.ne.s32 %p803, %r5165, 0;
xor.b32 %r5169, %r5161, -2147483648;
selp.b32 %r5170, %r5169, %r5161, %p803;
selp.b32 %r5171, -1, 0, %p803;
xor.b32 %r5172, %r5164, %r5171;
shl.b32 %r5173, %r8476, 2;
xor.b32 %r5174, %r5173, %r5171;
cvt.u64.u32 %rd1609, %r5172;
cvt.u64.u32 %rd1610, %r5174;
bfi.b64 %rd1611, %rd1609, %rd1610, 32, 32;
cvt.rn.f64.s64 %fd125, %rd1611;
mul.f64 %fd126, %fd125, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3600, %fd126;
setp.eq.s32 %p804, %r5170, 0;
neg.f32 %f3601, %f3600;
selp.f32 %f5510, %f3600, %f3601, %p804;
$L__BB0_934:
and.b32 %r1256, %r8477, 1;
setp.eq.s32 %p805, %r1256, 0;
selp.f32 %f1035, %f5510, 0f3F800000, %p805;
mul.rn.f32 %f1036, %f5510, %f5510;
mov.f32 %f5511, 0fB94D4153;
@%p805 bra $L__BB0_936;
mov.f32 %f3604, 0fBAB607ED;
mov.f32 %f3605, 0f37CBAC00;
fma.rn.f32 %f5511, %f3605, %f1036, %f3604;
$L__BB0_936:
selp.f32 %f3606, 0f3C0885E4, 0f3D2AAABB, %p805;
fma.rn.f32 %f3607, %f5511, %f1036, %f3606;
selp.f32 %f3608, 0fBE2AAAA8, 0fBEFFFFFF, %p805;
fma.rn.f32 %f3609, %f3607, %f1036, %f3608;
mov.f32 %f3610, 0f00000000;
fma.rn.f32 %f3611, %f1036, %f1035, %f3610;
fma.rn.f32 %f5512, %f3609, %f3611, %f1035;
and.b32 %r5176, %r8477, 2;
setp.eq.s32 %p807, %r5176, 0;
@%p807 bra $L__BB0_938;
mov.f32 %f3613, 0fBF800000;
fma.rn.f32 %f5512, %f5512, %f3613, %f3610;
$L__BB0_938:
mul.f32 %f3614, %f846, 0f3F22F983;
cvt.rni.s32.f32 %r8481, %f3614;
cvt.rn.f32.s32 %f3615, %r8481;
mov.f32 %f3616, 0fBFC90FDA;
fma.rn.f32 %f3617, %f3615, %f3616, %f846;
mov.f32 %f3618, 0fB3A22168;
fma.rn.f32 %f3619, %f3615, %f3618, %f3617;
mov.f32 %f3620, 0fA7C234C5;
fma.rn.f32 %f5513, %f3615, %f3620, %f3619;
abs.f32 %f1043, %f846;
setp.ltu.f32 %p808, %f1043, 0f47CE4780;
@%p808 bra $L__BB0_946;
setp.eq.f32 %p809, %f1043, 0f7F800000;
@%p809 bra $L__BB0_945;
bra.uni $L__BB0_940;
$L__BB0_945:
mov.f32 %f3623, 0f00000000;
mul.rn.f32 %f5513, %f846, %f3623;
mov.u32 %r8481, 0;
bra.uni $L__BB0_946;
$L__BB0_940:
mov.b32 %r1258, %f846;
shr.u32 %r5178, %r1258, 23;
and.b32 %r5179, %r5178, 255;
add.s32 %r1259, %r5179, -128;
shl.b32 %r5180, %r1258, 8;
or.b32 %r1260, %r5180, -2147483648;
shr.u32 %r1261, %r1259, 5;
mov.u64 %rd2624, 0;
mov.u32 %r8478, 0;
mov.u64 %rd1615, __cudart_i2opi_f;
mov.u64 %rd2625, %rd2624;
$L__BB0_941:
.pragma "nounroll";
shl.b64 %rd1614, %rd2624, 2;
add.s64 %rd1616, %rd1615, %rd1614;
ld.global.nc.u32 %r5181, [%rd1616];
mad.wide.u32 %rd1617, %r5181, %r1260, %rd2625;
shr.u64 %rd2625, %rd1617, 32;
add.s64 %rd1618, %rd1, %rd1614;
st.local.u32 [%rd1618], %rd1617;
add.s32 %r8478, %r8478, 1;
cvt.s64.s32 %rd2624, %r8478;
setp.ne.s32 %p810, %r8478, 6;
@%p810 bra $L__BB0_941;
st.local.u32 [%rd5], %rd2625;
mov.u32 %r5182, 4;
sub.s32 %r1264, %r5182, %r1261;
mov.u32 %r5183, 6;
sub.s32 %r5184, %r5183, %r1261;
mul.wide.s32 %rd1619, %r5184, 4;
add.s64 %rd1620, %rd1, %rd1619;
ld.local.u32 %r8479, [%rd1620];
ld.local.u32 %r8480, [%rd1620+-4];
and.b32 %r1267, %r1259, 31;
setp.eq.s32 %p811, %r1267, 0;
@%p811 bra $L__BB0_944;
mov.u32 %r5185, 32;
sub.s32 %r5186, %r5185, %r1267;
shr.u32 %r5187, %r8480, %r5186;
shl.b32 %r5188, %r8479, %r1267;
add.s32 %r8479, %r5187, %r5188;
mul.wide.s32 %rd1621, %r1264, 4;
add.s64 %rd1622, %rd1, %rd1621;
ld.local.u32 %r5189, [%rd1622];
shr.u32 %r5190, %r5189, %r5186;
shl.b32 %r5191, %r8480, %r1267;
add.s32 %r8480, %r5190, %r5191;
$L__BB0_944:
and.b32 %r5192, %r1258, -2147483648;
shr.u32 %r5193, %r8480, 30;
shl.b32 %r5194, %r8479, 2;
or.b32 %r5195, %r5193, %r5194;
shr.u32 %r5196, %r5195, 31;
shr.u32 %r5197, %r8479, 30;
add.s32 %r5198, %r5196, %r5197;
neg.s32 %r5199, %r5198;
setp.eq.s32 %p812, %r5192, 0;
selp.b32 %r8481, %r5198, %r5199, %p812;
setp.ne.s32 %p813, %r5196, 0;
xor.b32 %r5200, %r5192, -2147483648;
selp.b32 %r5201, %r5200, %r5192, %p813;
selp.b32 %r5202, -1, 0, %p813;
xor.b32 %r5203, %r5195, %r5202;
shl.b32 %r5204, %r8480, 2;
xor.b32 %r5205, %r5204, %r5202;
cvt.u64.u32 %rd1623, %r5203;
cvt.u64.u32 %rd1624, %r5205;
bfi.b64 %rd1625, %rd1623, %rd1624, 32, 32;
cvt.rn.f64.s64 %fd127, %rd1625;
mul.f64 %fd128, %fd127, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3621, %fd128;
setp.eq.s32 %p814, %r5201, 0;
neg.f32 %f3622, %f3621;
selp.f32 %f5513, %f3621, %f3622, %p814;
$L__BB0_946:
add.s32 %r1274, %r8481, 1;
and.b32 %r1275, %r1274, 1;
setp.eq.s32 %p815, %r1275, 0;
selp.f32 %f1047, %f5513, 0f3F800000, %p815;
mul.rn.f32 %f1048, %f5513, %f5513;
mov.f32 %f5514, 0fB94D4153;
@%p815 bra $L__BB0_948;
mov.f32 %f3625, 0fBAB607ED;
mov.f32 %f3626, 0f37CBAC00;
fma.rn.f32 %f5514, %f3626, %f1048, %f3625;
$L__BB0_948:
selp.f32 %f3627, 0f3C0885E4, 0f3D2AAABB, %p815;
fma.rn.f32 %f3628, %f5514, %f1048, %f3627;
selp.f32 %f3629, 0fBE2AAAA8, 0fBEFFFFFF, %p815;
fma.rn.f32 %f3630, %f3628, %f1048, %f3629;
mov.f32 %f3631, 0f00000000;
fma.rn.f32 %f3632, %f1048, %f1047, %f3631;
fma.rn.f32 %f5515, %f3630, %f3632, %f1047;
and.b32 %r5207, %r1274, 2;
setp.eq.s32 %p817, %r5207, 0;
@%p817 bra $L__BB0_950;
mov.f32 %f3634, 0fBF800000;
fma.rn.f32 %f5515, %f5515, %f3634, %f3631;
$L__BB0_950:
add.f32 %f5516, %f5512, %f5515;
bra.uni $L__BB0_951;
$L__BB0_530:
mov.b32 %r678, %f5348;
shr.u32 %r4124, %r678, 23;
and.b32 %r4125, %r4124, 255;
add.s32 %r679, %r4125, -128;
shl.b32 %r4126, %r678, 8;
or.b32 %r680, %r4126, -2147483648;
shr.u32 %r681, %r679, 5;
mov.u64 %rd2562, 0;
mov.u32 %r8354, 0;
mov.u64 %rd1152, __cudart_i2opi_f;
mov.u64 %rd2563, %rd2562;
$L__BB0_531:
.pragma "nounroll";
shl.b64 %rd1151, %rd2562, 2;
add.s64 %rd1153, %rd1152, %rd1151;
ld.global.nc.u32 %r4127, [%rd1153];
mad.wide.u32 %rd1154, %r4127, %r680, %rd2563;
shr.u64 %rd2563, %rd1154, 32;
add.s64 %rd1155, %rd1, %rd1151;
st.local.u32 [%rd1155], %rd1154;
add.s32 %r8354, %r8354, 1;
cvt.s64.s32 %rd2562, %r8354;
setp.ne.s32 %p462, %r8354, 6;
@%p462 bra $L__BB0_531;
st.local.u32 [%rd5], %rd2563;
mov.u32 %r4128, 4;
sub.s32 %r684, %r4128, %r681;
mov.u32 %r4129, 6;
sub.s32 %r4130, %r4129, %r681;
mul.wide.s32 %rd1156, %r4130, 4;
add.s64 %rd1157, %rd1, %rd1156;
ld.local.u32 %r8355, [%rd1157];
ld.local.u32 %r8356, [%rd1157+-4];
and.b32 %r687, %r679, 31;
setp.eq.s32 %p463, %r687, 0;
@%p463 bra $L__BB0_534;
mov.u32 %r4131, 32;
sub.s32 %r4132, %r4131, %r687;
shr.u32 %r4133, %r8356, %r4132;
shl.b32 %r4134, %r8355, %r687;
add.s32 %r8355, %r4133, %r4134;
mul.wide.s32 %rd1158, %r684, 4;
add.s64 %rd1159, %rd1, %rd1158;
ld.local.u32 %r4135, [%rd1159];
shr.u32 %r4136, %r4135, %r4132;
shl.b32 %r4137, %r8356, %r687;
add.s32 %r8356, %r4136, %r4137;
$L__BB0_534:
and.b32 %r4138, %r678, -2147483648;
shr.u32 %r4139, %r8356, 30;
shl.b32 %r4140, %r8355, 2;
or.b32 %r4141, %r4139, %r4140;
shr.u32 %r4142, %r4141, 31;
shr.u32 %r4143, %r8355, 30;
add.s32 %r4144, %r4142, %r4143;
neg.s32 %r4145, %r4144;
setp.eq.s32 %p464, %r4138, 0;
selp.b32 %r8357, %r4144, %r4145, %p464;
setp.ne.s32 %p465, %r4142, 0;
xor.b32 %r4146, %r4138, -2147483648;
selp.b32 %r4147, %r4146, %r4138, %p465;
selp.b32 %r4148, -1, 0, %p465;
xor.b32 %r4149, %r4141, %r4148;
shl.b32 %r4150, %r8356, 2;
xor.b32 %r4151, %r4150, %r4148;
cvt.u64.u32 %rd1160, %r4149;
cvt.u64.u32 %rd1161, %r4151;
bfi.b64 %rd1162, %rd1160, %rd1161, 32, 32;
cvt.rn.f64.s64 %fd65, %rd1162;
mul.f64 %fd66, %fd65, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2955, %fd66;
setp.eq.s32 %p466, %r4147, 0;
neg.f32 %f2956, %f2955;
selp.f32 %f5369, %f2955, %f2956, %p466;
$L__BB0_536:
and.b32 %r694, %r8357, 1;
setp.eq.s32 %p467, %r694, 0;
selp.f32 %f577, %f5369, 0f3F800000, %p467;
mul.rn.f32 %f578, %f5369, %f5369;
mov.f32 %f5370, 0fB94D4153;
@%p467 bra $L__BB0_538;
mov.f32 %f2959, 0fBAB607ED;
mov.f32 %f2960, 0f37CBAC00;
fma.rn.f32 %f5370, %f2960, %f578, %f2959;
$L__BB0_538:
selp.f32 %f2961, 0f3C0885E4, 0f3D2AAABB, %p467;
fma.rn.f32 %f2962, %f5370, %f578, %f2961;
selp.f32 %f2963, 0fBE2AAAA8, 0fBEFFFFFF, %p467;
fma.rn.f32 %f2964, %f2962, %f578, %f2963;
mov.f32 %f2965, 0f00000000;
fma.rn.f32 %f2966, %f578, %f577, %f2965;
fma.rn.f32 %f5213, %f2964, %f2966, %f577;
and.b32 %r4153, %r8357, 2;
setp.eq.s32 %p469, %r4153, 0;
@%p469 bra $L__BB0_540;
mov.f32 %f2968, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f2968, %f2965;
$L__BB0_540:
setp.lt.s32 %p8, %r14, %r676;
@%p459 bra $L__BB0_553;
mul.f32 %f2969, %f5531, 0f3F22F983;
cvt.rni.s32.f32 %r8361, %f2969;
cvt.rn.f32.s32 %f2970, %r8361;
mov.f32 %f2971, 0fBFC90FDA;
fma.rn.f32 %f2972, %f2970, %f2971, %f5531;
mov.f32 %f2973, 0fB3A22168;
fma.rn.f32 %f2974, %f2970, %f2973, %f2972;
mov.f32 %f2975, 0fA7C234C5;
fma.rn.f32 %f5373, %f2970, %f2975, %f2974;
abs.f32 %f586, %f5531;
setp.ltu.f32 %p471, %f586, 0f47CE4780;
@%p471 bra $L__BB0_549;
setp.eq.f32 %p472, %f586, 0f7F800000;
@%p472 bra $L__BB0_548;
bra.uni $L__BB0_543;
$L__BB0_548:
mov.f32 %f2978, 0f00000000;
mul.rn.f32 %f5373, %f5531, %f2978;
mov.u32 %r8361, 0;
bra.uni $L__BB0_549;
$L__BB0_543:
mov.b32 %r696, %f5531;
shr.u32 %r4155, %r696, 23;
and.b32 %r4156, %r4155, 255;
add.s32 %r697, %r4156, -128;
shl.b32 %r4157, %r696, 8;
or.b32 %r698, %r4157, -2147483648;
shr.u32 %r699, %r697, 5;
mov.u64 %rd2564, 0;
mov.u32 %r8358, 0;
mov.u64 %rd1166, __cudart_i2opi_f;
mov.u64 %rd2565, %rd2564;
$L__BB0_544:
.pragma "nounroll";
shl.b64 %rd1165, %rd2564, 2;
add.s64 %rd1167, %rd1166, %rd1165;
ld.global.nc.u32 %r4158, [%rd1167];
mad.wide.u32 %rd1168, %r4158, %r698, %rd2565;
shr.u64 %rd2565, %rd1168, 32;
add.s64 %rd1169, %rd1, %rd1165;
st.local.u32 [%rd1169], %rd1168;
add.s32 %r8358, %r8358, 1;
cvt.s64.s32 %rd2564, %r8358;
setp.ne.s32 %p473, %r8358, 6;
@%p473 bra $L__BB0_544;
st.local.u32 [%rd5], %rd2565;
mov.u32 %r4159, 4;
sub.s32 %r702, %r4159, %r699;
mov.u32 %r4160, 6;
sub.s32 %r4161, %r4160, %r699;
mul.wide.s32 %rd1170, %r4161, 4;
add.s64 %rd1171, %rd1, %rd1170;
ld.local.u32 %r8359, [%rd1171];
ld.local.u32 %r8360, [%rd1171+-4];
and.b32 %r705, %r697, 31;
setp.eq.s32 %p474, %r705, 0;
@%p474 bra $L__BB0_547;
mov.u32 %r4162, 32;
sub.s32 %r4163, %r4162, %r705;
shr.u32 %r4164, %r8360, %r4163;
shl.b32 %r4165, %r8359, %r705;
add.s32 %r8359, %r4164, %r4165;
mul.wide.s32 %rd1172, %r702, 4;
add.s64 %rd1173, %rd1, %rd1172;
ld.local.u32 %r4166, [%rd1173];
shr.u32 %r4167, %r4166, %r4163;
shl.b32 %r4168, %r8360, %r705;
add.s32 %r8360, %r4167, %r4168;
$L__BB0_547:
and.b32 %r4169, %r696, -2147483648;
shr.u32 %r4170, %r8360, 30;
shl.b32 %r4171, %r8359, 2;
or.b32 %r4172, %r4170, %r4171;
shr.u32 %r4173, %r4172, 31;
shr.u32 %r4174, %r8359, 30;
add.s32 %r4175, %r4173, %r4174;
neg.s32 %r4176, %r4175;
setp.eq.s32 %p475, %r4169, 0;
selp.b32 %r8361, %r4175, %r4176, %p475;
setp.ne.s32 %p476, %r4173, 0;
xor.b32 %r4177, %r4169, -2147483648;
selp.b32 %r4178, %r4177, %r4169, %p476;
selp.b32 %r4179, -1, 0, %p476;
xor.b32 %r4180, %r4172, %r4179;
shl.b32 %r4181, %r8360, 2;
xor.b32 %r4182, %r4181, %r4179;
cvt.u64.u32 %rd1174, %r4180;
cvt.u64.u32 %rd1175, %r4182;
bfi.b64 %rd1176, %rd1174, %rd1175, 32, 32;
cvt.rn.f64.s64 %fd67, %rd1176;
mul.f64 %fd68, %fd67, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2976, %fd68;
setp.eq.s32 %p477, %r4178, 0;
neg.f32 %f2977, %f2976;
selp.f32 %f5373, %f2976, %f2977, %p477;
$L__BB0_549:
add.s32 %r712, %r8361, 1;
and.b32 %r713, %r712, 1;
setp.eq.s32 %p478, %r713, 0;
selp.f32 %f590, %f5373, 0f3F800000, %p478;
mul.rn.f32 %f591, %f5373, %f5373;
mov.f32 %f5374, 0fB94D4153;
@%p478 bra $L__BB0_551;
mov.f32 %f2980, 0fBAB607ED;
mov.f32 %f2981, 0f37CBAC00;
fma.rn.f32 %f5374, %f2981, %f591, %f2980;
$L__BB0_551:
selp.f32 %f2982, 0f3C0885E4, 0f3D2AAABB, %p478;
fma.rn.f32 %f2983, %f5374, %f591, %f2982;
selp.f32 %f2984, 0fBE2AAAA8, 0fBEFFFFFF, %p478;
fma.rn.f32 %f2985, %f2983, %f591, %f2984;
mov.f32 %f2986, 0f00000000;
fma.rn.f32 %f2987, %f591, %f590, %f2986;
fma.rn.f32 %f5215, %f2985, %f2987, %f590;
and.b32 %r4184, %r712, 2;
setp.eq.s32 %p480, %r4184, 0;
@%p480 bra $L__BB0_553;
mov.f32 %f2989, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f2989, %f2986;
$L__BB0_553:
selp.f32 %f598, %f5215, %f5216, %p8;
selp.f32 %f599, %f5213, %f5214, %p8;
@%p459 bra $L__BB0_555;
add.f32 %f5523, %f599, %f598;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_555:
@%p426 bra $L__BB0_584;
shl.b32 %r4186, %r12, 5;
mov.u32 %r4187, -32;
sub.s32 %r714, %r4187, %r4186;
setp.ge.s32 %p484, %r14, %r714;
@%p484 bra $L__BB0_569;
mul.f32 %f2992, %f5347, 0f3F22F983;
cvt.rni.s32.f32 %r8365, %f2992;
cvt.rn.f32.s32 %f2993, %r8365;
mov.f32 %f2994, 0fBFC90FDA;
fma.rn.f32 %f2995, %f2993, %f2994, %f5347;
mov.f32 %f2996, 0fB3A22168;
fma.rn.f32 %f2997, %f2993, %f2996, %f2995;
mov.f32 %f2998, 0fA7C234C5;
fma.rn.f32 %f5382, %f2993, %f2998, %f2997;
abs.f32 %f607, %f5347;
setp.ltu.f32 %p485, %f607, 0f47CE4780;
@%p485 bra $L__BB0_565;
setp.eq.f32 %p486, %f607, 0f7F800000;
@%p486 bra $L__BB0_564;
bra.uni $L__BB0_559;
$L__BB0_564:
mov.f32 %f3001, 0f00000000;
mul.rn.f32 %f5382, %f5347, %f3001;
mov.u32 %r8365, 0;
bra.uni $L__BB0_565;
$L__BB0_559:
mov.b32 %r716, %f5347;
shr.u32 %r4189, %r716, 23;
and.b32 %r4190, %r4189, 255;
add.s32 %r717, %r4190, -128;
shl.b32 %r4191, %r716, 8;
or.b32 %r718, %r4191, -2147483648;
shr.u32 %r719, %r717, 5;
mov.u64 %rd2566, 0;
mov.u32 %r8362, 0;
mov.u64 %rd1180, __cudart_i2opi_f;
mov.u64 %rd2567, %rd2566;
$L__BB0_560:
.pragma "nounroll";
shl.b64 %rd1179, %rd2566, 2;
add.s64 %rd1181, %rd1180, %rd1179;
ld.global.nc.u32 %r4192, [%rd1181];
mad.wide.u32 %rd1182, %r4192, %r718, %rd2567;
shr.u64 %rd2567, %rd1182, 32;
add.s64 %rd1183, %rd1, %rd1179;
st.local.u32 [%rd1183], %rd1182;
add.s32 %r8362, %r8362, 1;
cvt.s64.s32 %rd2566, %r8362;
setp.ne.s32 %p487, %r8362, 6;
@%p487 bra $L__BB0_560;
st.local.u32 [%rd5], %rd2567;
mov.u32 %r4193, 4;
sub.s32 %r722, %r4193, %r719;
mov.u32 %r4194, 6;
sub.s32 %r4195, %r4194, %r719;
mul.wide.s32 %rd1184, %r4195, 4;
add.s64 %rd1185, %rd1, %rd1184;
ld.local.u32 %r8363, [%rd1185];
ld.local.u32 %r8364, [%rd1185+-4];
and.b32 %r725, %r717, 31;
setp.eq.s32 %p488, %r725, 0;
@%p488 bra $L__BB0_563;
mov.u32 %r4196, 32;
sub.s32 %r4197, %r4196, %r725;
shr.u32 %r4198, %r8364, %r4197;
shl.b32 %r4199, %r8363, %r725;
add.s32 %r8363, %r4198, %r4199;
mul.wide.s32 %rd1186, %r722, 4;
add.s64 %rd1187, %rd1, %rd1186;
ld.local.u32 %r4200, [%rd1187];
shr.u32 %r4201, %r4200, %r4197;
shl.b32 %r4202, %r8364, %r725;
add.s32 %r8364, %r4201, %r4202;
$L__BB0_563:
and.b32 %r4203, %r716, -2147483648;
shr.u32 %r4204, %r8364, 30;
shl.b32 %r4205, %r8363, 2;
or.b32 %r4206, %r4204, %r4205;
shr.u32 %r4207, %r4206, 31;
shr.u32 %r4208, %r8363, 30;
add.s32 %r4209, %r4207, %r4208;
neg.s32 %r4210, %r4209;
setp.eq.s32 %p489, %r4203, 0;
selp.b32 %r8365, %r4209, %r4210, %p489;
setp.ne.s32 %p490, %r4207, 0;
xor.b32 %r4211, %r4203, -2147483648;
selp.b32 %r4212, %r4211, %r4203, %p490;
selp.b32 %r4213, -1, 0, %p490;
xor.b32 %r4214, %r4206, %r4213;
shl.b32 %r4215, %r8364, 2;
xor.b32 %r4216, %r4215, %r4213;
cvt.u64.u32 %rd1188, %r4214;
cvt.u64.u32 %rd1189, %r4216;
bfi.b64 %rd1190, %rd1188, %rd1189, 32, 32;
cvt.rn.f64.s64 %fd69, %rd1190;
mul.f64 %fd70, %fd69, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2999, %fd70;
setp.eq.s32 %p491, %r4212, 0;
neg.f32 %f3000, %f2999;
selp.f32 %f5382, %f2999, %f3000, %p491;
$L__BB0_565:
and.b32 %r732, %r8365, 1;
setp.eq.s32 %p492, %r732, 0;
selp.f32 %f611, %f5382, 0f3F800000, %p492;
mul.rn.f32 %f612, %f5382, %f5382;
mov.f32 %f5383, 0fB94D4153;
@%p492 bra $L__BB0_567;
mov.f32 %f3003, 0fBAB607ED;
mov.f32 %f3004, 0f37CBAC00;
fma.rn.f32 %f5383, %f3004, %f612, %f3003;
$L__BB0_567:
selp.f32 %f3005, 0f3C0885E4, 0f3D2AAABB, %p492;
fma.rn.f32 %f3006, %f5383, %f612, %f3005;
selp.f32 %f3007, 0fBE2AAAA8, 0fBEFFFFFF, %p492;
fma.rn.f32 %f3008, %f3006, %f612, %f3007;
mov.f32 %f3009, 0f00000000;
fma.rn.f32 %f3010, %f612, %f611, %f3009;
fma.rn.f32 %f5213, %f3008, %f3010, %f611;
and.b32 %r4218, %r8365, 2;
setp.eq.s32 %p494, %r4218, 0;
@%p494 bra $L__BB0_569;
mov.f32 %f3012, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3012, %f3009;
$L__BB0_569:
setp.lt.s32 %p9, %r14, %r714;
@%p484 bra $L__BB0_582;
mul.f32 %f3013, %f5339, 0f3F22F983;
cvt.rni.s32.f32 %r8369, %f3013;
cvt.rn.f32.s32 %f3014, %r8369;
mov.f32 %f3015, 0fBFC90FDA;
fma.rn.f32 %f3016, %f3014, %f3015, %f5339;
mov.f32 %f3017, 0fB3A22168;
fma.rn.f32 %f3018, %f3014, %f3017, %f3016;
mov.f32 %f3019, 0fA7C234C5;
fma.rn.f32 %f5386, %f3014, %f3019, %f3018;
abs.f32 %f620, %f5339;
setp.ltu.f32 %p496, %f620, 0f47CE4780;
@%p496 bra $L__BB0_578;
setp.eq.f32 %p497, %f620, 0f7F800000;
@%p497 bra $L__BB0_577;
bra.uni $L__BB0_572;
$L__BB0_577:
mov.f32 %f3022, 0f00000000;
mul.rn.f32 %f5386, %f5339, %f3022;
mov.u32 %r8369, 0;
bra.uni $L__BB0_578;
$L__BB0_572:
mov.b32 %r734, %f5339;
shr.u32 %r4220, %r734, 23;
and.b32 %r4221, %r4220, 255;
add.s32 %r735, %r4221, -128;
shl.b32 %r4222, %r734, 8;
or.b32 %r736, %r4222, -2147483648;
shr.u32 %r737, %r735, 5;
mov.u64 %rd2568, 0;
mov.u32 %r8366, 0;
mov.u64 %rd1194, __cudart_i2opi_f;
mov.u64 %rd2569, %rd2568;
$L__BB0_573:
.pragma "nounroll";
shl.b64 %rd1193, %rd2568, 2;
add.s64 %rd1195, %rd1194, %rd1193;
ld.global.nc.u32 %r4223, [%rd1195];
mad.wide.u32 %rd1196, %r4223, %r736, %rd2569;
shr.u64 %rd2569, %rd1196, 32;
add.s64 %rd1197, %rd1, %rd1193;
st.local.u32 [%rd1197], %rd1196;
add.s32 %r8366, %r8366, 1;
cvt.s64.s32 %rd2568, %r8366;
setp.ne.s32 %p498, %r8366, 6;
@%p498 bra $L__BB0_573;
st.local.u32 [%rd5], %rd2569;
mov.u32 %r4224, 4;
sub.s32 %r740, %r4224, %r737;
mov.u32 %r4225, 6;
sub.s32 %r4226, %r4225, %r737;
mul.wide.s32 %rd1198, %r4226, 4;
add.s64 %rd1199, %rd1, %rd1198;
ld.local.u32 %r8367, [%rd1199];
ld.local.u32 %r8368, [%rd1199+-4];
and.b32 %r743, %r735, 31;
setp.eq.s32 %p499, %r743, 0;
@%p499 bra $L__BB0_576;
mov.u32 %r4227, 32;
sub.s32 %r4228, %r4227, %r743;
shr.u32 %r4229, %r8368, %r4228;
shl.b32 %r4230, %r8367, %r743;
add.s32 %r8367, %r4229, %r4230;
mul.wide.s32 %rd1200, %r740, 4;
add.s64 %rd1201, %rd1, %rd1200;
ld.local.u32 %r4231, [%rd1201];
shr.u32 %r4232, %r4231, %r4228;
shl.b32 %r4233, %r8368, %r743;
add.s32 %r8368, %r4232, %r4233;
$L__BB0_576:
and.b32 %r4234, %r734, -2147483648;
shr.u32 %r4235, %r8368, 30;
shl.b32 %r4236, %r8367, 2;
or.b32 %r4237, %r4235, %r4236;
shr.u32 %r4238, %r4237, 31;
shr.u32 %r4239, %r8367, 30;
add.s32 %r4240, %r4238, %r4239;
neg.s32 %r4241, %r4240;
setp.eq.s32 %p500, %r4234, 0;
selp.b32 %r8369, %r4240, %r4241, %p500;
setp.ne.s32 %p501, %r4238, 0;
xor.b32 %r4242, %r4234, -2147483648;
selp.b32 %r4243, %r4242, %r4234, %p501;
selp.b32 %r4244, -1, 0, %p501;
xor.b32 %r4245, %r4237, %r4244;
shl.b32 %r4246, %r8368, 2;
xor.b32 %r4247, %r4246, %r4244;
cvt.u64.u32 %rd1202, %r4245;
cvt.u64.u32 %rd1203, %r4247;
bfi.b64 %rd1204, %rd1202, %rd1203, 32, 32;
cvt.rn.f64.s64 %fd71, %rd1204;
mul.f64 %fd72, %fd71, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3020, %fd72;
setp.eq.s32 %p502, %r4243, 0;
neg.f32 %f3021, %f3020;
selp.f32 %f5386, %f3020, %f3021, %p502;
$L__BB0_578:
add.s32 %r750, %r8369, 1;
and.b32 %r751, %r750, 1;
setp.eq.s32 %p503, %r751, 0;
selp.f32 %f624, %f5386, 0f3F800000, %p503;
mul.rn.f32 %f625, %f5386, %f5386;
mov.f32 %f5387, 0fB94D4153;
@%p503 bra $L__BB0_580;
mov.f32 %f3024, 0fBAB607ED;
mov.f32 %f3025, 0f37CBAC00;
fma.rn.f32 %f5387, %f3025, %f625, %f3024;
$L__BB0_580:
selp.f32 %f3026, 0f3C0885E4, 0f3D2AAABB, %p503;
fma.rn.f32 %f3027, %f5387, %f625, %f3026;
selp.f32 %f3028, 0fBE2AAAA8, 0fBEFFFFFF, %p503;
fma.rn.f32 %f3029, %f3027, %f625, %f3028;
mov.f32 %f3030, 0f00000000;
fma.rn.f32 %f3031, %f625, %f624, %f3030;
fma.rn.f32 %f5215, %f3029, %f3031, %f624;
and.b32 %r4249, %r750, 2;
setp.eq.s32 %p505, %r4249, 0;
@%p505 bra $L__BB0_582;
mov.f32 %f3033, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3033, %f3030;
$L__BB0_582:
selp.f32 %f632, %f5215, %f5216, %p9;
selp.f32 %f633, %f5213, %f5214, %p9;
@%p484 bra $L__BB0_584;
add.f32 %f5522, %f633, %f632;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_584:
@%p430 bra $L__BB0_613;
shl.b32 %r4251, %r12, 5;
neg.s32 %r752, %r4251;
setp.ge.s32 %p509, %r14, %r752;
@%p509 bra $L__BB0_598;
mul.f32 %f3036, %f5346, 0f3F22F983;
cvt.rni.s32.f32 %r8373, %f3036;
cvt.rn.f32.s32 %f3037, %r8373;
mov.f32 %f3038, 0fBFC90FDA;
fma.rn.f32 %f3039, %f3037, %f3038, %f5346;
mov.f32 %f3040, 0fB3A22168;
fma.rn.f32 %f3041, %f3037, %f3040, %f3039;
mov.f32 %f3042, 0fA7C234C5;
fma.rn.f32 %f5395, %f3037, %f3042, %f3041;
abs.f32 %f641, %f5346;
setp.ltu.f32 %p510, %f641, 0f47CE4780;
@%p510 bra $L__BB0_594;
setp.eq.f32 %p511, %f641, 0f7F800000;
@%p511 bra $L__BB0_593;
bra.uni $L__BB0_588;
$L__BB0_593:
mov.f32 %f3045, 0f00000000;
mul.rn.f32 %f5395, %f5346, %f3045;
mov.u32 %r8373, 0;
bra.uni $L__BB0_594;
$L__BB0_588:
mov.b32 %r754, %f5346;
shr.u32 %r4253, %r754, 23;
and.b32 %r4254, %r4253, 255;
add.s32 %r755, %r4254, -128;
shl.b32 %r4255, %r754, 8;
or.b32 %r756, %r4255, -2147483648;
shr.u32 %r757, %r755, 5;
mov.u64 %rd2570, 0;
mov.u32 %r8370, 0;
mov.u64 %rd1208, __cudart_i2opi_f;
mov.u64 %rd2571, %rd2570;
$L__BB0_589:
.pragma "nounroll";
shl.b64 %rd1207, %rd2570, 2;
add.s64 %rd1209, %rd1208, %rd1207;
ld.global.nc.u32 %r4256, [%rd1209];
mad.wide.u32 %rd1210, %r4256, %r756, %rd2571;
shr.u64 %rd2571, %rd1210, 32;
add.s64 %rd1211, %rd1, %rd1207;
st.local.u32 [%rd1211], %rd1210;
add.s32 %r8370, %r8370, 1;
cvt.s64.s32 %rd2570, %r8370;
setp.ne.s32 %p512, %r8370, 6;
@%p512 bra $L__BB0_589;
st.local.u32 [%rd5], %rd2571;
mov.u32 %r4257, 4;
sub.s32 %r760, %r4257, %r757;
mov.u32 %r4258, 6;
sub.s32 %r4259, %r4258, %r757;
mul.wide.s32 %rd1212, %r4259, 4;
add.s64 %rd1213, %rd1, %rd1212;
ld.local.u32 %r8371, [%rd1213];
ld.local.u32 %r8372, [%rd1213+-4];
and.b32 %r763, %r755, 31;
setp.eq.s32 %p513, %r763, 0;
@%p513 bra $L__BB0_592;
mov.u32 %r4260, 32;
sub.s32 %r4261, %r4260, %r763;
shr.u32 %r4262, %r8372, %r4261;
shl.b32 %r4263, %r8371, %r763;
add.s32 %r8371, %r4262, %r4263;
mul.wide.s32 %rd1214, %r760, 4;
add.s64 %rd1215, %rd1, %rd1214;
ld.local.u32 %r4264, [%rd1215];
shr.u32 %r4265, %r4264, %r4261;
shl.b32 %r4266, %r8372, %r763;
add.s32 %r8372, %r4265, %r4266;
$L__BB0_592:
and.b32 %r4267, %r754, -2147483648;
shr.u32 %r4268, %r8372, 30;
shl.b32 %r4269, %r8371, 2;
or.b32 %r4270, %r4268, %r4269;
shr.u32 %r4271, %r4270, 31;
shr.u32 %r4272, %r8371, 30;
add.s32 %r4273, %r4271, %r4272;
neg.s32 %r4274, %r4273;
setp.eq.s32 %p514, %r4267, 0;
selp.b32 %r8373, %r4273, %r4274, %p514;
setp.ne.s32 %p515, %r4271, 0;
xor.b32 %r4275, %r4267, -2147483648;
selp.b32 %r4276, %r4275, %r4267, %p515;
selp.b32 %r4277, -1, 0, %p515;
xor.b32 %r4278, %r4270, %r4277;
shl.b32 %r4279, %r8372, 2;
xor.b32 %r4280, %r4279, %r4277;
cvt.u64.u32 %rd1216, %r4278;
cvt.u64.u32 %rd1217, %r4280;
bfi.b64 %rd1218, %rd1216, %rd1217, 32, 32;
cvt.rn.f64.s64 %fd73, %rd1218;
mul.f64 %fd74, %fd73, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3043, %fd74;
setp.eq.s32 %p516, %r4276, 0;
neg.f32 %f3044, %f3043;
selp.f32 %f5395, %f3043, %f3044, %p516;
$L__BB0_594:
and.b32 %r770, %r8373, 1;
setp.eq.s32 %p517, %r770, 0;
selp.f32 %f645, %f5395, 0f3F800000, %p517;
mul.rn.f32 %f646, %f5395, %f5395;
mov.f32 %f5396, 0fB94D4153;
@%p517 bra $L__BB0_596;
mov.f32 %f3047, 0fBAB607ED;
mov.f32 %f3048, 0f37CBAC00;
fma.rn.f32 %f5396, %f3048, %f646, %f3047;
$L__BB0_596:
selp.f32 %f3049, 0f3C0885E4, 0f3D2AAABB, %p517;
fma.rn.f32 %f3050, %f5396, %f646, %f3049;
selp.f32 %f3051, 0fBE2AAAA8, 0fBEFFFFFF, %p517;
fma.rn.f32 %f3052, %f3050, %f646, %f3051;
mov.f32 %f3053, 0f00000000;
fma.rn.f32 %f3054, %f646, %f645, %f3053;
fma.rn.f32 %f5213, %f3052, %f3054, %f645;
and.b32 %r4282, %r8373, 2;
setp.eq.s32 %p519, %r4282, 0;
@%p519 bra $L__BB0_598;
mov.f32 %f3056, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3056, %f3053;
$L__BB0_598:
setp.lt.s32 %p10, %r14, %r752;
@%p509 bra $L__BB0_611;
mul.f32 %f3057, %f5338, 0f3F22F983;
cvt.rni.s32.f32 %r8377, %f3057;
cvt.rn.f32.s32 %f3058, %r8377;
mov.f32 %f3059, 0fBFC90FDA;
fma.rn.f32 %f3060, %f3058, %f3059, %f5338;
mov.f32 %f3061, 0fB3A22168;
fma.rn.f32 %f3062, %f3058, %f3061, %f3060;
mov.f32 %f3063, 0fA7C234C5;
fma.rn.f32 %f5399, %f3058, %f3063, %f3062;
abs.f32 %f654, %f5338;
setp.ltu.f32 %p521, %f654, 0f47CE4780;
@%p521 bra $L__BB0_607;
setp.eq.f32 %p522, %f654, 0f7F800000;
@%p522 bra $L__BB0_606;
bra.uni $L__BB0_601;
$L__BB0_606:
mov.f32 %f3066, 0f00000000;
mul.rn.f32 %f5399, %f5338, %f3066;
mov.u32 %r8377, 0;
bra.uni $L__BB0_607;
$L__BB0_601:
mov.b32 %r772, %f5338;
shr.u32 %r4284, %r772, 23;
and.b32 %r4285, %r4284, 255;
add.s32 %r773, %r4285, -128;
shl.b32 %r4286, %r772, 8;
or.b32 %r774, %r4286, -2147483648;
shr.u32 %r775, %r773, 5;
mov.u64 %rd2572, 0;
mov.u32 %r8374, 0;
mov.u64 %rd1222, __cudart_i2opi_f;
mov.u64 %rd2573, %rd2572;
$L__BB0_602:
.pragma "nounroll";
shl.b64 %rd1221, %rd2572, 2;
add.s64 %rd1223, %rd1222, %rd1221;
ld.global.nc.u32 %r4287, [%rd1223];
mad.wide.u32 %rd1224, %r4287, %r774, %rd2573;
shr.u64 %rd2573, %rd1224, 32;
add.s64 %rd1225, %rd1, %rd1221;
st.local.u32 [%rd1225], %rd1224;
add.s32 %r8374, %r8374, 1;
cvt.s64.s32 %rd2572, %r8374;
setp.ne.s32 %p523, %r8374, 6;
@%p523 bra $L__BB0_602;
st.local.u32 [%rd5], %rd2573;
mov.u32 %r4288, 4;
sub.s32 %r778, %r4288, %r775;
mov.u32 %r4289, 6;
sub.s32 %r4290, %r4289, %r775;
mul.wide.s32 %rd1226, %r4290, 4;
add.s64 %rd1227, %rd1, %rd1226;
ld.local.u32 %r8375, [%rd1227];
ld.local.u32 %r8376, [%rd1227+-4];
and.b32 %r781, %r773, 31;
setp.eq.s32 %p524, %r781, 0;
@%p524 bra $L__BB0_605;
mov.u32 %r4291, 32;
sub.s32 %r4292, %r4291, %r781;
shr.u32 %r4293, %r8376, %r4292;
shl.b32 %r4294, %r8375, %r781;
add.s32 %r8375, %r4293, %r4294;
mul.wide.s32 %rd1228, %r778, 4;
add.s64 %rd1229, %rd1, %rd1228;
ld.local.u32 %r4295, [%rd1229];
shr.u32 %r4296, %r4295, %r4292;
shl.b32 %r4297, %r8376, %r781;
add.s32 %r8376, %r4296, %r4297;
$L__BB0_605:
and.b32 %r4298, %r772, -2147483648;
shr.u32 %r4299, %r8376, 30;
shl.b32 %r4300, %r8375, 2;
or.b32 %r4301, %r4299, %r4300;
shr.u32 %r4302, %r4301, 31;
shr.u32 %r4303, %r8375, 30;
add.s32 %r4304, %r4302, %r4303;
neg.s32 %r4305, %r4304;
setp.eq.s32 %p525, %r4298, 0;
selp.b32 %r8377, %r4304, %r4305, %p525;
setp.ne.s32 %p526, %r4302, 0;
xor.b32 %r4306, %r4298, -2147483648;
selp.b32 %r4307, %r4306, %r4298, %p526;
selp.b32 %r4308, -1, 0, %p526;
xor.b32 %r4309, %r4301, %r4308;
shl.b32 %r4310, %r8376, 2;
xor.b32 %r4311, %r4310, %r4308;
cvt.u64.u32 %rd1230, %r4309;
cvt.u64.u32 %rd1231, %r4311;
bfi.b64 %rd1232, %rd1230, %rd1231, 32, 32;
cvt.rn.f64.s64 %fd75, %rd1232;
mul.f64 %fd76, %fd75, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3064, %fd76;
setp.eq.s32 %p527, %r4307, 0;
neg.f32 %f3065, %f3064;
selp.f32 %f5399, %f3064, %f3065, %p527;
$L__BB0_607:
add.s32 %r788, %r8377, 1;
and.b32 %r789, %r788, 1;
setp.eq.s32 %p528, %r789, 0;
selp.f32 %f658, %f5399, 0f3F800000, %p528;
mul.rn.f32 %f659, %f5399, %f5399;
mov.f32 %f5400, 0fB94D4153;
@%p528 bra $L__BB0_609;
mov.f32 %f3068, 0fBAB607ED;
mov.f32 %f3069, 0f37CBAC00;
fma.rn.f32 %f5400, %f3069, %f659, %f3068;
$L__BB0_609:
selp.f32 %f3070, 0f3C0885E4, 0f3D2AAABB, %p528;
fma.rn.f32 %f3071, %f5400, %f659, %f3070;
selp.f32 %f3072, 0fBE2AAAA8, 0fBEFFFFFF, %p528;
fma.rn.f32 %f3073, %f3071, %f659, %f3072;
mov.f32 %f3074, 0f00000000;
fma.rn.f32 %f3075, %f659, %f658, %f3074;
fma.rn.f32 %f5215, %f3073, %f3075, %f658;
and.b32 %r4313, %r788, 2;
setp.eq.s32 %p530, %r4313, 0;
@%p530 bra $L__BB0_611;
mov.f32 %f3077, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3077, %f3074;
$L__BB0_611:
selp.f32 %f666, %f5215, %f5216, %p10;
selp.f32 %f667, %f5213, %f5214, %p10;
@%p509 bra $L__BB0_613;
add.f32 %f5521, %f667, %f666;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_613:
@%p430 bra $L__BB0_642;
shl.b32 %r4315, %r12, 5;
mov.u32 %r4316, -32;
sub.s32 %r790, %r4316, %r4315;
setp.ge.s32 %p534, %r14, %r790;
@%p534 bra $L__BB0_627;
mul.f32 %f3080, %f5345, 0f3F22F983;
cvt.rni.s32.f32 %r8381, %f3080;
cvt.rn.f32.s32 %f3081, %r8381;
mov.f32 %f3082, 0fBFC90FDA;
fma.rn.f32 %f3083, %f3081, %f3082, %f5345;
mov.f32 %f3084, 0fB3A22168;
fma.rn.f32 %f3085, %f3081, %f3084, %f3083;
mov.f32 %f3086, 0fA7C234C5;
fma.rn.f32 %f5408, %f3081, %f3086, %f3085;
abs.f32 %f675, %f5345;
setp.ltu.f32 %p535, %f675, 0f47CE4780;
@%p535 bra $L__BB0_623;
setp.eq.f32 %p536, %f675, 0f7F800000;
@%p536 bra $L__BB0_622;
bra.uni $L__BB0_617;
$L__BB0_622:
mov.f32 %f3089, 0f00000000;
mul.rn.f32 %f5408, %f5345, %f3089;
mov.u32 %r8381, 0;
bra.uni $L__BB0_623;
$L__BB0_617:
mov.b32 %r792, %f5345;
shr.u32 %r4318, %r792, 23;
and.b32 %r4319, %r4318, 255;
add.s32 %r793, %r4319, -128;
shl.b32 %r4320, %r792, 8;
or.b32 %r794, %r4320, -2147483648;
shr.u32 %r795, %r793, 5;
mov.u64 %rd2574, 0;
mov.u32 %r8378, 0;
mov.u64 %rd1236, __cudart_i2opi_f;
mov.u64 %rd2575, %rd2574;
$L__BB0_618:
.pragma "nounroll";
shl.b64 %rd1235, %rd2574, 2;
add.s64 %rd1237, %rd1236, %rd1235;
ld.global.nc.u32 %r4321, [%rd1237];
mad.wide.u32 %rd1238, %r4321, %r794, %rd2575;
shr.u64 %rd2575, %rd1238, 32;
add.s64 %rd1239, %rd1, %rd1235;
st.local.u32 [%rd1239], %rd1238;
add.s32 %r8378, %r8378, 1;
cvt.s64.s32 %rd2574, %r8378;
setp.ne.s32 %p537, %r8378, 6;
@%p537 bra $L__BB0_618;
st.local.u32 [%rd5], %rd2575;
mov.u32 %r4322, 4;
sub.s32 %r798, %r4322, %r795;
mov.u32 %r4323, 6;
sub.s32 %r4324, %r4323, %r795;
mul.wide.s32 %rd1240, %r4324, 4;
add.s64 %rd1241, %rd1, %rd1240;
ld.local.u32 %r8379, [%rd1241];
ld.local.u32 %r8380, [%rd1241+-4];
and.b32 %r801, %r793, 31;
setp.eq.s32 %p538, %r801, 0;
@%p538 bra $L__BB0_621;
mov.u32 %r4325, 32;
sub.s32 %r4326, %r4325, %r801;
shr.u32 %r4327, %r8380, %r4326;
shl.b32 %r4328, %r8379, %r801;
add.s32 %r8379, %r4327, %r4328;
mul.wide.s32 %rd1242, %r798, 4;
add.s64 %rd1243, %rd1, %rd1242;
ld.local.u32 %r4329, [%rd1243];
shr.u32 %r4330, %r4329, %r4326;
shl.b32 %r4331, %r8380, %r801;
add.s32 %r8380, %r4330, %r4331;
$L__BB0_621:
and.b32 %r4332, %r792, -2147483648;
shr.u32 %r4333, %r8380, 30;
shl.b32 %r4334, %r8379, 2;
or.b32 %r4335, %r4333, %r4334;
shr.u32 %r4336, %r4335, 31;
shr.u32 %r4337, %r8379, 30;
add.s32 %r4338, %r4336, %r4337;
neg.s32 %r4339, %r4338;
setp.eq.s32 %p539, %r4332, 0;
selp.b32 %r8381, %r4338, %r4339, %p539;
setp.ne.s32 %p540, %r4336, 0;
xor.b32 %r4340, %r4332, -2147483648;
selp.b32 %r4341, %r4340, %r4332, %p540;
selp.b32 %r4342, -1, 0, %p540;
xor.b32 %r4343, %r4335, %r4342;
shl.b32 %r4344, %r8380, 2;
xor.b32 %r4345, %r4344, %r4342;
cvt.u64.u32 %rd1244, %r4343;
cvt.u64.u32 %rd1245, %r4345;
bfi.b64 %rd1246, %rd1244, %rd1245, 32, 32;
cvt.rn.f64.s64 %fd77, %rd1246;
mul.f64 %fd78, %fd77, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3087, %fd78;
setp.eq.s32 %p541, %r4341, 0;
neg.f32 %f3088, %f3087;
selp.f32 %f5408, %f3087, %f3088, %p541;
$L__BB0_623:
and.b32 %r808, %r8381, 1;
setp.eq.s32 %p542, %r808, 0;
selp.f32 %f679, %f5408, 0f3F800000, %p542;
mul.rn.f32 %f680, %f5408, %f5408;
mov.f32 %f5409, 0fB94D4153;
@%p542 bra $L__BB0_625;
mov.f32 %f3091, 0fBAB607ED;
mov.f32 %f3092, 0f37CBAC00;
fma.rn.f32 %f5409, %f3092, %f680, %f3091;
$L__BB0_625:
selp.f32 %f3093, 0f3C0885E4, 0f3D2AAABB, %p542;
fma.rn.f32 %f3094, %f5409, %f680, %f3093;
selp.f32 %f3095, 0fBE2AAAA8, 0fBEFFFFFF, %p542;
fma.rn.f32 %f3096, %f3094, %f680, %f3095;
mov.f32 %f3097, 0f00000000;
fma.rn.f32 %f3098, %f680, %f679, %f3097;
fma.rn.f32 %f5213, %f3096, %f3098, %f679;
and.b32 %r4347, %r8381, 2;
setp.eq.s32 %p544, %r4347, 0;
@%p544 bra $L__BB0_627;
mov.f32 %f3100, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3100, %f3097;
$L__BB0_627:
setp.lt.s32 %p11, %r14, %r790;
@%p534 bra $L__BB0_640;
mul.f32 %f3101, %f5337, 0f3F22F983;
cvt.rni.s32.f32 %r8385, %f3101;
cvt.rn.f32.s32 %f3102, %r8385;
mov.f32 %f3103, 0fBFC90FDA;
fma.rn.f32 %f3104, %f3102, %f3103, %f5337;
mov.f32 %f3105, 0fB3A22168;
fma.rn.f32 %f3106, %f3102, %f3105, %f3104;
mov.f32 %f3107, 0fA7C234C5;
fma.rn.f32 %f5412, %f3102, %f3107, %f3106;
abs.f32 %f688, %f5337;
setp.ltu.f32 %p546, %f688, 0f47CE4780;
@%p546 bra $L__BB0_636;
setp.eq.f32 %p547, %f688, 0f7F800000;
@%p547 bra $L__BB0_635;
bra.uni $L__BB0_630;
$L__BB0_635:
mov.f32 %f3110, 0f00000000;
mul.rn.f32 %f5412, %f5337, %f3110;
mov.u32 %r8385, 0;
bra.uni $L__BB0_636;
$L__BB0_630:
mov.b32 %r810, %f5337;
shr.u32 %r4349, %r810, 23;
and.b32 %r4350, %r4349, 255;
add.s32 %r811, %r4350, -128;
shl.b32 %r4351, %r810, 8;
or.b32 %r812, %r4351, -2147483648;
shr.u32 %r813, %r811, 5;
mov.u64 %rd2576, 0;
mov.u32 %r8382, 0;
mov.u64 %rd1250, __cudart_i2opi_f;
mov.u64 %rd2577, %rd2576;
$L__BB0_631:
.pragma "nounroll";
shl.b64 %rd1249, %rd2576, 2;
add.s64 %rd1251, %rd1250, %rd1249;
ld.global.nc.u32 %r4352, [%rd1251];
mad.wide.u32 %rd1252, %r4352, %r812, %rd2577;
shr.u64 %rd2577, %rd1252, 32;
add.s64 %rd1253, %rd1, %rd1249;
st.local.u32 [%rd1253], %rd1252;
add.s32 %r8382, %r8382, 1;
cvt.s64.s32 %rd2576, %r8382;
setp.ne.s32 %p548, %r8382, 6;
@%p548 bra $L__BB0_631;
st.local.u32 [%rd5], %rd2577;
mov.u32 %r4353, 4;
sub.s32 %r816, %r4353, %r813;
mov.u32 %r4354, 6;
sub.s32 %r4355, %r4354, %r813;
mul.wide.s32 %rd1254, %r4355, 4;
add.s64 %rd1255, %rd1, %rd1254;
ld.local.u32 %r8383, [%rd1255];
ld.local.u32 %r8384, [%rd1255+-4];
and.b32 %r819, %r811, 31;
setp.eq.s32 %p549, %r819, 0;
@%p549 bra $L__BB0_634;
mov.u32 %r4356, 32;
sub.s32 %r4357, %r4356, %r819;
shr.u32 %r4358, %r8384, %r4357;
shl.b32 %r4359, %r8383, %r819;
add.s32 %r8383, %r4358, %r4359;
mul.wide.s32 %rd1256, %r816, 4;
add.s64 %rd1257, %rd1, %rd1256;
ld.local.u32 %r4360, [%rd1257];
shr.u32 %r4361, %r4360, %r4357;
shl.b32 %r4362, %r8384, %r819;
add.s32 %r8384, %r4361, %r4362;
$L__BB0_634:
and.b32 %r4363, %r810, -2147483648;
shr.u32 %r4364, %r8384, 30;
shl.b32 %r4365, %r8383, 2;
or.b32 %r4366, %r4364, %r4365;
shr.u32 %r4367, %r4366, 31;
shr.u32 %r4368, %r8383, 30;
add.s32 %r4369, %r4367, %r4368;
neg.s32 %r4370, %r4369;
setp.eq.s32 %p550, %r4363, 0;
selp.b32 %r8385, %r4369, %r4370, %p550;
setp.ne.s32 %p551, %r4367, 0;
xor.b32 %r4371, %r4363, -2147483648;
selp.b32 %r4372, %r4371, %r4363, %p551;
selp.b32 %r4373, -1, 0, %p551;
xor.b32 %r4374, %r4366, %r4373;
shl.b32 %r4375, %r8384, 2;
xor.b32 %r4376, %r4375, %r4373;
cvt.u64.u32 %rd1258, %r4374;
cvt.u64.u32 %rd1259, %r4376;
bfi.b64 %rd1260, %rd1258, %rd1259, 32, 32;
cvt.rn.f64.s64 %fd79, %rd1260;
mul.f64 %fd80, %fd79, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3108, %fd80;
setp.eq.s32 %p552, %r4372, 0;
neg.f32 %f3109, %f3108;
selp.f32 %f5412, %f3108, %f3109, %p552;
$L__BB0_636:
add.s32 %r826, %r8385, 1;
and.b32 %r827, %r826, 1;
setp.eq.s32 %p553, %r827, 0;
selp.f32 %f692, %f5412, 0f3F800000, %p553;
mul.rn.f32 %f693, %f5412, %f5412;
mov.f32 %f5413, 0fB94D4153;
@%p553 bra $L__BB0_638;
mov.f32 %f3112, 0fBAB607ED;
mov.f32 %f3113, 0f37CBAC00;
fma.rn.f32 %f5413, %f3113, %f693, %f3112;
$L__BB0_638:
selp.f32 %f3114, 0f3C0885E4, 0f3D2AAABB, %p553;
fma.rn.f32 %f3115, %f5413, %f693, %f3114;
selp.f32 %f3116, 0fBE2AAAA8, 0fBEFFFFFF, %p553;
fma.rn.f32 %f3117, %f3115, %f693, %f3116;
mov.f32 %f3118, 0f00000000;
fma.rn.f32 %f3119, %f693, %f692, %f3118;
fma.rn.f32 %f5215, %f3117, %f3119, %f692;
and.b32 %r4378, %r826, 2;
setp.eq.s32 %p555, %r4378, 0;
@%p555 bra $L__BB0_640;
mov.f32 %f3121, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3121, %f3118;
$L__BB0_640:
selp.f32 %f700, %f5215, %f5216, %p11;
selp.f32 %f701, %f5213, %f5214, %p11;
@%p534 bra $L__BB0_642;
add.f32 %f5520, %f701, %f700;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_642:
@%p434 bra $L__BB0_671;
shl.b32 %r4380, %r12, 5;
neg.s32 %r828, %r4380;
setp.ge.s32 %p559, %r14, %r828;
@%p559 bra $L__BB0_656;
mul.f32 %f3124, %f5344, 0f3F22F983;
cvt.rni.s32.f32 %r8389, %f3124;
cvt.rn.f32.s32 %f3125, %r8389;
mov.f32 %f3126, 0fBFC90FDA;
fma.rn.f32 %f3127, %f3125, %f3126, %f5344;
mov.f32 %f3128, 0fB3A22168;
fma.rn.f32 %f3129, %f3125, %f3128, %f3127;
mov.f32 %f3130, 0fA7C234C5;
fma.rn.f32 %f5421, %f3125, %f3130, %f3129;
abs.f32 %f709, %f5344;
setp.ltu.f32 %p560, %f709, 0f47CE4780;
@%p560 bra $L__BB0_652;
setp.eq.f32 %p561, %f709, 0f7F800000;
@%p561 bra $L__BB0_651;
bra.uni $L__BB0_646;
$L__BB0_651:
mov.f32 %f3133, 0f00000000;
mul.rn.f32 %f5421, %f5344, %f3133;
mov.u32 %r8389, 0;
bra.uni $L__BB0_652;
$L__BB0_646:
mov.b32 %r830, %f5344;
shr.u32 %r4382, %r830, 23;
and.b32 %r4383, %r4382, 255;
add.s32 %r831, %r4383, -128;
shl.b32 %r4384, %r830, 8;
or.b32 %r832, %r4384, -2147483648;
shr.u32 %r833, %r831, 5;
mov.u64 %rd2578, 0;
mov.u32 %r8386, 0;
mov.u64 %rd1264, __cudart_i2opi_f;
mov.u64 %rd2579, %rd2578;
$L__BB0_647:
.pragma "nounroll";
shl.b64 %rd1263, %rd2578, 2;
add.s64 %rd1265, %rd1264, %rd1263;
ld.global.nc.u32 %r4385, [%rd1265];
mad.wide.u32 %rd1266, %r4385, %r832, %rd2579;
shr.u64 %rd2579, %rd1266, 32;
add.s64 %rd1267, %rd1, %rd1263;
st.local.u32 [%rd1267], %rd1266;
add.s32 %r8386, %r8386, 1;
cvt.s64.s32 %rd2578, %r8386;
setp.ne.s32 %p562, %r8386, 6;
@%p562 bra $L__BB0_647;
st.local.u32 [%rd5], %rd2579;
mov.u32 %r4386, 4;
sub.s32 %r836, %r4386, %r833;
mov.u32 %r4387, 6;
sub.s32 %r4388, %r4387, %r833;
mul.wide.s32 %rd1268, %r4388, 4;
add.s64 %rd1269, %rd1, %rd1268;
ld.local.u32 %r8387, [%rd1269];
ld.local.u32 %r8388, [%rd1269+-4];
and.b32 %r839, %r831, 31;
setp.eq.s32 %p563, %r839, 0;
@%p563 bra $L__BB0_650;
mov.u32 %r4389, 32;
sub.s32 %r4390, %r4389, %r839;
shr.u32 %r4391, %r8388, %r4390;
shl.b32 %r4392, %r8387, %r839;
add.s32 %r8387, %r4391, %r4392;
mul.wide.s32 %rd1270, %r836, 4;
add.s64 %rd1271, %rd1, %rd1270;
ld.local.u32 %r4393, [%rd1271];
shr.u32 %r4394, %r4393, %r4390;
shl.b32 %r4395, %r8388, %r839;
add.s32 %r8388, %r4394, %r4395;
$L__BB0_650:
and.b32 %r4396, %r830, -2147483648;
shr.u32 %r4397, %r8388, 30;
shl.b32 %r4398, %r8387, 2;
or.b32 %r4399, %r4397, %r4398;
shr.u32 %r4400, %r4399, 31;
shr.u32 %r4401, %r8387, 30;
add.s32 %r4402, %r4400, %r4401;
neg.s32 %r4403, %r4402;
setp.eq.s32 %p564, %r4396, 0;
selp.b32 %r8389, %r4402, %r4403, %p564;
setp.ne.s32 %p565, %r4400, 0;
xor.b32 %r4404, %r4396, -2147483648;
selp.b32 %r4405, %r4404, %r4396, %p565;
selp.b32 %r4406, -1, 0, %p565;
xor.b32 %r4407, %r4399, %r4406;
shl.b32 %r4408, %r8388, 2;
xor.b32 %r4409, %r4408, %r4406;
cvt.u64.u32 %rd1272, %r4407;
cvt.u64.u32 %rd1273, %r4409;
bfi.b64 %rd1274, %rd1272, %rd1273, 32, 32;
cvt.rn.f64.s64 %fd81, %rd1274;
mul.f64 %fd82, %fd81, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3131, %fd82;
setp.eq.s32 %p566, %r4405, 0;
neg.f32 %f3132, %f3131;
selp.f32 %f5421, %f3131, %f3132, %p566;
$L__BB0_652:
and.b32 %r846, %r8389, 1;
setp.eq.s32 %p567, %r846, 0;
selp.f32 %f713, %f5421, 0f3F800000, %p567;
mul.rn.f32 %f714, %f5421, %f5421;
mov.f32 %f5422, 0fB94D4153;
@%p567 bra $L__BB0_654;
mov.f32 %f3135, 0fBAB607ED;
mov.f32 %f3136, 0f37CBAC00;
fma.rn.f32 %f5422, %f3136, %f714, %f3135;
$L__BB0_654:
selp.f32 %f3137, 0f3C0885E4, 0f3D2AAABB, %p567;
fma.rn.f32 %f3138, %f5422, %f714, %f3137;
selp.f32 %f3139, 0fBE2AAAA8, 0fBEFFFFFF, %p567;
fma.rn.f32 %f3140, %f3138, %f714, %f3139;
mov.f32 %f3141, 0f00000000;
fma.rn.f32 %f3142, %f714, %f713, %f3141;
fma.rn.f32 %f5213, %f3140, %f3142, %f713;
and.b32 %r4411, %r8389, 2;
setp.eq.s32 %p569, %r4411, 0;
@%p569 bra $L__BB0_656;
mov.f32 %f3144, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3144, %f3141;
$L__BB0_656:
setp.lt.s32 %p12, %r14, %r828;
@%p559 bra $L__BB0_669;
mul.f32 %f3145, %f5336, 0f3F22F983;
cvt.rni.s32.f32 %r8393, %f3145;
cvt.rn.f32.s32 %f3146, %r8393;
mov.f32 %f3147, 0fBFC90FDA;
fma.rn.f32 %f3148, %f3146, %f3147, %f5336;
mov.f32 %f3149, 0fB3A22168;
fma.rn.f32 %f3150, %f3146, %f3149, %f3148;
mov.f32 %f3151, 0fA7C234C5;
fma.rn.f32 %f5425, %f3146, %f3151, %f3150;
abs.f32 %f722, %f5336;
setp.ltu.f32 %p571, %f722, 0f47CE4780;
@%p571 bra $L__BB0_665;
setp.eq.f32 %p572, %f722, 0f7F800000;
@%p572 bra $L__BB0_664;
bra.uni $L__BB0_659;
$L__BB0_664:
mov.f32 %f3154, 0f00000000;
mul.rn.f32 %f5425, %f5336, %f3154;
mov.u32 %r8393, 0;
bra.uni $L__BB0_665;
$L__BB0_659:
mov.b32 %r848, %f5336;
shr.u32 %r4413, %r848, 23;
and.b32 %r4414, %r4413, 255;
add.s32 %r849, %r4414, -128;
shl.b32 %r4415, %r848, 8;
or.b32 %r850, %r4415, -2147483648;
shr.u32 %r851, %r849, 5;
mov.u64 %rd2580, 0;
mov.u32 %r8390, 0;
mov.u64 %rd1278, __cudart_i2opi_f;
mov.u64 %rd2581, %rd2580;
$L__BB0_660:
.pragma "nounroll";
shl.b64 %rd1277, %rd2580, 2;
add.s64 %rd1279, %rd1278, %rd1277;
ld.global.nc.u32 %r4416, [%rd1279];
mad.wide.u32 %rd1280, %r4416, %r850, %rd2581;
shr.u64 %rd2581, %rd1280, 32;
add.s64 %rd1281, %rd1, %rd1277;
st.local.u32 [%rd1281], %rd1280;
add.s32 %r8390, %r8390, 1;
cvt.s64.s32 %rd2580, %r8390;
setp.ne.s32 %p573, %r8390, 6;
@%p573 bra $L__BB0_660;
st.local.u32 [%rd5], %rd2581;
mov.u32 %r4417, 4;
sub.s32 %r854, %r4417, %r851;
mov.u32 %r4418, 6;
sub.s32 %r4419, %r4418, %r851;
mul.wide.s32 %rd1282, %r4419, 4;
add.s64 %rd1283, %rd1, %rd1282;
ld.local.u32 %r8391, [%rd1283];
ld.local.u32 %r8392, [%rd1283+-4];
and.b32 %r857, %r849, 31;
setp.eq.s32 %p574, %r857, 0;
@%p574 bra $L__BB0_663;
mov.u32 %r4420, 32;
sub.s32 %r4421, %r4420, %r857;
shr.u32 %r4422, %r8392, %r4421;
shl.b32 %r4423, %r8391, %r857;
add.s32 %r8391, %r4422, %r4423;
mul.wide.s32 %rd1284, %r854, 4;
add.s64 %rd1285, %rd1, %rd1284;
ld.local.u32 %r4424, [%rd1285];
shr.u32 %r4425, %r4424, %r4421;
shl.b32 %r4426, %r8392, %r857;
add.s32 %r8392, %r4425, %r4426;
$L__BB0_663:
and.b32 %r4427, %r848, -2147483648;
shr.u32 %r4428, %r8392, 30;
shl.b32 %r4429, %r8391, 2;
or.b32 %r4430, %r4428, %r4429;
shr.u32 %r4431, %r4430, 31;
shr.u32 %r4432, %r8391, 30;
add.s32 %r4433, %r4431, %r4432;
neg.s32 %r4434, %r4433;
setp.eq.s32 %p575, %r4427, 0;
selp.b32 %r8393, %r4433, %r4434, %p575;
setp.ne.s32 %p576, %r4431, 0;
xor.b32 %r4435, %r4427, -2147483648;
selp.b32 %r4436, %r4435, %r4427, %p576;
selp.b32 %r4437, -1, 0, %p576;
xor.b32 %r4438, %r4430, %r4437;
shl.b32 %r4439, %r8392, 2;
xor.b32 %r4440, %r4439, %r4437;
cvt.u64.u32 %rd1286, %r4438;
cvt.u64.u32 %rd1287, %r4440;
bfi.b64 %rd1288, %rd1286, %rd1287, 32, 32;
cvt.rn.f64.s64 %fd83, %rd1288;
mul.f64 %fd84, %fd83, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3152, %fd84;
setp.eq.s32 %p577, %r4436, 0;
neg.f32 %f3153, %f3152;
selp.f32 %f5425, %f3152, %f3153, %p577;
$L__BB0_665:
add.s32 %r864, %r8393, 1;
and.b32 %r865, %r864, 1;
setp.eq.s32 %p578, %r865, 0;
selp.f32 %f726, %f5425, 0f3F800000, %p578;
mul.rn.f32 %f727, %f5425, %f5425;
mov.f32 %f5426, 0fB94D4153;
@%p578 bra $L__BB0_667;
mov.f32 %f3156, 0fBAB607ED;
mov.f32 %f3157, 0f37CBAC00;
fma.rn.f32 %f5426, %f3157, %f727, %f3156;
$L__BB0_667:
selp.f32 %f3158, 0f3C0885E4, 0f3D2AAABB, %p578;
fma.rn.f32 %f3159, %f5426, %f727, %f3158;
selp.f32 %f3160, 0fBE2AAAA8, 0fBEFFFFFF, %p578;
fma.rn.f32 %f3161, %f3159, %f727, %f3160;
mov.f32 %f3162, 0f00000000;
fma.rn.f32 %f3163, %f727, %f726, %f3162;
fma.rn.f32 %f5215, %f3161, %f3163, %f726;
and.b32 %r4442, %r864, 2;
setp.eq.s32 %p580, %r4442, 0;
@%p580 bra $L__BB0_669;
mov.f32 %f3165, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3165, %f3162;
$L__BB0_669:
selp.f32 %f734, %f5215, %f5216, %p12;
selp.f32 %f735, %f5213, %f5214, %p12;
@%p559 bra $L__BB0_671;
add.f32 %f5519, %f735, %f734;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_671:
@%p434 bra $L__BB0_700;
shl.b32 %r4444, %r12, 5;
mov.u32 %r4445, -32;
sub.s32 %r866, %r4445, %r4444;
setp.ge.s32 %p584, %r14, %r866;
@%p584 bra $L__BB0_685;
mul.f32 %f3168, %f5343, 0f3F22F983;
cvt.rni.s32.f32 %r8397, %f3168;
cvt.rn.f32.s32 %f3169, %r8397;
mov.f32 %f3170, 0fBFC90FDA;
fma.rn.f32 %f3171, %f3169, %f3170, %f5343;
mov.f32 %f3172, 0fB3A22168;
fma.rn.f32 %f3173, %f3169, %f3172, %f3171;
mov.f32 %f3174, 0fA7C234C5;
fma.rn.f32 %f5434, %f3169, %f3174, %f3173;
abs.f32 %f743, %f5343;
setp.ltu.f32 %p585, %f743, 0f47CE4780;
@%p585 bra $L__BB0_681;
setp.eq.f32 %p586, %f743, 0f7F800000;
@%p586 bra $L__BB0_680;
bra.uni $L__BB0_675;
$L__BB0_680:
mov.f32 %f3177, 0f00000000;
mul.rn.f32 %f5434, %f5343, %f3177;
mov.u32 %r8397, 0;
bra.uni $L__BB0_681;
$L__BB0_675:
mov.b32 %r868, %f5343;
shr.u32 %r4447, %r868, 23;
and.b32 %r4448, %r4447, 255;
add.s32 %r869, %r4448, -128;
shl.b32 %r4449, %r868, 8;
or.b32 %r870, %r4449, -2147483648;
shr.u32 %r871, %r869, 5;
mov.u64 %rd2582, 0;
mov.u32 %r8394, 0;
mov.u64 %rd1292, __cudart_i2opi_f;
mov.u64 %rd2583, %rd2582;
$L__BB0_676:
.pragma "nounroll";
shl.b64 %rd1291, %rd2582, 2;
add.s64 %rd1293, %rd1292, %rd1291;
ld.global.nc.u32 %r4450, [%rd1293];
mad.wide.u32 %rd1294, %r4450, %r870, %rd2583;
shr.u64 %rd2583, %rd1294, 32;
add.s64 %rd1295, %rd1, %rd1291;
st.local.u32 [%rd1295], %rd1294;
add.s32 %r8394, %r8394, 1;
cvt.s64.s32 %rd2582, %r8394;
setp.ne.s32 %p587, %r8394, 6;
@%p587 bra $L__BB0_676;
st.local.u32 [%rd5], %rd2583;
mov.u32 %r4451, 4;
sub.s32 %r874, %r4451, %r871;
mov.u32 %r4452, 6;
sub.s32 %r4453, %r4452, %r871;
mul.wide.s32 %rd1296, %r4453, 4;
add.s64 %rd1297, %rd1, %rd1296;
ld.local.u32 %r8395, [%rd1297];
ld.local.u32 %r8396, [%rd1297+-4];
and.b32 %r877, %r869, 31;
setp.eq.s32 %p588, %r877, 0;
@%p588 bra $L__BB0_679;
mov.u32 %r4454, 32;
sub.s32 %r4455, %r4454, %r877;
shr.u32 %r4456, %r8396, %r4455;
shl.b32 %r4457, %r8395, %r877;
add.s32 %r8395, %r4456, %r4457;
mul.wide.s32 %rd1298, %r874, 4;
add.s64 %rd1299, %rd1, %rd1298;
ld.local.u32 %r4458, [%rd1299];
shr.u32 %r4459, %r4458, %r4455;
shl.b32 %r4460, %r8396, %r877;
add.s32 %r8396, %r4459, %r4460;
$L__BB0_679:
and.b32 %r4461, %r868, -2147483648;
shr.u32 %r4462, %r8396, 30;
shl.b32 %r4463, %r8395, 2;
or.b32 %r4464, %r4462, %r4463;
shr.u32 %r4465, %r4464, 31;
shr.u32 %r4466, %r8395, 30;
add.s32 %r4467, %r4465, %r4466;
neg.s32 %r4468, %r4467;
setp.eq.s32 %p589, %r4461, 0;
selp.b32 %r8397, %r4467, %r4468, %p589;
setp.ne.s32 %p590, %r4465, 0;
xor.b32 %r4469, %r4461, -2147483648;
selp.b32 %r4470, %r4469, %r4461, %p590;
selp.b32 %r4471, -1, 0, %p590;
xor.b32 %r4472, %r4464, %r4471;
shl.b32 %r4473, %r8396, 2;
xor.b32 %r4474, %r4473, %r4471;
cvt.u64.u32 %rd1300, %r4472;
cvt.u64.u32 %rd1301, %r4474;
bfi.b64 %rd1302, %rd1300, %rd1301, 32, 32;
cvt.rn.f64.s64 %fd85, %rd1302;
mul.f64 %fd86, %fd85, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3175, %fd86;
setp.eq.s32 %p591, %r4470, 0;
neg.f32 %f3176, %f3175;
selp.f32 %f5434, %f3175, %f3176, %p591;
$L__BB0_681:
and.b32 %r884, %r8397, 1;
setp.eq.s32 %p592, %r884, 0;
selp.f32 %f747, %f5434, 0f3F800000, %p592;
mul.rn.f32 %f748, %f5434, %f5434;
mov.f32 %f5435, 0fB94D4153;
@%p592 bra $L__BB0_683;
mov.f32 %f3179, 0fBAB607ED;
mov.f32 %f3180, 0f37CBAC00;
fma.rn.f32 %f5435, %f3180, %f748, %f3179;
$L__BB0_683:
selp.f32 %f3181, 0f3C0885E4, 0f3D2AAABB, %p592;
fma.rn.f32 %f3182, %f5435, %f748, %f3181;
selp.f32 %f3183, 0fBE2AAAA8, 0fBEFFFFFF, %p592;
fma.rn.f32 %f3184, %f3182, %f748, %f3183;
mov.f32 %f3185, 0f00000000;
fma.rn.f32 %f3186, %f748, %f747, %f3185;
fma.rn.f32 %f5213, %f3184, %f3186, %f747;
and.b32 %r4476, %r8397, 2;
setp.eq.s32 %p594, %r4476, 0;
@%p594 bra $L__BB0_685;
mov.f32 %f3188, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3188, %f3185;
$L__BB0_685:
setp.lt.s32 %p13, %r14, %r866;
@%p584 bra $L__BB0_698;
mul.f32 %f3189, %f5335, 0f3F22F983;
cvt.rni.s32.f32 %r8401, %f3189;
cvt.rn.f32.s32 %f3190, %r8401;
mov.f32 %f3191, 0fBFC90FDA;
fma.rn.f32 %f3192, %f3190, %f3191, %f5335;
mov.f32 %f3193, 0fB3A22168;
fma.rn.f32 %f3194, %f3190, %f3193, %f3192;
mov.f32 %f3195, 0fA7C234C5;
fma.rn.f32 %f5438, %f3190, %f3195, %f3194;
abs.f32 %f756, %f5335;
setp.ltu.f32 %p596, %f756, 0f47CE4780;
@%p596 bra $L__BB0_694;
setp.eq.f32 %p597, %f756, 0f7F800000;
@%p597 bra $L__BB0_693;
bra.uni $L__BB0_688;
$L__BB0_693:
mov.f32 %f3198, 0f00000000;
mul.rn.f32 %f5438, %f5335, %f3198;
mov.u32 %r8401, 0;
bra.uni $L__BB0_694;
$L__BB0_688:
mov.b32 %r886, %f5335;
shr.u32 %r4478, %r886, 23;
and.b32 %r4479, %r4478, 255;
add.s32 %r887, %r4479, -128;
shl.b32 %r4480, %r886, 8;
or.b32 %r888, %r4480, -2147483648;
shr.u32 %r889, %r887, 5;
mov.u64 %rd2584, 0;
mov.u32 %r8398, 0;
mov.u64 %rd1306, __cudart_i2opi_f;
mov.u64 %rd2585, %rd2584;
$L__BB0_689:
.pragma "nounroll";
shl.b64 %rd1305, %rd2584, 2;
add.s64 %rd1307, %rd1306, %rd1305;
ld.global.nc.u32 %r4481, [%rd1307];
mad.wide.u32 %rd1308, %r4481, %r888, %rd2585;
shr.u64 %rd2585, %rd1308, 32;
add.s64 %rd1309, %rd1, %rd1305;
st.local.u32 [%rd1309], %rd1308;
add.s32 %r8398, %r8398, 1;
cvt.s64.s32 %rd2584, %r8398;
setp.ne.s32 %p598, %r8398, 6;
@%p598 bra $L__BB0_689;
st.local.u32 [%rd5], %rd2585;
mov.u32 %r4482, 4;
sub.s32 %r892, %r4482, %r889;
mov.u32 %r4483, 6;
sub.s32 %r4484, %r4483, %r889;
mul.wide.s32 %rd1310, %r4484, 4;
add.s64 %rd1311, %rd1, %rd1310;
ld.local.u32 %r8399, [%rd1311];
ld.local.u32 %r8400, [%rd1311+-4];
and.b32 %r895, %r887, 31;
setp.eq.s32 %p599, %r895, 0;
@%p599 bra $L__BB0_692;
mov.u32 %r4485, 32;
sub.s32 %r4486, %r4485, %r895;
shr.u32 %r4487, %r8400, %r4486;
shl.b32 %r4488, %r8399, %r895;
add.s32 %r8399, %r4487, %r4488;
mul.wide.s32 %rd1312, %r892, 4;
add.s64 %rd1313, %rd1, %rd1312;
ld.local.u32 %r4489, [%rd1313];
shr.u32 %r4490, %r4489, %r4486;
shl.b32 %r4491, %r8400, %r895;
add.s32 %r8400, %r4490, %r4491;
$L__BB0_692:
and.b32 %r4492, %r886, -2147483648;
shr.u32 %r4493, %r8400, 30;
shl.b32 %r4494, %r8399, 2;
or.b32 %r4495, %r4493, %r4494;
shr.u32 %r4496, %r4495, 31;
shr.u32 %r4497, %r8399, 30;
add.s32 %r4498, %r4496, %r4497;
neg.s32 %r4499, %r4498;
setp.eq.s32 %p600, %r4492, 0;
selp.b32 %r8401, %r4498, %r4499, %p600;
setp.ne.s32 %p601, %r4496, 0;
xor.b32 %r4500, %r4492, -2147483648;
selp.b32 %r4501, %r4500, %r4492, %p601;
selp.b32 %r4502, -1, 0, %p601;
xor.b32 %r4503, %r4495, %r4502;
shl.b32 %r4504, %r8400, 2;
xor.b32 %r4505, %r4504, %r4502;
cvt.u64.u32 %rd1314, %r4503;
cvt.u64.u32 %rd1315, %r4505;
bfi.b64 %rd1316, %rd1314, %rd1315, 32, 32;
cvt.rn.f64.s64 %fd87, %rd1316;
mul.f64 %fd88, %fd87, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3196, %fd88;
setp.eq.s32 %p602, %r4501, 0;
neg.f32 %f3197, %f3196;
selp.f32 %f5438, %f3196, %f3197, %p602;
$L__BB0_694:
add.s32 %r902, %r8401, 1;
and.b32 %r903, %r902, 1;
setp.eq.s32 %p603, %r903, 0;
selp.f32 %f760, %f5438, 0f3F800000, %p603;
mul.rn.f32 %f761, %f5438, %f5438;
mov.f32 %f5439, 0fB94D4153;
@%p603 bra $L__BB0_696;
mov.f32 %f3200, 0fBAB607ED;
mov.f32 %f3201, 0f37CBAC00;
fma.rn.f32 %f5439, %f3201, %f761, %f3200;
$L__BB0_696:
selp.f32 %f3202, 0f3C0885E4, 0f3D2AAABB, %p603;
fma.rn.f32 %f3203, %f5439, %f761, %f3202;
selp.f32 %f3204, 0fBE2AAAA8, 0fBEFFFFFF, %p603;
fma.rn.f32 %f3205, %f3203, %f761, %f3204;
mov.f32 %f3206, 0f00000000;
fma.rn.f32 %f3207, %f761, %f760, %f3206;
fma.rn.f32 %f5215, %f3205, %f3207, %f760;
and.b32 %r4507, %r902, 2;
setp.eq.s32 %p605, %r4507, 0;
@%p605 bra $L__BB0_698;
mov.f32 %f3209, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3209, %f3206;
$L__BB0_698:
selp.f32 %f768, %f5215, %f5216, %p13;
selp.f32 %f769, %f5213, %f5214, %p13;
@%p584 bra $L__BB0_700;
add.f32 %f5518, %f769, %f768;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_700:
@%p438 bra $L__BB0_729;
shl.b32 %r4509, %r12, 5;
neg.s32 %r904, %r4509;
setp.ge.s32 %p609, %r14, %r904;
@%p609 bra $L__BB0_714;
mul.f32 %f3212, %f5342, 0f3F22F983;
cvt.rni.s32.f32 %r8405, %f3212;
cvt.rn.f32.s32 %f3213, %r8405;
mov.f32 %f3214, 0fBFC90FDA;
fma.rn.f32 %f3215, %f3213, %f3214, %f5342;
mov.f32 %f3216, 0fB3A22168;
fma.rn.f32 %f3217, %f3213, %f3216, %f3215;
mov.f32 %f3218, 0fA7C234C5;
fma.rn.f32 %f5447, %f3213, %f3218, %f3217;
abs.f32 %f777, %f5342;
setp.ltu.f32 %p610, %f777, 0f47CE4780;
@%p610 bra $L__BB0_710;
setp.eq.f32 %p611, %f777, 0f7F800000;
@%p611 bra $L__BB0_709;
bra.uni $L__BB0_704;
$L__BB0_709:
mov.f32 %f3221, 0f00000000;
mul.rn.f32 %f5447, %f5342, %f3221;
mov.u32 %r8405, 0;
bra.uni $L__BB0_710;
$L__BB0_704:
mov.b32 %r906, %f5342;
shr.u32 %r4511, %r906, 23;
and.b32 %r4512, %r4511, 255;
add.s32 %r907, %r4512, -128;
shl.b32 %r4513, %r906, 8;
or.b32 %r908, %r4513, -2147483648;
shr.u32 %r909, %r907, 5;
mov.u64 %rd2586, 0;
mov.u32 %r8402, 0;
mov.u64 %rd1320, __cudart_i2opi_f;
mov.u64 %rd2587, %rd2586;
$L__BB0_705:
.pragma "nounroll";
shl.b64 %rd1319, %rd2586, 2;
add.s64 %rd1321, %rd1320, %rd1319;
ld.global.nc.u32 %r4514, [%rd1321];
mad.wide.u32 %rd1322, %r4514, %r908, %rd2587;
shr.u64 %rd2587, %rd1322, 32;
add.s64 %rd1323, %rd1, %rd1319;
st.local.u32 [%rd1323], %rd1322;
add.s32 %r8402, %r8402, 1;
cvt.s64.s32 %rd2586, %r8402;
setp.ne.s32 %p612, %r8402, 6;
@%p612 bra $L__BB0_705;
st.local.u32 [%rd5], %rd2587;
mov.u32 %r4515, 4;
sub.s32 %r912, %r4515, %r909;
mov.u32 %r4516, 6;
sub.s32 %r4517, %r4516, %r909;
mul.wide.s32 %rd1324, %r4517, 4;
add.s64 %rd1325, %rd1, %rd1324;
ld.local.u32 %r8403, [%rd1325];
ld.local.u32 %r8404, [%rd1325+-4];
and.b32 %r915, %r907, 31;
setp.eq.s32 %p613, %r915, 0;
@%p613 bra $L__BB0_708;
mov.u32 %r4518, 32;
sub.s32 %r4519, %r4518, %r915;
shr.u32 %r4520, %r8404, %r4519;
shl.b32 %r4521, %r8403, %r915;
add.s32 %r8403, %r4520, %r4521;
mul.wide.s32 %rd1326, %r912, 4;
add.s64 %rd1327, %rd1, %rd1326;
ld.local.u32 %r4522, [%rd1327];
shr.u32 %r4523, %r4522, %r4519;
shl.b32 %r4524, %r8404, %r915;
add.s32 %r8404, %r4523, %r4524;
$L__BB0_708:
and.b32 %r4525, %r906, -2147483648;
shr.u32 %r4526, %r8404, 30;
shl.b32 %r4527, %r8403, 2;
or.b32 %r4528, %r4526, %r4527;
shr.u32 %r4529, %r4528, 31;
shr.u32 %r4530, %r8403, 30;
add.s32 %r4531, %r4529, %r4530;
neg.s32 %r4532, %r4531;
setp.eq.s32 %p614, %r4525, 0;
selp.b32 %r8405, %r4531, %r4532, %p614;
setp.ne.s32 %p615, %r4529, 0;
xor.b32 %r4533, %r4525, -2147483648;
selp.b32 %r4534, %r4533, %r4525, %p615;
selp.b32 %r4535, -1, 0, %p615;
xor.b32 %r4536, %r4528, %r4535;
shl.b32 %r4537, %r8404, 2;
xor.b32 %r4538, %r4537, %r4535;
cvt.u64.u32 %rd1328, %r4536;
cvt.u64.u32 %rd1329, %r4538;
bfi.b64 %rd1330, %rd1328, %rd1329, 32, 32;
cvt.rn.f64.s64 %fd89, %rd1330;
mul.f64 %fd90, %fd89, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3219, %fd90;
setp.eq.s32 %p616, %r4534, 0;
neg.f32 %f3220, %f3219;
selp.f32 %f5447, %f3219, %f3220, %p616;
$L__BB0_710:
and.b32 %r922, %r8405, 1;
setp.eq.s32 %p617, %r922, 0;
selp.f32 %f781, %f5447, 0f3F800000, %p617;
mul.rn.f32 %f782, %f5447, %f5447;
mov.f32 %f5448, 0fB94D4153;
@%p617 bra $L__BB0_712;
mov.f32 %f3223, 0fBAB607ED;
mov.f32 %f3224, 0f37CBAC00;
fma.rn.f32 %f5448, %f3224, %f782, %f3223;
$L__BB0_712:
selp.f32 %f3225, 0f3C0885E4, 0f3D2AAABB, %p617;
fma.rn.f32 %f3226, %f5448, %f782, %f3225;
selp.f32 %f3227, 0fBE2AAAA8, 0fBEFFFFFF, %p617;
fma.rn.f32 %f3228, %f3226, %f782, %f3227;
mov.f32 %f3229, 0f00000000;
fma.rn.f32 %f3230, %f782, %f781, %f3229;
fma.rn.f32 %f5213, %f3228, %f3230, %f781;
and.b32 %r4540, %r8405, 2;
setp.eq.s32 %p619, %r4540, 0;
@%p619 bra $L__BB0_714;
mov.f32 %f3232, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3232, %f3229;
$L__BB0_714:
setp.lt.s32 %p14, %r14, %r904;
@%p609 bra $L__BB0_727;
mul.f32 %f3233, %f5334, 0f3F22F983;
cvt.rni.s32.f32 %r8409, %f3233;
cvt.rn.f32.s32 %f3234, %r8409;
mov.f32 %f3235, 0fBFC90FDA;
fma.rn.f32 %f3236, %f3234, %f3235, %f5334;
mov.f32 %f3237, 0fB3A22168;
fma.rn.f32 %f3238, %f3234, %f3237, %f3236;
mov.f32 %f3239, 0fA7C234C5;
fma.rn.f32 %f5451, %f3234, %f3239, %f3238;
abs.f32 %f790, %f5334;
setp.ltu.f32 %p621, %f790, 0f47CE4780;
@%p621 bra $L__BB0_723;
setp.eq.f32 %p622, %f790, 0f7F800000;
@%p622 bra $L__BB0_722;
bra.uni $L__BB0_717;
$L__BB0_722:
mov.f32 %f3242, 0f00000000;
mul.rn.f32 %f5451, %f5334, %f3242;
mov.u32 %r8409, 0;
bra.uni $L__BB0_723;
$L__BB0_717:
mov.b32 %r924, %f5334;
shr.u32 %r4542, %r924, 23;
and.b32 %r4543, %r4542, 255;
add.s32 %r925, %r4543, -128;
shl.b32 %r4544, %r924, 8;
or.b32 %r926, %r4544, -2147483648;
shr.u32 %r927, %r925, 5;
mov.u64 %rd2588, 0;
mov.u32 %r8406, 0;
mov.u64 %rd1334, __cudart_i2opi_f;
mov.u64 %rd2589, %rd2588;
$L__BB0_718:
.pragma "nounroll";
shl.b64 %rd1333, %rd2588, 2;
add.s64 %rd1335, %rd1334, %rd1333;
ld.global.nc.u32 %r4545, [%rd1335];
mad.wide.u32 %rd1336, %r4545, %r926, %rd2589;
shr.u64 %rd2589, %rd1336, 32;
add.s64 %rd1337, %rd1, %rd1333;
st.local.u32 [%rd1337], %rd1336;
add.s32 %r8406, %r8406, 1;
cvt.s64.s32 %rd2588, %r8406;
setp.ne.s32 %p623, %r8406, 6;
@%p623 bra $L__BB0_718;
st.local.u32 [%rd5], %rd2589;
mov.u32 %r4546, 4;
sub.s32 %r930, %r4546, %r927;
mov.u32 %r4547, 6;
sub.s32 %r4548, %r4547, %r927;
mul.wide.s32 %rd1338, %r4548, 4;
add.s64 %rd1339, %rd1, %rd1338;
ld.local.u32 %r8407, [%rd1339];
ld.local.u32 %r8408, [%rd1339+-4];
and.b32 %r933, %r925, 31;
setp.eq.s32 %p624, %r933, 0;
@%p624 bra $L__BB0_721;
mov.u32 %r4549, 32;
sub.s32 %r4550, %r4549, %r933;
shr.u32 %r4551, %r8408, %r4550;
shl.b32 %r4552, %r8407, %r933;
add.s32 %r8407, %r4551, %r4552;
mul.wide.s32 %rd1340, %r930, 4;
add.s64 %rd1341, %rd1, %rd1340;
ld.local.u32 %r4553, [%rd1341];
shr.u32 %r4554, %r4553, %r4550;
shl.b32 %r4555, %r8408, %r933;
add.s32 %r8408, %r4554, %r4555;
$L__BB0_721:
and.b32 %r4556, %r924, -2147483648;
shr.u32 %r4557, %r8408, 30;
shl.b32 %r4558, %r8407, 2;
or.b32 %r4559, %r4557, %r4558;
shr.u32 %r4560, %r4559, 31;
shr.u32 %r4561, %r8407, 30;
add.s32 %r4562, %r4560, %r4561;
neg.s32 %r4563, %r4562;
setp.eq.s32 %p625, %r4556, 0;
selp.b32 %r8409, %r4562, %r4563, %p625;
setp.ne.s32 %p626, %r4560, 0;
xor.b32 %r4564, %r4556, -2147483648;
selp.b32 %r4565, %r4564, %r4556, %p626;
selp.b32 %r4566, -1, 0, %p626;
xor.b32 %r4567, %r4559, %r4566;
shl.b32 %r4568, %r8408, 2;
xor.b32 %r4569, %r4568, %r4566;
cvt.u64.u32 %rd1342, %r4567;
cvt.u64.u32 %rd1343, %r4569;
bfi.b64 %rd1344, %rd1342, %rd1343, 32, 32;
cvt.rn.f64.s64 %fd91, %rd1344;
mul.f64 %fd92, %fd91, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3240, %fd92;
setp.eq.s32 %p627, %r4565, 0;
neg.f32 %f3241, %f3240;
selp.f32 %f5451, %f3240, %f3241, %p627;
$L__BB0_723:
add.s32 %r940, %r8409, 1;
and.b32 %r941, %r940, 1;
setp.eq.s32 %p628, %r941, 0;
selp.f32 %f794, %f5451, 0f3F800000, %p628;
mul.rn.f32 %f795, %f5451, %f5451;
mov.f32 %f5452, 0fB94D4153;
@%p628 bra $L__BB0_725;
mov.f32 %f3244, 0fBAB607ED;
mov.f32 %f3245, 0f37CBAC00;
fma.rn.f32 %f5452, %f3245, %f795, %f3244;
$L__BB0_725:
selp.f32 %f3246, 0f3C0885E4, 0f3D2AAABB, %p628;
fma.rn.f32 %f3247, %f5452, %f795, %f3246;
selp.f32 %f3248, 0fBE2AAAA8, 0fBEFFFFFF, %p628;
fma.rn.f32 %f3249, %f3247, %f795, %f3248;
mov.f32 %f3250, 0f00000000;
fma.rn.f32 %f3251, %f795, %f794, %f3250;
fma.rn.f32 %f5215, %f3249, %f3251, %f794;
and.b32 %r4571, %r940, 2;
setp.eq.s32 %p630, %r4571, 0;
@%p630 bra $L__BB0_727;
mov.f32 %f3253, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3253, %f3250;
$L__BB0_727:
selp.f32 %f802, %f5215, %f5216, %p14;
selp.f32 %f803, %f5213, %f5214, %p14;
@%p609 bra $L__BB0_729;
add.f32 %f5517, %f803, %f802;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_729:
@%p438 bra $L__BB0_951;
shl.b32 %r4573, %r12, 5;
mov.u32 %r4574, -32;
sub.s32 %r942, %r4574, %r4573;
setp.ge.s32 %p634, %r14, %r942;
@%p634 bra $L__BB0_743;
mul.f32 %f3256, %f5341, 0f3F22F983;
cvt.rni.s32.f32 %r8413, %f3256;
cvt.rn.f32.s32 %f3257, %r8413;
mov.f32 %f3258, 0fBFC90FDA;
fma.rn.f32 %f3259, %f3257, %f3258, %f5341;
mov.f32 %f3260, 0fB3A22168;
fma.rn.f32 %f3261, %f3257, %f3260, %f3259;
mov.f32 %f3262, 0fA7C234C5;
fma.rn.f32 %f5460, %f3257, %f3262, %f3261;
abs.f32 %f811, %f5341;
setp.ltu.f32 %p635, %f811, 0f47CE4780;
@%p635 bra $L__BB0_739;
setp.eq.f32 %p636, %f811, 0f7F800000;
@%p636 bra $L__BB0_738;
bra.uni $L__BB0_733;
$L__BB0_738:
mov.f32 %f3265, 0f00000000;
mul.rn.f32 %f5460, %f5341, %f3265;
mov.u32 %r8413, 0;
bra.uni $L__BB0_739;
$L__BB0_733:
mov.b32 %r944, %f5341;
shr.u32 %r4576, %r944, 23;
and.b32 %r4577, %r4576, 255;
add.s32 %r945, %r4577, -128;
shl.b32 %r4578, %r944, 8;
or.b32 %r946, %r4578, -2147483648;
shr.u32 %r947, %r945, 5;
mov.u64 %rd2590, 0;
mov.u32 %r8410, 0;
mov.u64 %rd1348, __cudart_i2opi_f;
mov.u64 %rd2591, %rd2590;
$L__BB0_734:
.pragma "nounroll";
shl.b64 %rd1347, %rd2590, 2;
add.s64 %rd1349, %rd1348, %rd1347;
ld.global.nc.u32 %r4579, [%rd1349];
mad.wide.u32 %rd1350, %r4579, %r946, %rd2591;
shr.u64 %rd2591, %rd1350, 32;
add.s64 %rd1351, %rd1, %rd1347;
st.local.u32 [%rd1351], %rd1350;
add.s32 %r8410, %r8410, 1;
cvt.s64.s32 %rd2590, %r8410;
setp.ne.s32 %p637, %r8410, 6;
@%p637 bra $L__BB0_734;
st.local.u32 [%rd5], %rd2591;
mov.u32 %r4580, 4;
sub.s32 %r950, %r4580, %r947;
mov.u32 %r4581, 6;
sub.s32 %r4582, %r4581, %r947;
mul.wide.s32 %rd1352, %r4582, 4;
add.s64 %rd1353, %rd1, %rd1352;
ld.local.u32 %r8411, [%rd1353];
ld.local.u32 %r8412, [%rd1353+-4];
and.b32 %r953, %r945, 31;
setp.eq.s32 %p638, %r953, 0;
@%p638 bra $L__BB0_737;
mov.u32 %r4583, 32;
sub.s32 %r4584, %r4583, %r953;
shr.u32 %r4585, %r8412, %r4584;
shl.b32 %r4586, %r8411, %r953;
add.s32 %r8411, %r4585, %r4586;
mul.wide.s32 %rd1354, %r950, 4;
add.s64 %rd1355, %rd1, %rd1354;
ld.local.u32 %r4587, [%rd1355];
shr.u32 %r4588, %r4587, %r4584;
shl.b32 %r4589, %r8412, %r953;
add.s32 %r8412, %r4588, %r4589;
$L__BB0_737:
and.b32 %r4590, %r944, -2147483648;
shr.u32 %r4591, %r8412, 30;
shl.b32 %r4592, %r8411, 2;
or.b32 %r4593, %r4591, %r4592;
shr.u32 %r4594, %r4593, 31;
shr.u32 %r4595, %r8411, 30;
add.s32 %r4596, %r4594, %r4595;
neg.s32 %r4597, %r4596;
setp.eq.s32 %p639, %r4590, 0;
selp.b32 %r8413, %r4596, %r4597, %p639;
setp.ne.s32 %p640, %r4594, 0;
xor.b32 %r4598, %r4590, -2147483648;
selp.b32 %r4599, %r4598, %r4590, %p640;
selp.b32 %r4600, -1, 0, %p640;
xor.b32 %r4601, %r4593, %r4600;
shl.b32 %r4602, %r8412, 2;
xor.b32 %r4603, %r4602, %r4600;
cvt.u64.u32 %rd1356, %r4601;
cvt.u64.u32 %rd1357, %r4603;
bfi.b64 %rd1358, %rd1356, %rd1357, 32, 32;
cvt.rn.f64.s64 %fd93, %rd1358;
mul.f64 %fd94, %fd93, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3263, %fd94;
setp.eq.s32 %p641, %r4599, 0;
neg.f32 %f3264, %f3263;
selp.f32 %f5460, %f3263, %f3264, %p641;
$L__BB0_739:
and.b32 %r960, %r8413, 1;
setp.eq.s32 %p642, %r960, 0;
selp.f32 %f815, %f5460, 0f3F800000, %p642;
mul.rn.f32 %f816, %f5460, %f5460;
mov.f32 %f5461, 0fB94D4153;
@%p642 bra $L__BB0_741;
mov.f32 %f3267, 0fBAB607ED;
mov.f32 %f3268, 0f37CBAC00;
fma.rn.f32 %f5461, %f3268, %f816, %f3267;
$L__BB0_741:
selp.f32 %f3269, 0f3C0885E4, 0f3D2AAABB, %p642;
fma.rn.f32 %f3270, %f5461, %f816, %f3269;
selp.f32 %f3271, 0fBE2AAAA8, 0fBEFFFFFF, %p642;
fma.rn.f32 %f3272, %f3270, %f816, %f3271;
mov.f32 %f3273, 0f00000000;
fma.rn.f32 %f3274, %f816, %f815, %f3273;
fma.rn.f32 %f5213, %f3272, %f3274, %f815;
and.b32 %r4605, %r8413, 2;
setp.eq.s32 %p644, %r4605, 0;
@%p644 bra $L__BB0_743;
mov.f32 %f3276, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3276, %f3273;
$L__BB0_743:
setp.lt.s32 %p15, %r14, %r942;
@%p634 bra $L__BB0_756;
mul.f32 %f3277, %f5333, 0f3F22F983;
cvt.rni.s32.f32 %r8417, %f3277;
cvt.rn.f32.s32 %f3278, %r8417;
mov.f32 %f3279, 0fBFC90FDA;
fma.rn.f32 %f3280, %f3278, %f3279, %f5333;
mov.f32 %f3281, 0fB3A22168;
fma.rn.f32 %f3282, %f3278, %f3281, %f3280;
mov.f32 %f3283, 0fA7C234C5;
fma.rn.f32 %f5464, %f3278, %f3283, %f3282;
abs.f32 %f824, %f5333;
setp.ltu.f32 %p646, %f824, 0f47CE4780;
@%p646 bra $L__BB0_752;
setp.eq.f32 %p647, %f824, 0f7F800000;
@%p647 bra $L__BB0_751;
bra.uni $L__BB0_746;
$L__BB0_751:
mov.f32 %f3286, 0f00000000;
mul.rn.f32 %f5464, %f5333, %f3286;
mov.u32 %r8417, 0;
bra.uni $L__BB0_752;
$L__BB0_746:
mov.b32 %r962, %f5333;
shr.u32 %r4607, %r962, 23;
and.b32 %r4608, %r4607, 255;
add.s32 %r963, %r4608, -128;
shl.b32 %r4609, %r962, 8;
or.b32 %r964, %r4609, -2147483648;
shr.u32 %r965, %r963, 5;
mov.u64 %rd2592, 0;
mov.u32 %r8414, 0;
mov.u64 %rd1362, __cudart_i2opi_f;
mov.u64 %rd2593, %rd2592;
$L__BB0_747:
.pragma "nounroll";
shl.b64 %rd1361, %rd2592, 2;
add.s64 %rd1363, %rd1362, %rd1361;
ld.global.nc.u32 %r4610, [%rd1363];
mad.wide.u32 %rd1364, %r4610, %r964, %rd2593;
shr.u64 %rd2593, %rd1364, 32;
add.s64 %rd1365, %rd1, %rd1361;
st.local.u32 [%rd1365], %rd1364;
add.s32 %r8414, %r8414, 1;
cvt.s64.s32 %rd2592, %r8414;
setp.ne.s32 %p648, %r8414, 6;
@%p648 bra $L__BB0_747;
st.local.u32 [%rd5], %rd2593;
mov.u32 %r4611, 4;
sub.s32 %r968, %r4611, %r965;
mov.u32 %r4612, 6;
sub.s32 %r4613, %r4612, %r965;
mul.wide.s32 %rd1366, %r4613, 4;
add.s64 %rd1367, %rd1, %rd1366;
ld.local.u32 %r8415, [%rd1367];
ld.local.u32 %r8416, [%rd1367+-4];
and.b32 %r971, %r963, 31;
setp.eq.s32 %p649, %r971, 0;
@%p649 bra $L__BB0_750;
mov.u32 %r4614, 32;
sub.s32 %r4615, %r4614, %r971;
shr.u32 %r4616, %r8416, %r4615;
shl.b32 %r4617, %r8415, %r971;
add.s32 %r8415, %r4616, %r4617;
mul.wide.s32 %rd1368, %r968, 4;
add.s64 %rd1369, %rd1, %rd1368;
ld.local.u32 %r4618, [%rd1369];
shr.u32 %r4619, %r4618, %r4615;
shl.b32 %r4620, %r8416, %r971;
add.s32 %r8416, %r4619, %r4620;
$L__BB0_750:
and.b32 %r4621, %r962, -2147483648;
shr.u32 %r4622, %r8416, 30;
shl.b32 %r4623, %r8415, 2;
or.b32 %r4624, %r4622, %r4623;
shr.u32 %r4625, %r4624, 31;
shr.u32 %r4626, %r8415, 30;
add.s32 %r4627, %r4625, %r4626;
neg.s32 %r4628, %r4627;
setp.eq.s32 %p650, %r4621, 0;
selp.b32 %r8417, %r4627, %r4628, %p650;
setp.ne.s32 %p651, %r4625, 0;
xor.b32 %r4629, %r4621, -2147483648;
selp.b32 %r4630, %r4629, %r4621, %p651;
selp.b32 %r4631, -1, 0, %p651;
xor.b32 %r4632, %r4624, %r4631;
shl.b32 %r4633, %r8416, 2;
xor.b32 %r4634, %r4633, %r4631;
cvt.u64.u32 %rd1370, %r4632;
cvt.u64.u32 %rd1371, %r4634;
bfi.b64 %rd1372, %rd1370, %rd1371, 32, 32;
cvt.rn.f64.s64 %fd95, %rd1372;
mul.f64 %fd96, %fd95, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3284, %fd96;
setp.eq.s32 %p652, %r4630, 0;
neg.f32 %f3285, %f3284;
selp.f32 %f5464, %f3284, %f3285, %p652;
$L__BB0_752:
add.s32 %r978, %r8417, 1;
and.b32 %r979, %r978, 1;
setp.eq.s32 %p653, %r979, 0;
selp.f32 %f828, %f5464, 0f3F800000, %p653;
mul.rn.f32 %f829, %f5464, %f5464;
mov.f32 %f5465, 0fB94D4153;
@%p653 bra $L__BB0_754;
mov.f32 %f3288, 0fBAB607ED;
mov.f32 %f3289, 0f37CBAC00;
fma.rn.f32 %f5465, %f3289, %f829, %f3288;
$L__BB0_754:
selp.f32 %f3290, 0f3C0885E4, 0f3D2AAABB, %p653;
fma.rn.f32 %f3291, %f5465, %f829, %f3290;
selp.f32 %f3292, 0fBE2AAAA8, 0fBEFFFFFF, %p653;
fma.rn.f32 %f3293, %f3291, %f829, %f3292;
mov.f32 %f3294, 0f00000000;
fma.rn.f32 %f3295, %f829, %f828, %f3294;
fma.rn.f32 %f5215, %f3293, %f3295, %f828;
and.b32 %r4636, %r978, 2;
setp.eq.s32 %p655, %r4636, 0;
@%p655 bra $L__BB0_756;
mov.f32 %f3297, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3297, %f3294;
$L__BB0_756:
selp.f32 %f836, %f5215, %f5216, %p15;
selp.f32 %f837, %f5213, %f5214, %p15;
@%p634 bra $L__BB0_951;
add.f32 %f5516, %f837, %f836;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_951:
@%p32 bra $L__BB0_953;
shl.b32 %r5208, %r12, 2;
mov.u32 %r5209, -8;
sub.s32 %r5210, %r5209, %r5208;
add.s32 %r5211, %r13, -12;
setp.lt.s32 %p819, %r5211, %r5210;
@%p819 bra $L__BB0_1233;
bra.uni $L__BB0_953;
$L__BB0_1233:
mov.u32 %r5938, %ctaid.x;
shl.b32 %r5939, %r12, 5;
add.s32 %r5940, %r5939, %r1;
mul.hi.s32 %r5941, %r5940, -1840700269;
add.s32 %r5942, %r5941, %r5940;
shr.u32 %r5943, %r5942, 31;
shr.s32 %r5944, %r5942, 2;
add.s32 %r5945, %r5944, %r5943;
mul.lo.s32 %r5946, %r5945, %r2615;
mul.lo.s32 %r5947, %r5945, 7;
sub.s32 %r5948, %r5940, %r5947;
mul.lo.s32 %r5949, %r5948, %r2616;
add.s32 %r5950, %r13, 3;
mad.lo.s32 %r5951, %r2614, %r5938, %r2612;
mad.lo.s32 %r5952, %r5950, %r2613, %r5951;
add.s32 %r5953, %r5952, %r5946;
add.s32 %r5954, %r5953, %r5949;
mul.wide.s32 %rd1874, %r5954, 4;
add.s64 %rd1875, %rd3, %rd1874;
ld.global.f32 %f1382, [%rd1875];
add.s32 %r5955, %r5940, 32;
mul.hi.s32 %r5956, %r5955, -1840700269;
add.s32 %r5957, %r5956, %r5955;
shr.u32 %r5958, %r5957, 31;
shr.s32 %r5959, %r5957, 2;
add.s32 %r5960, %r5959, %r5958;
mul.lo.s32 %r5961, %r5960, %r2615;
mul.lo.s32 %r5962, %r5960, 7;
sub.s32 %r5963, %r5955, %r5962;
mul.lo.s32 %r5964, %r5963, %r2616;
add.s32 %r5965, %r5952, %r5961;
add.s32 %r5966, %r5965, %r5964;
mul.wide.s32 %rd1876, %r5966, 4;
add.s64 %rd1877, %rd3, %rd1876;
ld.global.f32 %f1383, [%rd1877];
add.s32 %r5967, %r5952, %r2613;
add.s32 %r5968, %r5967, %r5946;
add.s32 %r5969, %r5968, %r5949;
mul.wide.s32 %rd1878, %r5969, 4;
add.s64 %rd1879, %rd3, %rd1878;
ld.global.f32 %f1384, [%rd1879];
add.s32 %r5970, %r5967, %r5961;
add.s32 %r5971, %r5970, %r5964;
mul.wide.s32 %rd1880, %r5971, 4;
add.s64 %rd1881, %rd3, %rd1880;
ld.global.f32 %f1385, [%rd1881];
add.s32 %r5972, %r5951, %r2612;
mad.lo.s32 %r5973, %r13, %r2613, %r5972;
add.s32 %r5974, %r5973, %r5946;
add.s32 %r5975, %r5974, %r5949;
mul.wide.s32 %rd1882, %r5975, 4;
add.s64 %rd1883, %rd3, %rd1882;
ld.global.f32 %f1386, [%rd1883];
add.s32 %r5976, %r5973, %r5961;
add.s32 %r5977, %r5976, %r5964;
mul.wide.s32 %rd1884, %r5977, 4;
add.s64 %rd1885, %rd3, %rd1884;
ld.global.f32 %f1387, [%rd1885];
add.s32 %r5978, %r5973, %r2613;
add.s32 %r5979, %r5978, %r5946;
add.s32 %r5980, %r5979, %r5949;
mul.wide.s32 %rd1886, %r5980, 4;
add.s64 %rd1887, %rd3, %rd1886;
ld.global.f32 %f1388, [%rd1887];
add.s32 %r5981, %r5978, %r5961;
add.s32 %r5982, %r5981, %r5964;
mul.wide.s32 %rd1888, %r5982, 4;
add.s64 %rd1889, %rd3, %rd1888;
ld.global.f32 %f1389, [%rd1889];
mul.hi.s32 %r5983, %r5940, 954437177;
shr.u32 %r5984, %r5983, 31;
shr.s32 %r5985, %r5983, 1;
add.s32 %r5986, %r5985, %r5984;
mul.lo.s32 %r5987, %r5986, %r2605;
mul.lo.s32 %r5988, %r5986, 9;
sub.s32 %r5989, %r5940, %r5988;
mul.lo.s32 %r5990, %r5989, %r2606;
add.s32 %r5991, %r13, 2;
shl.b32 %r5992, %r2602, 1;
mad.lo.s32 %r5993, %r2604, %r5938, %r5992;
mad.lo.s32 %r5994, %r5991, %r2603, %r5993;
add.s32 %r5995, %r5994, %r5987;
add.s32 %r5996, %r5995, %r5990;
mul.wide.s32 %rd1890, %r5996, 4;
add.s64 %rd1891, %rd2, %rd1890;
ld.global.f32 %f1390, [%rd1891];
mul.hi.s32 %r5997, %r5955, 954437177;
shr.u32 %r5998, %r5997, 31;
shr.s32 %r5999, %r5997, 1;
add.s32 %r6000, %r5999, %r5998;
mul.lo.s32 %r6001, %r6000, %r2605;
mul.lo.s32 %r6002, %r6000, 9;
sub.s32 %r6003, %r5955, %r6002;
mul.lo.s32 %r6004, %r6003, %r2606;
add.s32 %r6005, %r5994, %r6001;
add.s32 %r6006, %r6005, %r6004;
mul.wide.s32 %rd1892, %r6006, 4;
add.s64 %rd1893, %rd2, %rd1892;
ld.global.f32 %f1391, [%rd1893];
add.s32 %r6007, %r5993, %r2602;
mad.lo.s32 %r6008, %r13, %r2603, %r6007;
add.s32 %r6009, %r6008, %r5987;
add.s32 %r6010, %r6009, %r5990;
mul.wide.s32 %rd1894, %r6010, 4;
add.s64 %rd1895, %rd2, %rd1894;
ld.global.f32 %f1392, [%rd1895];
add.s32 %r6011, %r6008, %r6001;
add.s32 %r6012, %r6011, %r6004;
mul.wide.s32 %rd1896, %r6012, 4;
add.s64 %rd1897, %rd2, %rd1896;
ld.global.f32 %f1393, [%rd1897];
mul.wide.s32 %rd1898, %r2603, 4;
add.s64 %rd1899, %rd1895, %rd1898;
ld.global.f32 %f1394, [%rd1899];
add.s64 %rd1900, %rd1897, %rd1898;
ld.global.f32 %f1395, [%rd1900];
add.s64 %rd1901, %rd1899, %rd1898;
ld.global.f32 %f1396, [%rd1901];
add.s64 %rd1902, %rd1900, %rd1898;
ld.global.f32 %f1397, [%rd1902];
mul.f32 %f3987, %f1390, 0f3F22F983;
cvt.rni.s32.f32 %r8549, %f3987;
cvt.rn.f32.s32 %f3988, %r8549;
mov.f32 %f3989, 0fBFC90FDA;
fma.rn.f32 %f3990, %f3988, %f3989, %f1390;
mov.f32 %f3991, 0fB3A22168;
fma.rn.f32 %f3992, %f3988, %f3991, %f3990;
mov.f32 %f3993, 0fA7C234C5;
fma.rn.f32 %f5659, %f3988, %f3993, %f3992;
abs.f32 %f1399, %f1390;
setp.ltu.f32 %p1052, %f1399, 0f47CE4780;
@%p1052 bra $L__BB0_1241;
setp.eq.f32 %p1053, %f1399, 0f7F800000;
@%p1053 bra $L__BB0_1240;
bra.uni $L__BB0_1235;
$L__BB0_1240:
mov.f32 %f3996, 0f00000000;
mul.rn.f32 %f5659, %f1390, %f3996;
mov.u32 %r8549, 0;
bra.uni $L__BB0_1241;
$L__BB0_953:
mov.u32 %r1276, %ctaid.x;
mul.lo.s32 %r1277, %r2614, %r1276;
add.s32 %r5212, %r13, -15;
mov.u32 %r5213, -8;
sub.s32 %r1278, %r5213, %r12;
setp.ge.s32 %p820, %r5212, %r1278;
add.s32 %r5214, %r13, 3;
add.s32 %r5215, %r2612, %r1277;
mad.lo.s32 %r1279, %r5214, %r2613, %r5215;
@%p820 bra $L__BB0_956;
shl.b32 %r1280, %r12, 5;
neg.s32 %r5216, %r1280;
setp.ge.s32 %p821, %r14, %r5216;
@%p821 bra $L__BB0_956;
add.s32 %r5217, %r1280, %r1;
mul.hi.s32 %r5218, %r5217, -1840700269;
add.s32 %r5219, %r5218, %r5217;
shr.u32 %r5220, %r5219, 31;
shr.s32 %r5221, %r5219, 2;
add.s32 %r5222, %r5221, %r5220;
mul.lo.s32 %r5223, %r5222, 7;
sub.s32 %r5224, %r5217, %r5223;
mad.lo.s32 %r5225, %r5222, %r2615, %r1279;
mad.lo.s32 %r5226, %r5224, %r2616, %r5225;
mul.wide.s32 %rd1626, %r5226, 4;
add.s64 %rd1627, %rd3, %rd1626;
ld.global.f32 %f5531, [%rd1627];
$L__BB0_956:
@%p820 bra $L__BB0_959;
shl.b32 %r1281, %r12, 5;
mov.u32 %r5228, -32;
sub.s32 %r5229, %r5228, %r1281;
setp.ge.s32 %p823, %r14, %r5229;
@%p823 bra $L__BB0_959;
add.s32 %r5230, %r1281, %r1;
add.s32 %r5231, %r5230, 32;
mul.hi.s32 %r5232, %r5231, -1840700269;
add.s32 %r5233, %r5232, %r5231;
shr.u32 %r5234, %r5233, 31;
shr.s32 %r5235, %r5233, 2;
add.s32 %r5236, %r5235, %r5234;
mul.lo.s32 %r5237, %r5236, 7;
sub.s32 %r5238, %r5231, %r5237;
mad.lo.s32 %r5239, %r5236, %r2615, %r1279;
mad.lo.s32 %r5240, %r5238, %r2616, %r5239;
mul.wide.s32 %rd1628, %r5240, 4;
add.s64 %rd1629, %rd3, %rd1628;
ld.global.f32 %f5339, [%rd1629];
$L__BB0_959:
mov.u32 %r5242, -9;
sub.s32 %r1282, %r5242, %r12;
setp.ge.s32 %p824, %r5212, %r1282;
add.s32 %r1283, %r1279, %r2613;
@%p824 bra $L__BB0_962;
shl.b32 %r1284, %r12, 5;
neg.s32 %r5243, %r1284;
setp.ge.s32 %p825, %r14, %r5243;
@%p825 bra $L__BB0_962;
add.s32 %r5244, %r1284, %r1;
mul.hi.s32 %r5245, %r5244, -1840700269;
add.s32 %r5246, %r5245, %r5244;
shr.u32 %r5247, %r5246, 31;
shr.s32 %r5248, %r5246, 2;
add.s32 %r5249, %r5248, %r5247;
mul.lo.s32 %r5250, %r5249, 7;
sub.s32 %r5251, %r5244, %r5250;
mad.lo.s32 %r5252, %r5249, %r2615, %r1283;
mad.lo.s32 %r5253, %r5251, %r2616, %r5252;
mul.wide.s32 %rd1630, %r5253, 4;
add.s64 %rd1631, %rd3, %rd1630;
ld.global.f32 %f5338, [%rd1631];
$L__BB0_962:
@%p824 bra $L__BB0_965;
shl.b32 %r1285, %r12, 5;
mov.u32 %r5255, -32;
sub.s32 %r5256, %r5255, %r1285;
setp.ge.s32 %p827, %r14, %r5256;
@%p827 bra $L__BB0_965;
add.s32 %r5257, %r1285, %r1;
add.s32 %r5258, %r5257, 32;
mul.hi.s32 %r5259, %r5258, -1840700269;
add.s32 %r5260, %r5259, %r5258;
shr.u32 %r5261, %r5260, 31;
shr.s32 %r5262, %r5260, 2;
add.s32 %r5263, %r5262, %r5261;
mul.lo.s32 %r5264, %r5263, 7;
sub.s32 %r5265, %r5258, %r5264;
mad.lo.s32 %r5266, %r5263, %r2615, %r1283;
mad.lo.s32 %r5267, %r5265, %r2616, %r5266;
mul.wide.s32 %rd1632, %r5267, 4;
add.s64 %rd1633, %rd3, %rd1632;
ld.global.f32 %f5337, [%rd1633];
$L__BB0_965:
mov.u32 %r5269, -10;
sub.s32 %r1286, %r5269, %r12;
setp.ge.s32 %p828, %r5212, %r1286;
shl.b32 %r5270, %r2612, 1;
add.s32 %r5271, %r5270, %r1277;
mad.lo.s32 %r1287, %r13, %r2613, %r5271;
@%p828 bra $L__BB0_968;
shl.b32 %r1288, %r12, 5;
neg.s32 %r5272, %r1288;
setp.ge.s32 %p829, %r14, %r5272;
@%p829 bra $L__BB0_968;
add.s32 %r5273, %r1288, %r1;
mul.hi.s32 %r5274, %r5273, -1840700269;
add.s32 %r5275, %r5274, %r5273;
shr.u32 %r5276, %r5275, 31;
shr.s32 %r5277, %r5275, 2;
add.s32 %r5278, %r5277, %r5276;
mul.lo.s32 %r5279, %r5278, 7;
sub.s32 %r5280, %r5273, %r5279;
mad.lo.s32 %r5281, %r5278, %r2615, %r1287;
mad.lo.s32 %r5282, %r5280, %r2616, %r5281;
mul.wide.s32 %rd1634, %r5282, 4;
add.s64 %rd1635, %rd3, %rd1634;
ld.global.f32 %f5336, [%rd1635];
$L__BB0_968:
@%p828 bra $L__BB0_971;
shl.b32 %r1289, %r12, 5;
mov.u32 %r5284, -32;
sub.s32 %r5285, %r5284, %r1289;
setp.ge.s32 %p831, %r14, %r5285;
@%p831 bra $L__BB0_971;
add.s32 %r5286, %r1289, %r1;
add.s32 %r5287, %r5286, 32;
mul.hi.s32 %r5288, %r5287, -1840700269;
add.s32 %r5289, %r5288, %r5287;
shr.u32 %r5290, %r5289, 31;
shr.s32 %r5291, %r5289, 2;
add.s32 %r5292, %r5291, %r5290;
mul.lo.s32 %r5293, %r5292, 7;
sub.s32 %r5294, %r5287, %r5293;
mad.lo.s32 %r5295, %r5292, %r2615, %r1287;
mad.lo.s32 %r5296, %r5294, %r2616, %r5295;
mul.wide.s32 %rd1636, %r5296, 4;
add.s64 %rd1637, %rd3, %rd1636;
ld.global.f32 %f5335, [%rd1637];
$L__BB0_971:
mov.u32 %r5298, -11;
sub.s32 %r1290, %r5298, %r12;
setp.ge.s32 %p832, %r5212, %r1290;
add.s32 %r1291, %r1287, %r2613;
@%p832 bra $L__BB0_974;
shl.b32 %r1292, %r12, 5;
neg.s32 %r5299, %r1292;
setp.ge.s32 %p833, %r14, %r5299;
@%p833 bra $L__BB0_974;
add.s32 %r5300, %r1292, %r1;
mul.hi.s32 %r5301, %r5300, -1840700269;
add.s32 %r5302, %r5301, %r5300;
shr.u32 %r5303, %r5302, 31;
shr.s32 %r5304, %r5302, 2;
add.s32 %r5305, %r5304, %r5303;
mul.lo.s32 %r5306, %r5305, 7;
sub.s32 %r5307, %r5300, %r5306;
mad.lo.s32 %r5308, %r5305, %r2615, %r1291;
mad.lo.s32 %r5309, %r5307, %r2616, %r5308;
mul.wide.s32 %rd1638, %r5309, 4;
add.s64 %rd1639, %rd3, %rd1638;
ld.global.f32 %f5334, [%rd1639];
$L__BB0_974:
@%p832 bra $L__BB0_977;
shl.b32 %r1293, %r12, 5;
mov.u32 %r5311, -32;
sub.s32 %r5312, %r5311, %r1293;
setp.ge.s32 %p835, %r14, %r5312;
@%p835 bra $L__BB0_977;
add.s32 %r5313, %r1293, %r1;
add.s32 %r5314, %r5313, 32;
mul.hi.s32 %r5315, %r5314, -1840700269;
add.s32 %r5316, %r5315, %r5314;
shr.u32 %r5317, %r5316, 31;
shr.s32 %r5318, %r5316, 2;
add.s32 %r5319, %r5318, %r5317;
mul.lo.s32 %r5320, %r5319, 7;
sub.s32 %r5321, %r5314, %r5320;
mad.lo.s32 %r5322, %r5319, %r2615, %r1291;
mad.lo.s32 %r5323, %r5321, %r2616, %r5322;
mul.wide.s32 %rd1640, %r5323, 4;
add.s64 %rd1641, %rd3, %rd1640;
ld.global.f32 %f5333, [%rd1641];
$L__BB0_977:
add.s32 %r5325, %r13, 2;
mul.lo.s32 %r1294, %r5325, %r2603;
shl.b32 %r5326, %r2602, 1;
mad.lo.s32 %r1295, %r2604, %r1276, %r5326;
add.s32 %r1296, %r1295, %r1294;
@%p820 bra $L__BB0_980;
shl.b32 %r1297, %r12, 5;
neg.s32 %r5327, %r1297;
setp.ge.s32 %p837, %r14, %r5327;
@%p837 bra $L__BB0_980;
add.s32 %r5328, %r1297, %r1;
mul.hi.s32 %r5329, %r5328, 954437177;
shr.u32 %r5330, %r5329, 31;
shr.s32 %r5331, %r5329, 1;
add.s32 %r5332, %r5331, %r5330;
mul.lo.s32 %r5333, %r5332, 9;
sub.s32 %r5334, %r5328, %r5333;
mad.lo.s32 %r5335, %r5332, %r2605, %r1296;
mad.lo.s32 %r5336, %r5334, %r2606, %r5335;
mul.wide.s32 %rd1642, %r5336, 4;
add.s64 %rd1643, %rd2, %rd1642;
ld.global.f32 %f5348, [%rd1643];
$L__BB0_980:
@%p820 bra $L__BB0_983;
shl.b32 %r1298, %r12, 5;
mov.u32 %r5338, -32;
sub.s32 %r5339, %r5338, %r1298;
setp.ge.s32 %p839, %r14, %r5339;
@%p839 bra $L__BB0_983;
add.s32 %r5340, %r1298, %r1;
add.s32 %r5341, %r5340, 32;
mul.hi.s32 %r5342, %r5341, 954437177;
shr.u32 %r5343, %r5342, 31;
shr.s32 %r5344, %r5342, 1;
add.s32 %r5345, %r5344, %r5343;
mul.lo.s32 %r5346, %r5345, 9;
sub.s32 %r5347, %r5341, %r5346;
mad.lo.s32 %r5348, %r5345, %r2605, %r1296;
mad.lo.s32 %r5349, %r5347, %r2606, %r5348;
mul.wide.s32 %rd1644, %r5349, 4;
add.s64 %rd1645, %rd2, %rd1644;
ld.global.f32 %f5347, [%rd1645];
$L__BB0_983:
add.s32 %r1299, %r1295, %r2602;
mad.lo.s32 %r1300, %r13, %r2603, %r1299;
@%p824 bra $L__BB0_986;
shl.b32 %r1301, %r12, 5;
neg.s32 %r5351, %r1301;
setp.ge.s32 %p841, %r14, %r5351;
@%p841 bra $L__BB0_986;
add.s32 %r5352, %r1301, %r1;
mul.hi.s32 %r5353, %r5352, 954437177;
shr.u32 %r5354, %r5353, 31;
shr.s32 %r5355, %r5353, 1;
add.s32 %r5356, %r5355, %r5354;
mul.lo.s32 %r5357, %r5356, 9;
sub.s32 %r5358, %r5352, %r5357;
mad.lo.s32 %r5359, %r5356, %r2605, %r1300;
mad.lo.s32 %r5360, %r5358, %r2606, %r5359;
mul.wide.s32 %rd1646, %r5360, 4;
add.s64 %rd1647, %rd2, %rd1646;
ld.global.f32 %f5346, [%rd1647];
$L__BB0_986:
@%p824 bra $L__BB0_989;
shl.b32 %r1302, %r12, 5;
mov.u32 %r5362, -32;
sub.s32 %r5363, %r5362, %r1302;
setp.ge.s32 %p843, %r14, %r5363;
@%p843 bra $L__BB0_989;
add.s32 %r5364, %r1302, %r1;
add.s32 %r5365, %r5364, 32;
mul.hi.s32 %r5366, %r5365, 954437177;
shr.u32 %r5367, %r5366, 31;
shr.s32 %r5368, %r5366, 1;
add.s32 %r5369, %r5368, %r5367;
mul.lo.s32 %r5370, %r5369, 9;
sub.s32 %r5371, %r5365, %r5370;
mad.lo.s32 %r5372, %r5369, %r2605, %r1300;
mad.lo.s32 %r5373, %r5371, %r2606, %r5372;
mul.wide.s32 %rd1648, %r5373, 4;
add.s64 %rd1649, %rd2, %rd1648;
ld.global.f32 %f5345, [%rd1649];
$L__BB0_989:
add.s32 %r1303, %r1300, %r2603;
@%p828 bra $L__BB0_992;
shl.b32 %r1304, %r12, 5;
neg.s32 %r5375, %r1304;
setp.ge.s32 %p845, %r14, %r5375;
@%p845 bra $L__BB0_992;
add.s32 %r5376, %r1304, %r1;
mul.hi.s32 %r5377, %r5376, 954437177;
shr.u32 %r5378, %r5377, 31;
shr.s32 %r5379, %r5377, 1;
add.s32 %r5380, %r5379, %r5378;
mul.lo.s32 %r5381, %r5380, 9;
sub.s32 %r5382, %r5376, %r5381;
mad.lo.s32 %r5383, %r5380, %r2605, %r1303;
mad.lo.s32 %r5384, %r5382, %r2606, %r5383;
mul.wide.s32 %rd1650, %r5384, 4;
add.s64 %rd1651, %rd2, %rd1650;
ld.global.f32 %f5344, [%rd1651];
$L__BB0_992:
@%p828 bra $L__BB0_995;
shl.b32 %r1305, %r12, 5;
mov.u32 %r5386, -32;
sub.s32 %r5387, %r5386, %r1305;
setp.ge.s32 %p847, %r14, %r5387;
@%p847 bra $L__BB0_995;
add.s32 %r5388, %r1305, %r1;
add.s32 %r5389, %r5388, 32;
mul.hi.s32 %r5390, %r5389, 954437177;
shr.u32 %r5391, %r5390, 31;
shr.s32 %r5392, %r5390, 1;
add.s32 %r5393, %r5392, %r5391;
mul.lo.s32 %r5394, %r5393, 9;
sub.s32 %r5395, %r5389, %r5394;
mad.lo.s32 %r5396, %r5393, %r2605, %r1303;
mad.lo.s32 %r5397, %r5395, %r2606, %r5396;
mul.wide.s32 %rd1652, %r5397, 4;
add.s64 %rd1653, %rd2, %rd1652;
ld.global.f32 %f5343, [%rd1653];
$L__BB0_995:
add.s32 %r1306, %r1299, %r1294;
@%p832 bra $L__BB0_998;
shl.b32 %r1307, %r12, 5;
neg.s32 %r5399, %r1307;
setp.ge.s32 %p849, %r14, %r5399;
@%p849 bra $L__BB0_998;
add.s32 %r5400, %r1307, %r1;
mul.hi.s32 %r5401, %r5400, 954437177;
shr.u32 %r5402, %r5401, 31;
shr.s32 %r5403, %r5401, 1;
add.s32 %r5404, %r5403, %r5402;
mul.lo.s32 %r5405, %r5404, 9;
sub.s32 %r5406, %r5400, %r5405;
mad.lo.s32 %r5407, %r5404, %r2605, %r1306;
mad.lo.s32 %r5408, %r5406, %r2606, %r5407;
mul.wide.s32 %rd1654, %r5408, 4;
add.s64 %rd1655, %rd2, %rd1654;
ld.global.f32 %f5342, [%rd1655];
$L__BB0_998:
@%p832 bra $L__BB0_1001;
shl.b32 %r1308, %r12, 5;
mov.u32 %r5410, -32;
sub.s32 %r5411, %r5410, %r1308;
setp.ge.s32 %p851, %r14, %r5411;
@%p851 bra $L__BB0_1001;
add.s32 %r5412, %r1308, %r1;
add.s32 %r5413, %r5412, 32;
mul.hi.s32 %r5414, %r5413, 954437177;
shr.u32 %r5415, %r5414, 31;
shr.s32 %r5416, %r5414, 1;
add.s32 %r5417, %r5416, %r5415;
mul.lo.s32 %r5418, %r5417, 9;
sub.s32 %r5419, %r5413, %r5418;
mad.lo.s32 %r5420, %r5417, %r2605, %r1306;
mad.lo.s32 %r5421, %r5419, %r2606, %r5420;
mul.wide.s32 %rd1656, %r5421, 4;
add.s64 %rd1657, %rd2, %rd1656;
ld.global.f32 %f5341, [%rd1657];
$L__BB0_1001:
@%p820 bra $L__BB0_1030;
shl.b32 %r5423, %r12, 5;
neg.s32 %r1309, %r5423;
setp.ge.s32 %p853, %r14, %r1309;
@%p853 bra $L__BB0_1015;
mul.f32 %f3636, %f5348, 0f3F22F983;
cvt.rni.s32.f32 %r8485, %f3636;
cvt.rn.f32.s32 %f3637, %r8485;
mov.f32 %f3638, 0fBFC90FDA;
fma.rn.f32 %f3639, %f3637, %f3638, %f5348;
mov.f32 %f3640, 0fB3A22168;
fma.rn.f32 %f3641, %f3637, %f3640, %f3639;
mov.f32 %f3642, 0fA7C234C5;
fma.rn.f32 %f5560, %f3637, %f3642, %f3641;
abs.f32 %f1116, %f5348;
setp.ltu.f32 %p854, %f1116, 0f47CE4780;
@%p854 bra $L__BB0_1011;
setp.eq.f32 %p855, %f1116, 0f7F800000;
@%p855 bra $L__BB0_1010;
bra.uni $L__BB0_1005;
$L__BB0_1010:
mov.f32 %f3645, 0f00000000;
mul.rn.f32 %f5560, %f5348, %f3645;
mov.u32 %r8485, 0;
bra.uni $L__BB0_1011;
$L__BB0_1235:
mov.b32 %r1614, %f1390;
shr.u32 %r6014, %r1614, 23;
and.b32 %r6015, %r6014, 255;
add.s32 %r1615, %r6015, -128;
shl.b32 %r6016, %r1614, 8;
or.b32 %r1616, %r6016, -2147483648;
shr.u32 %r1617, %r1615, 5;
mov.u64 %rd2660, 0;
mov.u32 %r8546, 0;
mov.u64 %rd1906, __cudart_i2opi_f;
mov.u64 %rd2661, %rd2660;
$L__BB0_1236:
.pragma "nounroll";
shl.b64 %rd1905, %rd2660, 2;
add.s64 %rd1907, %rd1906, %rd1905;
ld.global.nc.u32 %r6017, [%rd1907];
mad.wide.u32 %rd1908, %r6017, %r1616, %rd2661;
shr.u64 %rd2661, %rd1908, 32;
add.s64 %rd1909, %rd1, %rd1905;
st.local.u32 [%rd1909], %rd1908;
add.s32 %r8546, %r8546, 1;
cvt.s64.s32 %rd2660, %r8546;
setp.ne.s32 %p1054, %r8546, 6;
@%p1054 bra $L__BB0_1236;
st.local.u32 [%rd5], %rd2661;
mov.u32 %r6018, 4;
sub.s32 %r1620, %r6018, %r1617;
mov.u32 %r6019, 6;
sub.s32 %r6020, %r6019, %r1617;
mul.wide.s32 %rd1910, %r6020, 4;
add.s64 %rd1911, %rd1, %rd1910;
ld.local.u32 %r8547, [%rd1911];
ld.local.u32 %r8548, [%rd1911+-4];
and.b32 %r1623, %r1615, 31;
setp.eq.s32 %p1055, %r1623, 0;
@%p1055 bra $L__BB0_1239;
mov.u32 %r6021, 32;
sub.s32 %r6022, %r6021, %r1623;
shr.u32 %r6023, %r8548, %r6022;
shl.b32 %r6024, %r8547, %r1623;
add.s32 %r8547, %r6023, %r6024;
mul.wide.s32 %rd1912, %r1620, 4;
add.s64 %rd1913, %rd1, %rd1912;
ld.local.u32 %r6025, [%rd1913];
shr.u32 %r6026, %r6025, %r6022;
shl.b32 %r6027, %r8548, %r1623;
add.s32 %r8548, %r6026, %r6027;
$L__BB0_1239:
and.b32 %r6028, %r1614, -2147483648;
shr.u32 %r6029, %r8548, 30;
shl.b32 %r6030, %r8547, 2;
or.b32 %r6031, %r6029, %r6030;
shr.u32 %r6032, %r6031, 31;
shr.u32 %r6033, %r8547, 30;
add.s32 %r6034, %r6032, %r6033;
neg.s32 %r6035, %r6034;
setp.eq.s32 %p1056, %r6028, 0;
selp.b32 %r8549, %r6034, %r6035, %p1056;
setp.ne.s32 %p1057, %r6032, 0;
xor.b32 %r6036, %r6028, -2147483648;
selp.b32 %r6037, %r6036, %r6028, %p1057;
selp.b32 %r6038, -1, 0, %p1057;
xor.b32 %r6039, %r6031, %r6038;
shl.b32 %r6040, %r8548, 2;
xor.b32 %r6041, %r6040, %r6038;
cvt.u64.u32 %rd1914, %r6039;
cvt.u64.u32 %rd1915, %r6041;
bfi.b64 %rd1916, %rd1914, %rd1915, 32, 32;
cvt.rn.f64.s64 %fd161, %rd1916;
mul.f64 %fd162, %fd161, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3994, %fd162;
setp.eq.s32 %p1058, %r6037, 0;
neg.f32 %f3995, %f3994;
selp.f32 %f5659, %f3994, %f3995, %p1058;
$L__BB0_1241:
and.b32 %r1630, %r8549, 1;
setp.eq.s32 %p1059, %r1630, 0;
selp.f32 %f1403, %f5659, 0f3F800000, %p1059;
mul.rn.f32 %f1404, %f5659, %f5659;
mov.f32 %f5660, 0fB94D4153;
@%p1059 bra $L__BB0_1243;
mov.f32 %f3998, 0fBAB607ED;
mov.f32 %f3999, 0f37CBAC00;
fma.rn.f32 %f5660, %f3999, %f1404, %f3998;
$L__BB0_1243:
selp.f32 %f4000, 0f3C0885E4, 0f3D2AAABB, %p1059;
fma.rn.f32 %f4001, %f5660, %f1404, %f4000;
selp.f32 %f4002, 0fBE2AAAA8, 0fBEFFFFFF, %p1059;
fma.rn.f32 %f4003, %f4001, %f1404, %f4002;
mov.f32 %f4004, 0f00000000;
fma.rn.f32 %f4005, %f1404, %f1403, %f4004;
fma.rn.f32 %f5661, %f4003, %f4005, %f1403;
and.b32 %r6043, %r8549, 2;
setp.eq.s32 %p1061, %r6043, 0;
@%p1061 bra $L__BB0_1245;
mov.f32 %f4007, 0fBF800000;
fma.rn.f32 %f5661, %f5661, %f4007, %f4004;
$L__BB0_1245:
mul.f32 %f4008, %f1382, 0f3F22F983;
cvt.rni.s32.f32 %r8553, %f4008;
cvt.rn.f32.s32 %f4009, %r8553;
mov.f32 %f4010, 0fBFC90FDA;
fma.rn.f32 %f4011, %f4009, %f4010, %f1382;
mov.f32 %f4012, 0fB3A22168;
fma.rn.f32 %f4013, %f4009, %f4012, %f4011;
mov.f32 %f4014, 0fA7C234C5;
fma.rn.f32 %f5662, %f4009, %f4014, %f4013;
abs.f32 %f1411, %f1382;
setp.ltu.f32 %p1062, %f1411, 0f47CE4780;
@%p1062 bra $L__BB0_1253;
setp.eq.f32 %p1063, %f1411, 0f7F800000;
@%p1063 bra $L__BB0_1252;
bra.uni $L__BB0_1247;
$L__BB0_1252:
mov.f32 %f4017, 0f00000000;
mul.rn.f32 %f5662, %f1382, %f4017;
mov.u32 %r8553, 0;
bra.uni $L__BB0_1253;
$L__BB0_1247:
mov.b32 %r1632, %f1382;
shr.u32 %r6045, %r1632, 23;
and.b32 %r6046, %r6045, 255;
add.s32 %r1633, %r6046, -128;
shl.b32 %r6047, %r1632, 8;
or.b32 %r1634, %r6047, -2147483648;
shr.u32 %r1635, %r1633, 5;
mov.u64 %rd2662, 0;
mov.u32 %r8550, 0;
mov.u64 %rd1920, __cudart_i2opi_f;
mov.u64 %rd2663, %rd2662;
$L__BB0_1248:
.pragma "nounroll";
shl.b64 %rd1919, %rd2662, 2;
add.s64 %rd1921, %rd1920, %rd1919;
ld.global.nc.u32 %r6048, [%rd1921];
mad.wide.u32 %rd1922, %r6048, %r1634, %rd2663;
shr.u64 %rd2663, %rd1922, 32;
add.s64 %rd1923, %rd1, %rd1919;
st.local.u32 [%rd1923], %rd1922;
add.s32 %r8550, %r8550, 1;
cvt.s64.s32 %rd2662, %r8550;
setp.ne.s32 %p1064, %r8550, 6;
@%p1064 bra $L__BB0_1248;
st.local.u32 [%rd5], %rd2663;
mov.u32 %r6049, 4;
sub.s32 %r1638, %r6049, %r1635;
mov.u32 %r6050, 6;
sub.s32 %r6051, %r6050, %r1635;
mul.wide.s32 %rd1924, %r6051, 4;
add.s64 %rd1925, %rd1, %rd1924;
ld.local.u32 %r8551, [%rd1925];
ld.local.u32 %r8552, [%rd1925+-4];
and.b32 %r1641, %r1633, 31;
setp.eq.s32 %p1065, %r1641, 0;
@%p1065 bra $L__BB0_1251;
mov.u32 %r6052, 32;
sub.s32 %r6053, %r6052, %r1641;
shr.u32 %r6054, %r8552, %r6053;
shl.b32 %r6055, %r8551, %r1641;
add.s32 %r8551, %r6054, %r6055;
mul.wide.s32 %rd1926, %r1638, 4;
add.s64 %rd1927, %rd1, %rd1926;
ld.local.u32 %r6056, [%rd1927];
shr.u32 %r6057, %r6056, %r6053;
shl.b32 %r6058, %r8552, %r1641;
add.s32 %r8552, %r6057, %r6058;
$L__BB0_1251:
and.b32 %r6059, %r1632, -2147483648;
shr.u32 %r6060, %r8552, 30;
shl.b32 %r6061, %r8551, 2;
or.b32 %r6062, %r6060, %r6061;
shr.u32 %r6063, %r6062, 31;
shr.u32 %r6064, %r8551, 30;
add.s32 %r6065, %r6063, %r6064;
neg.s32 %r6066, %r6065;
setp.eq.s32 %p1066, %r6059, 0;
selp.b32 %r8553, %r6065, %r6066, %p1066;
setp.ne.s32 %p1067, %r6063, 0;
xor.b32 %r6067, %r6059, -2147483648;
selp.b32 %r6068, %r6067, %r6059, %p1067;
selp.b32 %r6069, -1, 0, %p1067;
xor.b32 %r6070, %r6062, %r6069;
shl.b32 %r6071, %r8552, 2;
xor.b32 %r6072, %r6071, %r6069;
cvt.u64.u32 %rd1928, %r6070;
cvt.u64.u32 %rd1929, %r6072;
bfi.b64 %rd1930, %rd1928, %rd1929, 32, 32;
cvt.rn.f64.s64 %fd163, %rd1930;
mul.f64 %fd164, %fd163, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4015, %fd164;
setp.eq.s32 %p1068, %r6068, 0;
neg.f32 %f4016, %f4015;
selp.f32 %f5662, %f4015, %f4016, %p1068;
$L__BB0_1253:
add.s32 %r1648, %r8553, 1;
and.b32 %r1649, %r1648, 1;
setp.eq.s32 %p1069, %r1649, 0;
selp.f32 %f1415, %f5662, 0f3F800000, %p1069;
mul.rn.f32 %f1416, %f5662, %f5662;
mov.f32 %f5663, 0fB94D4153;
@%p1069 bra $L__BB0_1255;
mov.f32 %f4019, 0fBAB607ED;
mov.f32 %f4020, 0f37CBAC00;
fma.rn.f32 %f5663, %f4020, %f1416, %f4019;
$L__BB0_1255:
selp.f32 %f4021, 0f3C0885E4, 0f3D2AAABB, %p1069;
fma.rn.f32 %f4022, %f5663, %f1416, %f4021;
selp.f32 %f4023, 0fBE2AAAA8, 0fBEFFFFFF, %p1069;
fma.rn.f32 %f4024, %f4022, %f1416, %f4023;
mov.f32 %f4025, 0f00000000;
fma.rn.f32 %f4026, %f1416, %f1415, %f4025;
fma.rn.f32 %f5664, %f4024, %f4026, %f1415;
and.b32 %r6074, %r1648, 2;
setp.eq.s32 %p1071, %r6074, 0;
@%p1071 bra $L__BB0_1257;
mov.f32 %f4028, 0fBF800000;
fma.rn.f32 %f5664, %f5664, %f4028, %f4025;
$L__BB0_1257:
add.f32 %f5714, %f5661, %f5664;
mul.f32 %f4029, %f1391, 0f3F22F983;
cvt.rni.s32.f32 %r8557, %f4029;
cvt.rn.f32.s32 %f4030, %r8557;
mov.f32 %f4031, 0fBFC90FDA;
fma.rn.f32 %f4032, %f4030, %f4031, %f1391;
mov.f32 %f4033, 0fB3A22168;
fma.rn.f32 %f4034, %f4030, %f4033, %f4032;
mov.f32 %f4035, 0fA7C234C5;
fma.rn.f32 %f5665, %f4030, %f4035, %f4034;
abs.f32 %f1424, %f1391;
setp.ltu.f32 %p1072, %f1424, 0f47CE4780;
@%p1072 bra $L__BB0_1265;
setp.eq.f32 %p1073, %f1424, 0f7F800000;
@%p1073 bra $L__BB0_1264;
bra.uni $L__BB0_1259;
$L__BB0_1264:
mov.f32 %f4038, 0f00000000;
mul.rn.f32 %f5665, %f1391, %f4038;
mov.u32 %r8557, 0;
bra.uni $L__BB0_1265;
$L__BB0_1259:
mov.b32 %r1651, %f1391;
shr.u32 %r6076, %r1651, 23;
and.b32 %r6077, %r6076, 255;
add.s32 %r1652, %r6077, -128;
shl.b32 %r6078, %r1651, 8;
or.b32 %r1653, %r6078, -2147483648;
shr.u32 %r1654, %r1652, 5;
mov.u64 %rd2664, 0;
mov.u32 %r8554, 0;
mov.u64 %rd1934, __cudart_i2opi_f;
mov.u64 %rd2665, %rd2664;
$L__BB0_1260:
.pragma "nounroll";
shl.b64 %rd1933, %rd2664, 2;
add.s64 %rd1935, %rd1934, %rd1933;
ld.global.nc.u32 %r6079, [%rd1935];
mad.wide.u32 %rd1936, %r6079, %r1653, %rd2665;
shr.u64 %rd2665, %rd1936, 32;
add.s64 %rd1937, %rd1, %rd1933;
st.local.u32 [%rd1937], %rd1936;
add.s32 %r8554, %r8554, 1;
cvt.s64.s32 %rd2664, %r8554;
setp.ne.s32 %p1074, %r8554, 6;
@%p1074 bra $L__BB0_1260;
st.local.u32 [%rd5], %rd2665;
mov.u32 %r6080, 4;
sub.s32 %r1657, %r6080, %r1654;
mov.u32 %r6081, 6;
sub.s32 %r6082, %r6081, %r1654;
mul.wide.s32 %rd1938, %r6082, 4;
add.s64 %rd1939, %rd1, %rd1938;
ld.local.u32 %r8555, [%rd1939];
ld.local.u32 %r8556, [%rd1939+-4];
and.b32 %r1660, %r1652, 31;
setp.eq.s32 %p1075, %r1660, 0;
@%p1075 bra $L__BB0_1263;
mov.u32 %r6083, 32;
sub.s32 %r6084, %r6083, %r1660;
shr.u32 %r6085, %r8556, %r6084;
shl.b32 %r6086, %r8555, %r1660;
add.s32 %r8555, %r6085, %r6086;
mul.wide.s32 %rd1940, %r1657, 4;
add.s64 %rd1941, %rd1, %rd1940;
ld.local.u32 %r6087, [%rd1941];
shr.u32 %r6088, %r6087, %r6084;
shl.b32 %r6089, %r8556, %r1660;
add.s32 %r8556, %r6088, %r6089;
$L__BB0_1263:
and.b32 %r6090, %r1651, -2147483648;
shr.u32 %r6091, %r8556, 30;
shl.b32 %r6092, %r8555, 2;
or.b32 %r6093, %r6091, %r6092;
shr.u32 %r6094, %r6093, 31;
shr.u32 %r6095, %r8555, 30;
add.s32 %r6096, %r6094, %r6095;
neg.s32 %r6097, %r6096;
setp.eq.s32 %p1076, %r6090, 0;
selp.b32 %r8557, %r6096, %r6097, %p1076;
setp.ne.s32 %p1077, %r6094, 0;
xor.b32 %r6098, %r6090, -2147483648;
selp.b32 %r6099, %r6098, %r6090, %p1077;
selp.b32 %r6100, -1, 0, %p1077;
xor.b32 %r6101, %r6093, %r6100;
shl.b32 %r6102, %r8556, 2;
xor.b32 %r6103, %r6102, %r6100;
cvt.u64.u32 %rd1942, %r6101;
cvt.u64.u32 %rd1943, %r6103;
bfi.b64 %rd1944, %rd1942, %rd1943, 32, 32;
cvt.rn.f64.s64 %fd165, %rd1944;
mul.f64 %fd166, %fd165, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4036, %fd166;
setp.eq.s32 %p1078, %r6099, 0;
neg.f32 %f4037, %f4036;
selp.f32 %f5665, %f4036, %f4037, %p1078;
$L__BB0_1265:
and.b32 %r1667, %r8557, 1;
setp.eq.s32 %p1079, %r1667, 0;
selp.f32 %f1428, %f5665, 0f3F800000, %p1079;
mul.rn.f32 %f1429, %f5665, %f5665;
mov.f32 %f5666, 0fB94D4153;
@%p1079 bra $L__BB0_1267;
mov.f32 %f4040, 0fBAB607ED;
mov.f32 %f4041, 0f37CBAC00;
fma.rn.f32 %f5666, %f4041, %f1429, %f4040;
$L__BB0_1267:
selp.f32 %f4042, 0f3C0885E4, 0f3D2AAABB, %p1079;
fma.rn.f32 %f4043, %f5666, %f1429, %f4042;
selp.f32 %f4044, 0fBE2AAAA8, 0fBEFFFFFF, %p1079;
fma.rn.f32 %f4045, %f4043, %f1429, %f4044;
mov.f32 %f4046, 0f00000000;
fma.rn.f32 %f4047, %f1429, %f1428, %f4046;
fma.rn.f32 %f5667, %f4045, %f4047, %f1428;
and.b32 %r6105, %r8557, 2;
setp.eq.s32 %p1081, %r6105, 0;
@%p1081 bra $L__BB0_1269;
mov.f32 %f4049, 0fBF800000;
fma.rn.f32 %f5667, %f5667, %f4049, %f4046;
$L__BB0_1269:
mul.f32 %f4050, %f1383, 0f3F22F983;
cvt.rni.s32.f32 %r8561, %f4050;
cvt.rn.f32.s32 %f4051, %r8561;
mov.f32 %f4052, 0fBFC90FDA;
fma.rn.f32 %f4053, %f4051, %f4052, %f1383;
mov.f32 %f4054, 0fB3A22168;
fma.rn.f32 %f4055, %f4051, %f4054, %f4053;
mov.f32 %f4056, 0fA7C234C5;
fma.rn.f32 %f5668, %f4051, %f4056, %f4055;
abs.f32 %f1436, %f1383;
setp.ltu.f32 %p1082, %f1436, 0f47CE4780;
@%p1082 bra $L__BB0_1277;
setp.eq.f32 %p1083, %f1436, 0f7F800000;
@%p1083 bra $L__BB0_1276;
bra.uni $L__BB0_1271;
$L__BB0_1276:
mov.f32 %f4059, 0f00000000;
mul.rn.f32 %f5668, %f1383, %f4059;
mov.u32 %r8561, 0;
bra.uni $L__BB0_1277;
$L__BB0_1271:
mov.b32 %r1669, %f1383;
shr.u32 %r6107, %r1669, 23;
and.b32 %r6108, %r6107, 255;
add.s32 %r1670, %r6108, -128;
shl.b32 %r6109, %r1669, 8;
or.b32 %r1671, %r6109, -2147483648;
shr.u32 %r1672, %r1670, 5;
mov.u64 %rd2666, 0;
mov.u32 %r8558, 0;
mov.u64 %rd1948, __cudart_i2opi_f;
mov.u64 %rd2667, %rd2666;
$L__BB0_1272:
.pragma "nounroll";
shl.b64 %rd1947, %rd2666, 2;
add.s64 %rd1949, %rd1948, %rd1947;
ld.global.nc.u32 %r6110, [%rd1949];
mad.wide.u32 %rd1950, %r6110, %r1671, %rd2667;
shr.u64 %rd2667, %rd1950, 32;
add.s64 %rd1951, %rd1, %rd1947;
st.local.u32 [%rd1951], %rd1950;
add.s32 %r8558, %r8558, 1;
cvt.s64.s32 %rd2666, %r8558;
setp.ne.s32 %p1084, %r8558, 6;
@%p1084 bra $L__BB0_1272;
st.local.u32 [%rd5], %rd2667;
mov.u32 %r6111, 4;
sub.s32 %r1675, %r6111, %r1672;
mov.u32 %r6112, 6;
sub.s32 %r6113, %r6112, %r1672;
mul.wide.s32 %rd1952, %r6113, 4;
add.s64 %rd1953, %rd1, %rd1952;
ld.local.u32 %r8559, [%rd1953];
ld.local.u32 %r8560, [%rd1953+-4];
and.b32 %r1678, %r1670, 31;
setp.eq.s32 %p1085, %r1678, 0;
@%p1085 bra $L__BB0_1275;
mov.u32 %r6114, 32;
sub.s32 %r6115, %r6114, %r1678;
shr.u32 %r6116, %r8560, %r6115;
shl.b32 %r6117, %r8559, %r1678;
add.s32 %r8559, %r6116, %r6117;
mul.wide.s32 %rd1954, %r1675, 4;
add.s64 %rd1955, %rd1, %rd1954;
ld.local.u32 %r6118, [%rd1955];
shr.u32 %r6119, %r6118, %r6115;
shl.b32 %r6120, %r8560, %r1678;
add.s32 %r8560, %r6119, %r6120;
$L__BB0_1275:
and.b32 %r6121, %r1669, -2147483648;
shr.u32 %r6122, %r8560, 30;
shl.b32 %r6123, %r8559, 2;
or.b32 %r6124, %r6122, %r6123;
shr.u32 %r6125, %r6124, 31;
shr.u32 %r6126, %r8559, 30;
add.s32 %r6127, %r6125, %r6126;
neg.s32 %r6128, %r6127;
setp.eq.s32 %p1086, %r6121, 0;
selp.b32 %r8561, %r6127, %r6128, %p1086;
setp.ne.s32 %p1087, %r6125, 0;
xor.b32 %r6129, %r6121, -2147483648;
selp.b32 %r6130, %r6129, %r6121, %p1087;
selp.b32 %r6131, -1, 0, %p1087;
xor.b32 %r6132, %r6124, %r6131;
shl.b32 %r6133, %r8560, 2;
xor.b32 %r6134, %r6133, %r6131;
cvt.u64.u32 %rd1956, %r6132;
cvt.u64.u32 %rd1957, %r6134;
bfi.b64 %rd1958, %rd1956, %rd1957, 32, 32;
cvt.rn.f64.s64 %fd167, %rd1958;
mul.f64 %fd168, %fd167, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4057, %fd168;
setp.eq.s32 %p1088, %r6130, 0;
neg.f32 %f4058, %f4057;
selp.f32 %f5668, %f4057, %f4058, %p1088;
$L__BB0_1277:
add.s32 %r1685, %r8561, 1;
and.b32 %r1686, %r1685, 1;
setp.eq.s32 %p1089, %r1686, 0;
selp.f32 %f1440, %f5668, 0f3F800000, %p1089;
mul.rn.f32 %f1441, %f5668, %f5668;
mov.f32 %f5669, 0fB94D4153;
@%p1089 bra $L__BB0_1279;
mov.f32 %f4061, 0fBAB607ED;
mov.f32 %f4062, 0f37CBAC00;
fma.rn.f32 %f5669, %f4062, %f1441, %f4061;
$L__BB0_1279:
selp.f32 %f4063, 0f3C0885E4, 0f3D2AAABB, %p1089;
fma.rn.f32 %f4064, %f5669, %f1441, %f4063;
selp.f32 %f4065, 0fBE2AAAA8, 0fBEFFFFFF, %p1089;
fma.rn.f32 %f4066, %f4064, %f1441, %f4065;
mov.f32 %f4067, 0f00000000;
fma.rn.f32 %f4068, %f1441, %f1440, %f4067;
fma.rn.f32 %f5670, %f4066, %f4068, %f1440;
and.b32 %r6136, %r1685, 2;
setp.eq.s32 %p1091, %r6136, 0;
@%p1091 bra $L__BB0_1281;
mov.f32 %f4070, 0fBF800000;
fma.rn.f32 %f5670, %f5670, %f4070, %f4067;
$L__BB0_1281:
add.f32 %f5713, %f5667, %f5670;
mul.f32 %f4071, %f1392, 0f3F22F983;
cvt.rni.s32.f32 %r8565, %f4071;
cvt.rn.f32.s32 %f4072, %r8565;
mov.f32 %f4073, 0fBFC90FDA;
fma.rn.f32 %f4074, %f4072, %f4073, %f1392;
mov.f32 %f4075, 0fB3A22168;
fma.rn.f32 %f4076, %f4072, %f4075, %f4074;
mov.f32 %f4077, 0fA7C234C5;
fma.rn.f32 %f5671, %f4072, %f4077, %f4076;
abs.f32 %f1449, %f1392;
setp.ltu.f32 %p1092, %f1449, 0f47CE4780;
@%p1092 bra $L__BB0_1289;
setp.eq.f32 %p1093, %f1449, 0f7F800000;
@%p1093 bra $L__BB0_1288;
bra.uni $L__BB0_1283;
$L__BB0_1288:
mov.f32 %f4080, 0f00000000;
mul.rn.f32 %f5671, %f1392, %f4080;
mov.u32 %r8565, 0;
bra.uni $L__BB0_1289;
$L__BB0_1283:
mov.b32 %r1688, %f1392;
shr.u32 %r6138, %r1688, 23;
and.b32 %r6139, %r6138, 255;
add.s32 %r1689, %r6139, -128;
shl.b32 %r6140, %r1688, 8;
or.b32 %r1690, %r6140, -2147483648;
shr.u32 %r1691, %r1689, 5;
mov.u64 %rd2668, 0;
mov.u32 %r8562, 0;
mov.u64 %rd1962, __cudart_i2opi_f;
mov.u64 %rd2669, %rd2668;
$L__BB0_1284:
.pragma "nounroll";
shl.b64 %rd1961, %rd2668, 2;
add.s64 %rd1963, %rd1962, %rd1961;
ld.global.nc.u32 %r6141, [%rd1963];
mad.wide.u32 %rd1964, %r6141, %r1690, %rd2669;
shr.u64 %rd2669, %rd1964, 32;
add.s64 %rd1965, %rd1, %rd1961;
st.local.u32 [%rd1965], %rd1964;
add.s32 %r8562, %r8562, 1;
cvt.s64.s32 %rd2668, %r8562;
setp.ne.s32 %p1094, %r8562, 6;
@%p1094 bra $L__BB0_1284;
st.local.u32 [%rd5], %rd2669;
mov.u32 %r6142, 4;
sub.s32 %r1694, %r6142, %r1691;
mov.u32 %r6143, 6;
sub.s32 %r6144, %r6143, %r1691;
mul.wide.s32 %rd1966, %r6144, 4;
add.s64 %rd1967, %rd1, %rd1966;
ld.local.u32 %r8563, [%rd1967];
ld.local.u32 %r8564, [%rd1967+-4];
and.b32 %r1697, %r1689, 31;
setp.eq.s32 %p1095, %r1697, 0;
@%p1095 bra $L__BB0_1287;
mov.u32 %r6145, 32;
sub.s32 %r6146, %r6145, %r1697;
shr.u32 %r6147, %r8564, %r6146;
shl.b32 %r6148, %r8563, %r1697;
add.s32 %r8563, %r6147, %r6148;
mul.wide.s32 %rd1968, %r1694, 4;
add.s64 %rd1969, %rd1, %rd1968;
ld.local.u32 %r6149, [%rd1969];
shr.u32 %r6150, %r6149, %r6146;
shl.b32 %r6151, %r8564, %r1697;
add.s32 %r8564, %r6150, %r6151;
$L__BB0_1287:
and.b32 %r6152, %r1688, -2147483648;
shr.u32 %r6153, %r8564, 30;
shl.b32 %r6154, %r8563, 2;
or.b32 %r6155, %r6153, %r6154;
shr.u32 %r6156, %r6155, 31;
shr.u32 %r6157, %r8563, 30;
add.s32 %r6158, %r6156, %r6157;
neg.s32 %r6159, %r6158;
setp.eq.s32 %p1096, %r6152, 0;
selp.b32 %r8565, %r6158, %r6159, %p1096;
setp.ne.s32 %p1097, %r6156, 0;
xor.b32 %r6160, %r6152, -2147483648;
selp.b32 %r6161, %r6160, %r6152, %p1097;
selp.b32 %r6162, -1, 0, %p1097;
xor.b32 %r6163, %r6155, %r6162;
shl.b32 %r6164, %r8564, 2;
xor.b32 %r6165, %r6164, %r6162;
cvt.u64.u32 %rd1970, %r6163;
cvt.u64.u32 %rd1971, %r6165;
bfi.b64 %rd1972, %rd1970, %rd1971, 32, 32;
cvt.rn.f64.s64 %fd169, %rd1972;
mul.f64 %fd170, %fd169, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4078, %fd170;
setp.eq.s32 %p1098, %r6161, 0;
neg.f32 %f4079, %f4078;
selp.f32 %f5671, %f4078, %f4079, %p1098;
$L__BB0_1289:
and.b32 %r1704, %r8565, 1;
setp.eq.s32 %p1099, %r1704, 0;
selp.f32 %f1453, %f5671, 0f3F800000, %p1099;
mul.rn.f32 %f1454, %f5671, %f5671;
mov.f32 %f5672, 0fB94D4153;
@%p1099 bra $L__BB0_1291;
mov.f32 %f4082, 0fBAB607ED;
mov.f32 %f4083, 0f37CBAC00;
fma.rn.f32 %f5672, %f4083, %f1454, %f4082;
$L__BB0_1291:
selp.f32 %f4084, 0f3C0885E4, 0f3D2AAABB, %p1099;
fma.rn.f32 %f4085, %f5672, %f1454, %f4084;
selp.f32 %f4086, 0fBE2AAAA8, 0fBEFFFFFF, %p1099;
fma.rn.f32 %f4087, %f4085, %f1454, %f4086;
mov.f32 %f4088, 0f00000000;
fma.rn.f32 %f4089, %f1454, %f1453, %f4088;
fma.rn.f32 %f5673, %f4087, %f4089, %f1453;
and.b32 %r6167, %r8565, 2;
setp.eq.s32 %p1101, %r6167, 0;
@%p1101 bra $L__BB0_1293;
mov.f32 %f4091, 0fBF800000;
fma.rn.f32 %f5673, %f5673, %f4091, %f4088;
$L__BB0_1293:
mul.f32 %f4092, %f1384, 0f3F22F983;
cvt.rni.s32.f32 %r8569, %f4092;
cvt.rn.f32.s32 %f4093, %r8569;
mov.f32 %f4094, 0fBFC90FDA;
fma.rn.f32 %f4095, %f4093, %f4094, %f1384;
mov.f32 %f4096, 0fB3A22168;
fma.rn.f32 %f4097, %f4093, %f4096, %f4095;
mov.f32 %f4098, 0fA7C234C5;
fma.rn.f32 %f5674, %f4093, %f4098, %f4097;
abs.f32 %f1461, %f1384;
setp.ltu.f32 %p1102, %f1461, 0f47CE4780;
@%p1102 bra $L__BB0_1301;
setp.eq.f32 %p1103, %f1461, 0f7F800000;
@%p1103 bra $L__BB0_1300;
bra.uni $L__BB0_1295;
$L__BB0_1300:
mov.f32 %f4101, 0f00000000;
mul.rn.f32 %f5674, %f1384, %f4101;
mov.u32 %r8569, 0;
bra.uni $L__BB0_1301;
$L__BB0_1295:
mov.b32 %r1706, %f1384;
shr.u32 %r6169, %r1706, 23;
and.b32 %r6170, %r6169, 255;
add.s32 %r1707, %r6170, -128;
shl.b32 %r6171, %r1706, 8;
or.b32 %r1708, %r6171, -2147483648;
shr.u32 %r1709, %r1707, 5;
mov.u64 %rd2670, 0;
mov.u32 %r8566, 0;
mov.u64 %rd1976, __cudart_i2opi_f;
mov.u64 %rd2671, %rd2670;
$L__BB0_1296:
.pragma "nounroll";
shl.b64 %rd1975, %rd2670, 2;
add.s64 %rd1977, %rd1976, %rd1975;
ld.global.nc.u32 %r6172, [%rd1977];
mad.wide.u32 %rd1978, %r6172, %r1708, %rd2671;
shr.u64 %rd2671, %rd1978, 32;
add.s64 %rd1979, %rd1, %rd1975;
st.local.u32 [%rd1979], %rd1978;
add.s32 %r8566, %r8566, 1;
cvt.s64.s32 %rd2670, %r8566;
setp.ne.s32 %p1104, %r8566, 6;
@%p1104 bra $L__BB0_1296;
st.local.u32 [%rd5], %rd2671;
mov.u32 %r6173, 4;
sub.s32 %r1712, %r6173, %r1709;
mov.u32 %r6174, 6;
sub.s32 %r6175, %r6174, %r1709;
mul.wide.s32 %rd1980, %r6175, 4;
add.s64 %rd1981, %rd1, %rd1980;
ld.local.u32 %r8567, [%rd1981];
ld.local.u32 %r8568, [%rd1981+-4];
and.b32 %r1715, %r1707, 31;
setp.eq.s32 %p1105, %r1715, 0;
@%p1105 bra $L__BB0_1299;
mov.u32 %r6176, 32;
sub.s32 %r6177, %r6176, %r1715;
shr.u32 %r6178, %r8568, %r6177;
shl.b32 %r6179, %r8567, %r1715;
add.s32 %r8567, %r6178, %r6179;
mul.wide.s32 %rd1982, %r1712, 4;
add.s64 %rd1983, %rd1, %rd1982;
ld.local.u32 %r6180, [%rd1983];
shr.u32 %r6181, %r6180, %r6177;
shl.b32 %r6182, %r8568, %r1715;
add.s32 %r8568, %r6181, %r6182;
$L__BB0_1299:
and.b32 %r6183, %r1706, -2147483648;
shr.u32 %r6184, %r8568, 30;
shl.b32 %r6185, %r8567, 2;
or.b32 %r6186, %r6184, %r6185;
shr.u32 %r6187, %r6186, 31;
shr.u32 %r6188, %r8567, 30;
add.s32 %r6189, %r6187, %r6188;
neg.s32 %r6190, %r6189;
setp.eq.s32 %p1106, %r6183, 0;
selp.b32 %r8569, %r6189, %r6190, %p1106;
setp.ne.s32 %p1107, %r6187, 0;
xor.b32 %r6191, %r6183, -2147483648;
selp.b32 %r6192, %r6191, %r6183, %p1107;
selp.b32 %r6193, -1, 0, %p1107;
xor.b32 %r6194, %r6186, %r6193;
shl.b32 %r6195, %r8568, 2;
xor.b32 %r6196, %r6195, %r6193;
cvt.u64.u32 %rd1984, %r6194;
cvt.u64.u32 %rd1985, %r6196;
bfi.b64 %rd1986, %rd1984, %rd1985, 32, 32;
cvt.rn.f64.s64 %fd171, %rd1986;
mul.f64 %fd172, %fd171, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4099, %fd172;
setp.eq.s32 %p1108, %r6192, 0;
neg.f32 %f4100, %f4099;
selp.f32 %f5674, %f4099, %f4100, %p1108;
$L__BB0_1301:
add.s32 %r1722, %r8569, 1;
and.b32 %r1723, %r1722, 1;
setp.eq.s32 %p1109, %r1723, 0;
selp.f32 %f1465, %f5674, 0f3F800000, %p1109;
mul.rn.f32 %f1466, %f5674, %f5674;
mov.f32 %f5675, 0fB94D4153;
@%p1109 bra $L__BB0_1303;
mov.f32 %f4103, 0fBAB607ED;
mov.f32 %f4104, 0f37CBAC00;
fma.rn.f32 %f5675, %f4104, %f1466, %f4103;
$L__BB0_1303:
selp.f32 %f4105, 0f3C0885E4, 0f3D2AAABB, %p1109;
fma.rn.f32 %f4106, %f5675, %f1466, %f4105;
selp.f32 %f4107, 0fBE2AAAA8, 0fBEFFFFFF, %p1109;
fma.rn.f32 %f4108, %f4106, %f1466, %f4107;
mov.f32 %f4109, 0f00000000;
fma.rn.f32 %f4110, %f1466, %f1465, %f4109;
fma.rn.f32 %f5676, %f4108, %f4110, %f1465;
and.b32 %r6198, %r1722, 2;
setp.eq.s32 %p1111, %r6198, 0;
@%p1111 bra $L__BB0_1305;
mov.f32 %f4112, 0fBF800000;
fma.rn.f32 %f5676, %f5676, %f4112, %f4109;
$L__BB0_1305:
add.f32 %f5712, %f5673, %f5676;
mul.f32 %f4113, %f1393, 0f3F22F983;
cvt.rni.s32.f32 %r8573, %f4113;
cvt.rn.f32.s32 %f4114, %r8573;
mov.f32 %f4115, 0fBFC90FDA;
fma.rn.f32 %f4116, %f4114, %f4115, %f1393;
mov.f32 %f4117, 0fB3A22168;
fma.rn.f32 %f4118, %f4114, %f4117, %f4116;
mov.f32 %f4119, 0fA7C234C5;
fma.rn.f32 %f5677, %f4114, %f4119, %f4118;
abs.f32 %f1474, %f1393;
setp.ltu.f32 %p1112, %f1474, 0f47CE4780;
@%p1112 bra $L__BB0_1313;
setp.eq.f32 %p1113, %f1474, 0f7F800000;
@%p1113 bra $L__BB0_1312;
bra.uni $L__BB0_1307;
$L__BB0_1312:
mov.f32 %f4122, 0f00000000;
mul.rn.f32 %f5677, %f1393, %f4122;
mov.u32 %r8573, 0;
bra.uni $L__BB0_1313;
$L__BB0_1307:
mov.b32 %r1725, %f1393;
shr.u32 %r6200, %r1725, 23;
and.b32 %r6201, %r6200, 255;
add.s32 %r1726, %r6201, -128;
shl.b32 %r6202, %r1725, 8;
or.b32 %r1727, %r6202, -2147483648;
shr.u32 %r1728, %r1726, 5;
mov.u64 %rd2672, 0;
mov.u32 %r8570, 0;
mov.u64 %rd1990, __cudart_i2opi_f;
mov.u64 %rd2673, %rd2672;
$L__BB0_1308:
.pragma "nounroll";
shl.b64 %rd1989, %rd2672, 2;
add.s64 %rd1991, %rd1990, %rd1989;
ld.global.nc.u32 %r6203, [%rd1991];
mad.wide.u32 %rd1992, %r6203, %r1727, %rd2673;
shr.u64 %rd2673, %rd1992, 32;
add.s64 %rd1993, %rd1, %rd1989;
st.local.u32 [%rd1993], %rd1992;
add.s32 %r8570, %r8570, 1;
cvt.s64.s32 %rd2672, %r8570;
setp.ne.s32 %p1114, %r8570, 6;
@%p1114 bra $L__BB0_1308;
st.local.u32 [%rd5], %rd2673;
mov.u32 %r6204, 4;
sub.s32 %r1731, %r6204, %r1728;
mov.u32 %r6205, 6;
sub.s32 %r6206, %r6205, %r1728;
mul.wide.s32 %rd1994, %r6206, 4;
add.s64 %rd1995, %rd1, %rd1994;
ld.local.u32 %r8571, [%rd1995];
ld.local.u32 %r8572, [%rd1995+-4];
and.b32 %r1734, %r1726, 31;
setp.eq.s32 %p1115, %r1734, 0;
@%p1115 bra $L__BB0_1311;
mov.u32 %r6207, 32;
sub.s32 %r6208, %r6207, %r1734;
shr.u32 %r6209, %r8572, %r6208;
shl.b32 %r6210, %r8571, %r1734;
add.s32 %r8571, %r6209, %r6210;
mul.wide.s32 %rd1996, %r1731, 4;
add.s64 %rd1997, %rd1, %rd1996;
ld.local.u32 %r6211, [%rd1997];
shr.u32 %r6212, %r6211, %r6208;
shl.b32 %r6213, %r8572, %r1734;
add.s32 %r8572, %r6212, %r6213;
$L__BB0_1311:
and.b32 %r6214, %r1725, -2147483648;
shr.u32 %r6215, %r8572, 30;
shl.b32 %r6216, %r8571, 2;
or.b32 %r6217, %r6215, %r6216;
shr.u32 %r6218, %r6217, 31;
shr.u32 %r6219, %r8571, 30;
add.s32 %r6220, %r6218, %r6219;
neg.s32 %r6221, %r6220;
setp.eq.s32 %p1116, %r6214, 0;
selp.b32 %r8573, %r6220, %r6221, %p1116;
setp.ne.s32 %p1117, %r6218, 0;
xor.b32 %r6222, %r6214, -2147483648;
selp.b32 %r6223, %r6222, %r6214, %p1117;
selp.b32 %r6224, -1, 0, %p1117;
xor.b32 %r6225, %r6217, %r6224;
shl.b32 %r6226, %r8572, 2;
xor.b32 %r6227, %r6226, %r6224;
cvt.u64.u32 %rd1998, %r6225;
cvt.u64.u32 %rd1999, %r6227;
bfi.b64 %rd2000, %rd1998, %rd1999, 32, 32;
cvt.rn.f64.s64 %fd173, %rd2000;
mul.f64 %fd174, %fd173, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4120, %fd174;
setp.eq.s32 %p1118, %r6223, 0;
neg.f32 %f4121, %f4120;
selp.f32 %f5677, %f4120, %f4121, %p1118;
$L__BB0_1313:
and.b32 %r1741, %r8573, 1;
setp.eq.s32 %p1119, %r1741, 0;
selp.f32 %f1478, %f5677, 0f3F800000, %p1119;
mul.rn.f32 %f1479, %f5677, %f5677;
mov.f32 %f5678, 0fB94D4153;
@%p1119 bra $L__BB0_1315;
mov.f32 %f4124, 0fBAB607ED;
mov.f32 %f4125, 0f37CBAC00;
fma.rn.f32 %f5678, %f4125, %f1479, %f4124;
$L__BB0_1315:
selp.f32 %f4126, 0f3C0885E4, 0f3D2AAABB, %p1119;
fma.rn.f32 %f4127, %f5678, %f1479, %f4126;
selp.f32 %f4128, 0fBE2AAAA8, 0fBEFFFFFF, %p1119;
fma.rn.f32 %f4129, %f4127, %f1479, %f4128;
mov.f32 %f4130, 0f00000000;
fma.rn.f32 %f4131, %f1479, %f1478, %f4130;
fma.rn.f32 %f5679, %f4129, %f4131, %f1478;
and.b32 %r6229, %r8573, 2;
setp.eq.s32 %p1121, %r6229, 0;
@%p1121 bra $L__BB0_1317;
mov.f32 %f4133, 0fBF800000;
fma.rn.f32 %f5679, %f5679, %f4133, %f4130;
$L__BB0_1317:
mul.f32 %f4134, %f1385, 0f3F22F983;
cvt.rni.s32.f32 %r8577, %f4134;
cvt.rn.f32.s32 %f4135, %r8577;
mov.f32 %f4136, 0fBFC90FDA;
fma.rn.f32 %f4137, %f4135, %f4136, %f1385;
mov.f32 %f4138, 0fB3A22168;
fma.rn.f32 %f4139, %f4135, %f4138, %f4137;
mov.f32 %f4140, 0fA7C234C5;
fma.rn.f32 %f5680, %f4135, %f4140, %f4139;
abs.f32 %f1486, %f1385;
setp.ltu.f32 %p1122, %f1486, 0f47CE4780;
@%p1122 bra $L__BB0_1325;
setp.eq.f32 %p1123, %f1486, 0f7F800000;
@%p1123 bra $L__BB0_1324;
bra.uni $L__BB0_1319;
$L__BB0_1324:
mov.f32 %f4143, 0f00000000;
mul.rn.f32 %f5680, %f1385, %f4143;
mov.u32 %r8577, 0;
bra.uni $L__BB0_1325;
$L__BB0_1319:
mov.b32 %r1743, %f1385;
shr.u32 %r6231, %r1743, 23;
and.b32 %r6232, %r6231, 255;
add.s32 %r1744, %r6232, -128;
shl.b32 %r6233, %r1743, 8;
or.b32 %r1745, %r6233, -2147483648;
shr.u32 %r1746, %r1744, 5;
mov.u64 %rd2674, 0;
mov.u32 %r8574, 0;
mov.u64 %rd2004, __cudart_i2opi_f;
mov.u64 %rd2675, %rd2674;
$L__BB0_1320:
.pragma "nounroll";
shl.b64 %rd2003, %rd2674, 2;
add.s64 %rd2005, %rd2004, %rd2003;
ld.global.nc.u32 %r6234, [%rd2005];
mad.wide.u32 %rd2006, %r6234, %r1745, %rd2675;
shr.u64 %rd2675, %rd2006, 32;
add.s64 %rd2007, %rd1, %rd2003;
st.local.u32 [%rd2007], %rd2006;
add.s32 %r8574, %r8574, 1;
cvt.s64.s32 %rd2674, %r8574;
setp.ne.s32 %p1124, %r8574, 6;
@%p1124 bra $L__BB0_1320;
st.local.u32 [%rd5], %rd2675;
mov.u32 %r6235, 4;
sub.s32 %r1749, %r6235, %r1746;
mov.u32 %r6236, 6;
sub.s32 %r6237, %r6236, %r1746;
mul.wide.s32 %rd2008, %r6237, 4;
add.s64 %rd2009, %rd1, %rd2008;
ld.local.u32 %r8575, [%rd2009];
ld.local.u32 %r8576, [%rd2009+-4];
and.b32 %r1752, %r1744, 31;
setp.eq.s32 %p1125, %r1752, 0;
@%p1125 bra $L__BB0_1323;
mov.u32 %r6238, 32;
sub.s32 %r6239, %r6238, %r1752;
shr.u32 %r6240, %r8576, %r6239;
shl.b32 %r6241, %r8575, %r1752;
add.s32 %r8575, %r6240, %r6241;
mul.wide.s32 %rd2010, %r1749, 4;
add.s64 %rd2011, %rd1, %rd2010;
ld.local.u32 %r6242, [%rd2011];
shr.u32 %r6243, %r6242, %r6239;
shl.b32 %r6244, %r8576, %r1752;
add.s32 %r8576, %r6243, %r6244;
$L__BB0_1323:
and.b32 %r6245, %r1743, -2147483648;
shr.u32 %r6246, %r8576, 30;
shl.b32 %r6247, %r8575, 2;
or.b32 %r6248, %r6246, %r6247;
shr.u32 %r6249, %r6248, 31;
shr.u32 %r6250, %r8575, 30;
add.s32 %r6251, %r6249, %r6250;
neg.s32 %r6252, %r6251;
setp.eq.s32 %p1126, %r6245, 0;
selp.b32 %r8577, %r6251, %r6252, %p1126;
setp.ne.s32 %p1127, %r6249, 0;
xor.b32 %r6253, %r6245, -2147483648;
selp.b32 %r6254, %r6253, %r6245, %p1127;
selp.b32 %r6255, -1, 0, %p1127;
xor.b32 %r6256, %r6248, %r6255;
shl.b32 %r6257, %r8576, 2;
xor.b32 %r6258, %r6257, %r6255;
cvt.u64.u32 %rd2012, %r6256;
cvt.u64.u32 %rd2013, %r6258;
bfi.b64 %rd2014, %rd2012, %rd2013, 32, 32;
cvt.rn.f64.s64 %fd175, %rd2014;
mul.f64 %fd176, %fd175, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4141, %fd176;
setp.eq.s32 %p1128, %r6254, 0;
neg.f32 %f4142, %f4141;
selp.f32 %f5680, %f4141, %f4142, %p1128;
$L__BB0_1325:
add.s32 %r1759, %r8577, 1;
and.b32 %r1760, %r1759, 1;
setp.eq.s32 %p1129, %r1760, 0;
selp.f32 %f1490, %f5680, 0f3F800000, %p1129;
mul.rn.f32 %f1491, %f5680, %f5680;
mov.f32 %f5681, 0fB94D4153;
@%p1129 bra $L__BB0_1327;
mov.f32 %f4145, 0fBAB607ED;
mov.f32 %f4146, 0f37CBAC00;
fma.rn.f32 %f5681, %f4146, %f1491, %f4145;
$L__BB0_1327:
selp.f32 %f4147, 0f3C0885E4, 0f3D2AAABB, %p1129;
fma.rn.f32 %f4148, %f5681, %f1491, %f4147;
selp.f32 %f4149, 0fBE2AAAA8, 0fBEFFFFFF, %p1129;
fma.rn.f32 %f4150, %f4148, %f1491, %f4149;
mov.f32 %f4151, 0f00000000;
fma.rn.f32 %f4152, %f1491, %f1490, %f4151;
fma.rn.f32 %f5682, %f4150, %f4152, %f1490;
and.b32 %r6260, %r1759, 2;
setp.eq.s32 %p1131, %r6260, 0;
@%p1131 bra $L__BB0_1329;
mov.f32 %f4154, 0fBF800000;
fma.rn.f32 %f5682, %f5682, %f4154, %f4151;
$L__BB0_1329:
add.f32 %f5711, %f5679, %f5682;
mul.f32 %f4155, %f1394, 0f3F22F983;
cvt.rni.s32.f32 %r8581, %f4155;
cvt.rn.f32.s32 %f4156, %r8581;
mov.f32 %f4157, 0fBFC90FDA;
fma.rn.f32 %f4158, %f4156, %f4157, %f1394;
mov.f32 %f4159, 0fB3A22168;
fma.rn.f32 %f4160, %f4156, %f4159, %f4158;
mov.f32 %f4161, 0fA7C234C5;
fma.rn.f32 %f5683, %f4156, %f4161, %f4160;
abs.f32 %f1499, %f1394;
setp.ltu.f32 %p1132, %f1499, 0f47CE4780;
@%p1132 bra $L__BB0_1337;
setp.eq.f32 %p1133, %f1499, 0f7F800000;
@%p1133 bra $L__BB0_1336;
bra.uni $L__BB0_1331;
$L__BB0_1336:
mov.f32 %f4164, 0f00000000;
mul.rn.f32 %f5683, %f1394, %f4164;
mov.u32 %r8581, 0;
bra.uni $L__BB0_1337;
$L__BB0_1331:
mov.b32 %r1762, %f1394;
shr.u32 %r6262, %r1762, 23;
and.b32 %r6263, %r6262, 255;
add.s32 %r1763, %r6263, -128;
shl.b32 %r6264, %r1762, 8;
or.b32 %r1764, %r6264, -2147483648;
shr.u32 %r1765, %r1763, 5;
mov.u64 %rd2678, 0;
mov.u32 %r8578, 0;
mov.u64 %rd2676, __cudart_i2opi_f;
mov.u64 %rd2677, %rd1;
$L__BB0_1332:
.pragma "nounroll";
ld.global.nc.u32 %r6265, [%rd2676];
mad.wide.u32 %rd2017, %r6265, %r1764, %rd2678;
shr.u64 %rd2678, %rd2017, 32;
st.local.u32 [%rd2677], %rd2017;
add.s64 %rd2677, %rd2677, 4;
add.s64 %rd2676, %rd2676, 4;
add.s32 %r8578, %r8578, 1;
setp.ne.s32 %p1134, %r8578, 6;
@%p1134 bra $L__BB0_1332;
st.local.u32 [%rd5], %rd2678;
mov.u32 %r6266, 4;
sub.s32 %r1768, %r6266, %r1765;
mov.u32 %r6267, 6;
sub.s32 %r6268, %r6267, %r1765;
mul.wide.s32 %rd2018, %r6268, 4;
add.s64 %rd2019, %rd1, %rd2018;
ld.local.u32 %r8579, [%rd2019];
ld.local.u32 %r8580, [%rd2019+-4];
and.b32 %r1771, %r1763, 31;
setp.eq.s32 %p1135, %r1771, 0;
@%p1135 bra $L__BB0_1335;
mov.u32 %r6269, 32;
sub.s32 %r6270, %r6269, %r1771;
shr.u32 %r6271, %r8580, %r6270;
shl.b32 %r6272, %r8579, %r1771;
add.s32 %r8579, %r6271, %r6272;
mul.wide.s32 %rd2020, %r1768, 4;
add.s64 %rd2021, %rd1, %rd2020;
ld.local.u32 %r6273, [%rd2021];
shr.u32 %r6274, %r6273, %r6270;
shl.b32 %r6275, %r8580, %r1771;
add.s32 %r8580, %r6274, %r6275;
$L__BB0_1335:
and.b32 %r6276, %r1762, -2147483648;
shr.u32 %r6277, %r8580, 30;
shl.b32 %r6278, %r8579, 2;
or.b32 %r6279, %r6277, %r6278;
shr.u32 %r6280, %r6279, 31;
shr.u32 %r6281, %r8579, 30;
add.s32 %r6282, %r6280, %r6281;
neg.s32 %r6283, %r6282;
setp.eq.s32 %p1136, %r6276, 0;
selp.b32 %r8581, %r6282, %r6283, %p1136;
setp.ne.s32 %p1137, %r6280, 0;
xor.b32 %r6284, %r6276, -2147483648;
selp.b32 %r6285, %r6284, %r6276, %p1137;
selp.b32 %r6286, -1, 0, %p1137;
xor.b32 %r6287, %r6279, %r6286;
shl.b32 %r6288, %r8580, 2;
xor.b32 %r6289, %r6288, %r6286;
cvt.u64.u32 %rd2022, %r6287;
cvt.u64.u32 %rd2023, %r6289;
bfi.b64 %rd2024, %rd2022, %rd2023, 32, 32;
cvt.rn.f64.s64 %fd177, %rd2024;
mul.f64 %fd178, %fd177, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4162, %fd178;
setp.eq.s32 %p1138, %r6285, 0;
neg.f32 %f4163, %f4162;
selp.f32 %f5683, %f4162, %f4163, %p1138;
$L__BB0_1337:
and.b32 %r1778, %r8581, 1;
setp.eq.s32 %p1139, %r1778, 0;
selp.f32 %f1503, %f5683, 0f3F800000, %p1139;
mul.rn.f32 %f1504, %f5683, %f5683;
mov.f32 %f5684, 0fB94D4153;
@%p1139 bra $L__BB0_1339;
mov.f32 %f4166, 0fBAB607ED;
mov.f32 %f4167, 0f37CBAC00;
fma.rn.f32 %f5684, %f4167, %f1504, %f4166;
$L__BB0_1339:
selp.f32 %f4168, 0f3C0885E4, 0f3D2AAABB, %p1139;
fma.rn.f32 %f4169, %f5684, %f1504, %f4168;
selp.f32 %f4170, 0fBE2AAAA8, 0fBEFFFFFF, %p1139;
fma.rn.f32 %f4171, %f4169, %f1504, %f4170;
mov.f32 %f4172, 0f00000000;
fma.rn.f32 %f4173, %f1504, %f1503, %f4172;
fma.rn.f32 %f5685, %f4171, %f4173, %f1503;
and.b32 %r6291, %r8581, 2;
setp.eq.s32 %p1141, %r6291, 0;
@%p1141 bra $L__BB0_1341;
mov.f32 %f4175, 0fBF800000;
fma.rn.f32 %f5685, %f5685, %f4175, %f4172;
$L__BB0_1341:
mul.f32 %f4176, %f1386, 0f3F22F983;
cvt.rni.s32.f32 %r8585, %f4176;
cvt.rn.f32.s32 %f4177, %r8585;
mov.f32 %f4178, 0fBFC90FDA;
fma.rn.f32 %f4179, %f4177, %f4178, %f1386;
mov.f32 %f4180, 0fB3A22168;
fma.rn.f32 %f4181, %f4177, %f4180, %f4179;
mov.f32 %f4182, 0fA7C234C5;
fma.rn.f32 %f5686, %f4177, %f4182, %f4181;
abs.f32 %f1511, %f1386;
setp.ltu.f32 %p1142, %f1511, 0f47CE4780;
@%p1142 bra $L__BB0_1349;
setp.eq.f32 %p1143, %f1511, 0f7F800000;
@%p1143 bra $L__BB0_1348;
bra.uni $L__BB0_1343;
$L__BB0_1348:
mov.f32 %f4185, 0f00000000;
mul.rn.f32 %f5686, %f1386, %f4185;
mov.u32 %r8585, 0;
bra.uni $L__BB0_1349;
$L__BB0_1343:
mov.b32 %r1780, %f1386;
shr.u32 %r6293, %r1780, 23;
and.b32 %r6294, %r6293, 255;
add.s32 %r1781, %r6294, -128;
shl.b32 %r6295, %r1780, 8;
or.b32 %r1782, %r6295, -2147483648;
shr.u32 %r1783, %r1781, 5;
mov.u64 %rd2681, 0;
mov.u32 %r8582, 0;
mov.u64 %rd2679, __cudart_i2opi_f;
mov.u64 %rd2680, %rd1;
$L__BB0_1344:
.pragma "nounroll";
ld.global.nc.u32 %r6296, [%rd2679];
mad.wide.u32 %rd2027, %r6296, %r1782, %rd2681;
shr.u64 %rd2681, %rd2027, 32;
st.local.u32 [%rd2680], %rd2027;
add.s64 %rd2680, %rd2680, 4;
add.s64 %rd2679, %rd2679, 4;
add.s32 %r8582, %r8582, 1;
setp.ne.s32 %p1144, %r8582, 6;
@%p1144 bra $L__BB0_1344;
st.local.u32 [%rd5], %rd2681;
mov.u32 %r6297, 4;
sub.s32 %r1786, %r6297, %r1783;
mov.u32 %r6298, 6;
sub.s32 %r6299, %r6298, %r1783;
mul.wide.s32 %rd2028, %r6299, 4;
add.s64 %rd2029, %rd1, %rd2028;
ld.local.u32 %r8583, [%rd2029];
ld.local.u32 %r8584, [%rd2029+-4];
and.b32 %r1789, %r1781, 31;
setp.eq.s32 %p1145, %r1789, 0;
@%p1145 bra $L__BB0_1347;
mov.u32 %r6300, 32;
sub.s32 %r6301, %r6300, %r1789;
shr.u32 %r6302, %r8584, %r6301;
shl.b32 %r6303, %r8583, %r1789;
add.s32 %r8583, %r6302, %r6303;
mul.wide.s32 %rd2030, %r1786, 4;
add.s64 %rd2031, %rd1, %rd2030;
ld.local.u32 %r6304, [%rd2031];
shr.u32 %r6305, %r6304, %r6301;
shl.b32 %r6306, %r8584, %r1789;
add.s32 %r8584, %r6305, %r6306;
$L__BB0_1347:
and.b32 %r6307, %r1780, -2147483648;
shr.u32 %r6308, %r8584, 30;
shl.b32 %r6309, %r8583, 2;
or.b32 %r6310, %r6308, %r6309;
shr.u32 %r6311, %r6310, 31;
shr.u32 %r6312, %r8583, 30;
add.s32 %r6313, %r6311, %r6312;
neg.s32 %r6314, %r6313;
setp.eq.s32 %p1146, %r6307, 0;
selp.b32 %r8585, %r6313, %r6314, %p1146;
setp.ne.s32 %p1147, %r6311, 0;
xor.b32 %r6315, %r6307, -2147483648;
selp.b32 %r6316, %r6315, %r6307, %p1147;
selp.b32 %r6317, -1, 0, %p1147;
xor.b32 %r6318, %r6310, %r6317;
shl.b32 %r6319, %r8584, 2;
xor.b32 %r6320, %r6319, %r6317;
cvt.u64.u32 %rd2032, %r6318;
cvt.u64.u32 %rd2033, %r6320;
bfi.b64 %rd2034, %rd2032, %rd2033, 32, 32;
cvt.rn.f64.s64 %fd179, %rd2034;
mul.f64 %fd180, %fd179, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4183, %fd180;
setp.eq.s32 %p1148, %r6316, 0;
neg.f32 %f4184, %f4183;
selp.f32 %f5686, %f4183, %f4184, %p1148;
$L__BB0_1349:
add.s32 %r1796, %r8585, 1;
and.b32 %r1797, %r1796, 1;
setp.eq.s32 %p1149, %r1797, 0;
selp.f32 %f1515, %f5686, 0f3F800000, %p1149;
mul.rn.f32 %f1516, %f5686, %f5686;
mov.f32 %f5687, 0fB94D4153;
@%p1149 bra $L__BB0_1351;
mov.f32 %f4187, 0fBAB607ED;
mov.f32 %f4188, 0f37CBAC00;
fma.rn.f32 %f5687, %f4188, %f1516, %f4187;
$L__BB0_1351:
selp.f32 %f4189, 0f3C0885E4, 0f3D2AAABB, %p1149;
fma.rn.f32 %f4190, %f5687, %f1516, %f4189;
selp.f32 %f4191, 0fBE2AAAA8, 0fBEFFFFFF, %p1149;
fma.rn.f32 %f4192, %f4190, %f1516, %f4191;
mov.f32 %f4193, 0f00000000;
fma.rn.f32 %f4194, %f1516, %f1515, %f4193;
fma.rn.f32 %f5688, %f4192, %f4194, %f1515;
and.b32 %r6322, %r1796, 2;
setp.eq.s32 %p1151, %r6322, 0;
@%p1151 bra $L__BB0_1353;
mov.f32 %f4196, 0fBF800000;
fma.rn.f32 %f5688, %f5688, %f4196, %f4193;
$L__BB0_1353:
add.f32 %f5710, %f5685, %f5688;
mul.f32 %f4197, %f1395, 0f3F22F983;
cvt.rni.s32.f32 %r8589, %f4197;
cvt.rn.f32.s32 %f4198, %r8589;
mov.f32 %f4199, 0fBFC90FDA;
fma.rn.f32 %f4200, %f4198, %f4199, %f1395;
mov.f32 %f4201, 0fB3A22168;
fma.rn.f32 %f4202, %f4198, %f4201, %f4200;
mov.f32 %f4203, 0fA7C234C5;
fma.rn.f32 %f5689, %f4198, %f4203, %f4202;
abs.f32 %f1524, %f1395;
setp.ltu.f32 %p1152, %f1524, 0f47CE4780;
@%p1152 bra $L__BB0_1361;
setp.eq.f32 %p1153, %f1524, 0f7F800000;
@%p1153 bra $L__BB0_1360;
bra.uni $L__BB0_1355;
$L__BB0_1360:
mov.f32 %f4206, 0f00000000;
mul.rn.f32 %f5689, %f1395, %f4206;
mov.u32 %r8589, 0;
bra.uni $L__BB0_1361;
$L__BB0_1355:
mov.b32 %r1799, %f1395;
shr.u32 %r6324, %r1799, 23;
and.b32 %r6325, %r6324, 255;
add.s32 %r1800, %r6325, -128;
shl.b32 %r6326, %r1799, 8;
or.b32 %r1801, %r6326, -2147483648;
shr.u32 %r1802, %r1800, 5;
mov.u64 %rd2684, 0;
mov.u32 %r8586, 0;
mov.u64 %rd2682, __cudart_i2opi_f;
mov.u64 %rd2683, %rd1;
$L__BB0_1356:
.pragma "nounroll";
ld.global.nc.u32 %r6327, [%rd2682];
mad.wide.u32 %rd2037, %r6327, %r1801, %rd2684;
shr.u64 %rd2684, %rd2037, 32;
st.local.u32 [%rd2683], %rd2037;
add.s64 %rd2683, %rd2683, 4;
add.s64 %rd2682, %rd2682, 4;
add.s32 %r8586, %r8586, 1;
setp.ne.s32 %p1154, %r8586, 6;
@%p1154 bra $L__BB0_1356;
st.local.u32 [%rd5], %rd2684;
mov.u32 %r6328, 4;
sub.s32 %r1805, %r6328, %r1802;
mov.u32 %r6329, 6;
sub.s32 %r6330, %r6329, %r1802;
mul.wide.s32 %rd2038, %r6330, 4;
add.s64 %rd2039, %rd1, %rd2038;
ld.local.u32 %r8587, [%rd2039];
ld.local.u32 %r8588, [%rd2039+-4];
and.b32 %r1808, %r1800, 31;
setp.eq.s32 %p1155, %r1808, 0;
@%p1155 bra $L__BB0_1359;
mov.u32 %r6331, 32;
sub.s32 %r6332, %r6331, %r1808;
shr.u32 %r6333, %r8588, %r6332;
shl.b32 %r6334, %r8587, %r1808;
add.s32 %r8587, %r6333, %r6334;
mul.wide.s32 %rd2040, %r1805, 4;
add.s64 %rd2041, %rd1, %rd2040;
ld.local.u32 %r6335, [%rd2041];
shr.u32 %r6336, %r6335, %r6332;
shl.b32 %r6337, %r8588, %r1808;
add.s32 %r8588, %r6336, %r6337;
$L__BB0_1359:
and.b32 %r6338, %r1799, -2147483648;
shr.u32 %r6339, %r8588, 30;
shl.b32 %r6340, %r8587, 2;
or.b32 %r6341, %r6339, %r6340;
shr.u32 %r6342, %r6341, 31;
shr.u32 %r6343, %r8587, 30;
add.s32 %r6344, %r6342, %r6343;
neg.s32 %r6345, %r6344;
setp.eq.s32 %p1156, %r6338, 0;
selp.b32 %r8589, %r6344, %r6345, %p1156;
setp.ne.s32 %p1157, %r6342, 0;
xor.b32 %r6346, %r6338, -2147483648;
selp.b32 %r6347, %r6346, %r6338, %p1157;
selp.b32 %r6348, -1, 0, %p1157;
xor.b32 %r6349, %r6341, %r6348;
shl.b32 %r6350, %r8588, 2;
xor.b32 %r6351, %r6350, %r6348;
cvt.u64.u32 %rd2042, %r6349;
cvt.u64.u32 %rd2043, %r6351;
bfi.b64 %rd2044, %rd2042, %rd2043, 32, 32;
cvt.rn.f64.s64 %fd181, %rd2044;
mul.f64 %fd182, %fd181, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4204, %fd182;
setp.eq.s32 %p1158, %r6347, 0;
neg.f32 %f4205, %f4204;
selp.f32 %f5689, %f4204, %f4205, %p1158;
$L__BB0_1361:
and.b32 %r1815, %r8589, 1;
setp.eq.s32 %p1159, %r1815, 0;
selp.f32 %f1528, %f5689, 0f3F800000, %p1159;
mul.rn.f32 %f1529, %f5689, %f5689;
mov.f32 %f5690, 0fB94D4153;
@%p1159 bra $L__BB0_1363;
mov.f32 %f4208, 0fBAB607ED;
mov.f32 %f4209, 0f37CBAC00;
fma.rn.f32 %f5690, %f4209, %f1529, %f4208;
$L__BB0_1363:
selp.f32 %f4210, 0f3C0885E4, 0f3D2AAABB, %p1159;
fma.rn.f32 %f4211, %f5690, %f1529, %f4210;
selp.f32 %f4212, 0fBE2AAAA8, 0fBEFFFFFF, %p1159;
fma.rn.f32 %f4213, %f4211, %f1529, %f4212;
mov.f32 %f4214, 0f00000000;
fma.rn.f32 %f4215, %f1529, %f1528, %f4214;
fma.rn.f32 %f5691, %f4213, %f4215, %f1528;
and.b32 %r6353, %r8589, 2;
setp.eq.s32 %p1161, %r6353, 0;
@%p1161 bra $L__BB0_1365;
mov.f32 %f4217, 0fBF800000;
fma.rn.f32 %f5691, %f5691, %f4217, %f4214;
$L__BB0_1365:
mul.f32 %f4218, %f1387, 0f3F22F983;
cvt.rni.s32.f32 %r8593, %f4218;
cvt.rn.f32.s32 %f4219, %r8593;
mov.f32 %f4220, 0fBFC90FDA;
fma.rn.f32 %f4221, %f4219, %f4220, %f1387;
mov.f32 %f4222, 0fB3A22168;
fma.rn.f32 %f4223, %f4219, %f4222, %f4221;
mov.f32 %f4224, 0fA7C234C5;
fma.rn.f32 %f5692, %f4219, %f4224, %f4223;
abs.f32 %f1536, %f1387;
setp.ltu.f32 %p1162, %f1536, 0f47CE4780;
@%p1162 bra $L__BB0_1373;
setp.eq.f32 %p1163, %f1536, 0f7F800000;
@%p1163 bra $L__BB0_1372;
bra.uni $L__BB0_1367;
$L__BB0_1372:
mov.f32 %f4227, 0f00000000;
mul.rn.f32 %f5692, %f1387, %f4227;
mov.u32 %r8593, 0;
bra.uni $L__BB0_1373;
$L__BB0_1367:
mov.b32 %r1817, %f1387;
shr.u32 %r6355, %r1817, 23;
and.b32 %r6356, %r6355, 255;
add.s32 %r1818, %r6356, -128;
shl.b32 %r6357, %r1817, 8;
or.b32 %r1819, %r6357, -2147483648;
shr.u32 %r1820, %r1818, 5;
mov.u64 %rd2687, 0;
mov.u32 %r8590, 0;
mov.u64 %rd2685, __cudart_i2opi_f;
mov.u64 %rd2686, %rd1;
$L__BB0_1368:
.pragma "nounroll";
ld.global.nc.u32 %r6358, [%rd2685];
mad.wide.u32 %rd2047, %r6358, %r1819, %rd2687;
shr.u64 %rd2687, %rd2047, 32;
st.local.u32 [%rd2686], %rd2047;
add.s64 %rd2686, %rd2686, 4;
add.s64 %rd2685, %rd2685, 4;
add.s32 %r8590, %r8590, 1;
setp.ne.s32 %p1164, %r8590, 6;
@%p1164 bra $L__BB0_1368;
st.local.u32 [%rd5], %rd2687;
mov.u32 %r6359, 4;
sub.s32 %r1823, %r6359, %r1820;
mov.u32 %r6360, 6;
sub.s32 %r6361, %r6360, %r1820;
mul.wide.s32 %rd2048, %r6361, 4;
add.s64 %rd2049, %rd1, %rd2048;
ld.local.u32 %r8591, [%rd2049];
ld.local.u32 %r8592, [%rd2049+-4];
and.b32 %r1826, %r1818, 31;
setp.eq.s32 %p1165, %r1826, 0;
@%p1165 bra $L__BB0_1371;
mov.u32 %r6362, 32;
sub.s32 %r6363, %r6362, %r1826;
shr.u32 %r6364, %r8592, %r6363;
shl.b32 %r6365, %r8591, %r1826;
add.s32 %r8591, %r6364, %r6365;
mul.wide.s32 %rd2050, %r1823, 4;
add.s64 %rd2051, %rd1, %rd2050;
ld.local.u32 %r6366, [%rd2051];
shr.u32 %r6367, %r6366, %r6363;
shl.b32 %r6368, %r8592, %r1826;
add.s32 %r8592, %r6367, %r6368;
$L__BB0_1371:
and.b32 %r6369, %r1817, -2147483648;
shr.u32 %r6370, %r8592, 30;
shl.b32 %r6371, %r8591, 2;
or.b32 %r6372, %r6370, %r6371;
shr.u32 %r6373, %r6372, 31;
shr.u32 %r6374, %r8591, 30;
add.s32 %r6375, %r6373, %r6374;
neg.s32 %r6376, %r6375;
setp.eq.s32 %p1166, %r6369, 0;
selp.b32 %r8593, %r6375, %r6376, %p1166;
setp.ne.s32 %p1167, %r6373, 0;
xor.b32 %r6377, %r6369, -2147483648;
selp.b32 %r6378, %r6377, %r6369, %p1167;
selp.b32 %r6379, -1, 0, %p1167;
xor.b32 %r6380, %r6372, %r6379;
shl.b32 %r6381, %r8592, 2;
xor.b32 %r6382, %r6381, %r6379;
cvt.u64.u32 %rd2052, %r6380;
cvt.u64.u32 %rd2053, %r6382;
bfi.b64 %rd2054, %rd2052, %rd2053, 32, 32;
cvt.rn.f64.s64 %fd183, %rd2054;
mul.f64 %fd184, %fd183, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4225, %fd184;
setp.eq.s32 %p1168, %r6378, 0;
neg.f32 %f4226, %f4225;
selp.f32 %f5692, %f4225, %f4226, %p1168;
$L__BB0_1373:
add.s32 %r1833, %r8593, 1;
and.b32 %r1834, %r1833, 1;
setp.eq.s32 %p1169, %r1834, 0;
selp.f32 %f1540, %f5692, 0f3F800000, %p1169;
mul.rn.f32 %f1541, %f5692, %f5692;
mov.f32 %f5693, 0fB94D4153;
@%p1169 bra $L__BB0_1375;
mov.f32 %f4229, 0fBAB607ED;
mov.f32 %f4230, 0f37CBAC00;
fma.rn.f32 %f5693, %f4230, %f1541, %f4229;
$L__BB0_1375:
selp.f32 %f4231, 0f3C0885E4, 0f3D2AAABB, %p1169;
fma.rn.f32 %f4232, %f5693, %f1541, %f4231;
selp.f32 %f4233, 0fBE2AAAA8, 0fBEFFFFFF, %p1169;
fma.rn.f32 %f4234, %f4232, %f1541, %f4233;
mov.f32 %f4235, 0f00000000;
fma.rn.f32 %f4236, %f1541, %f1540, %f4235;
fma.rn.f32 %f5694, %f4234, %f4236, %f1540;
and.b32 %r6384, %r1833, 2;
setp.eq.s32 %p1171, %r6384, 0;
@%p1171 bra $L__BB0_1377;
mov.f32 %f4238, 0fBF800000;
fma.rn.f32 %f5694, %f5694, %f4238, %f4235;
$L__BB0_1377:
add.f32 %f5709, %f5691, %f5694;
mul.f32 %f4239, %f1396, 0f3F22F983;
cvt.rni.s32.f32 %r8597, %f4239;
cvt.rn.f32.s32 %f4240, %r8597;
mov.f32 %f4241, 0fBFC90FDA;
fma.rn.f32 %f4242, %f4240, %f4241, %f1396;
mov.f32 %f4243, 0fB3A22168;
fma.rn.f32 %f4244, %f4240, %f4243, %f4242;
mov.f32 %f4245, 0fA7C234C5;
fma.rn.f32 %f5695, %f4240, %f4245, %f4244;
abs.f32 %f1549, %f1396;
setp.ltu.f32 %p1172, %f1549, 0f47CE4780;
@%p1172 bra $L__BB0_1385;
setp.eq.f32 %p1173, %f1549, 0f7F800000;
@%p1173 bra $L__BB0_1384;
bra.uni $L__BB0_1379;
$L__BB0_1384:
mov.f32 %f4248, 0f00000000;
mul.rn.f32 %f5695, %f1396, %f4248;
mov.u32 %r8597, 0;
bra.uni $L__BB0_1385;
$L__BB0_1379:
mov.b32 %r1836, %f1396;
shr.u32 %r6386, %r1836, 23;
and.b32 %r6387, %r6386, 255;
add.s32 %r1837, %r6387, -128;
shl.b32 %r6388, %r1836, 8;
or.b32 %r1838, %r6388, -2147483648;
shr.u32 %r1839, %r1837, 5;
mov.u64 %rd2690, 0;
mov.u32 %r8594, 0;
mov.u64 %rd2688, __cudart_i2opi_f;
mov.u64 %rd2689, %rd1;
$L__BB0_1380:
.pragma "nounroll";
ld.global.nc.u32 %r6389, [%rd2688];
mad.wide.u32 %rd2057, %r6389, %r1838, %rd2690;
shr.u64 %rd2690, %rd2057, 32;
st.local.u32 [%rd2689], %rd2057;
add.s64 %rd2689, %rd2689, 4;
add.s64 %rd2688, %rd2688, 4;
add.s32 %r8594, %r8594, 1;
setp.ne.s32 %p1174, %r8594, 6;
@%p1174 bra $L__BB0_1380;
st.local.u32 [%rd5], %rd2690;
mov.u32 %r6390, 4;
sub.s32 %r1842, %r6390, %r1839;
mov.u32 %r6391, 6;
sub.s32 %r6392, %r6391, %r1839;
mul.wide.s32 %rd2058, %r6392, 4;
add.s64 %rd2059, %rd1, %rd2058;
ld.local.u32 %r8595, [%rd2059];
ld.local.u32 %r8596, [%rd2059+-4];
and.b32 %r1845, %r1837, 31;
setp.eq.s32 %p1175, %r1845, 0;
@%p1175 bra $L__BB0_1383;
mov.u32 %r6393, 32;
sub.s32 %r6394, %r6393, %r1845;
shr.u32 %r6395, %r8596, %r6394;
shl.b32 %r6396, %r8595, %r1845;
add.s32 %r8595, %r6395, %r6396;
mul.wide.s32 %rd2060, %r1842, 4;
add.s64 %rd2061, %rd1, %rd2060;
ld.local.u32 %r6397, [%rd2061];
shr.u32 %r6398, %r6397, %r6394;
shl.b32 %r6399, %r8596, %r1845;
add.s32 %r8596, %r6398, %r6399;
$L__BB0_1383:
and.b32 %r6400, %r1836, -2147483648;
shr.u32 %r6401, %r8596, 30;
shl.b32 %r6402, %r8595, 2;
or.b32 %r6403, %r6401, %r6402;
shr.u32 %r6404, %r6403, 31;
shr.u32 %r6405, %r8595, 30;
add.s32 %r6406, %r6404, %r6405;
neg.s32 %r6407, %r6406;
setp.eq.s32 %p1176, %r6400, 0;
selp.b32 %r8597, %r6406, %r6407, %p1176;
setp.ne.s32 %p1177, %r6404, 0;
xor.b32 %r6408, %r6400, -2147483648;
selp.b32 %r6409, %r6408, %r6400, %p1177;
selp.b32 %r6410, -1, 0, %p1177;
xor.b32 %r6411, %r6403, %r6410;
shl.b32 %r6412, %r8596, 2;
xor.b32 %r6413, %r6412, %r6410;
cvt.u64.u32 %rd2062, %r6411;
cvt.u64.u32 %rd2063, %r6413;
bfi.b64 %rd2064, %rd2062, %rd2063, 32, 32;
cvt.rn.f64.s64 %fd185, %rd2064;
mul.f64 %fd186, %fd185, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4246, %fd186;
setp.eq.s32 %p1178, %r6409, 0;
neg.f32 %f4247, %f4246;
selp.f32 %f5695, %f4246, %f4247, %p1178;
$L__BB0_1385:
and.b32 %r1852, %r8597, 1;
setp.eq.s32 %p1179, %r1852, 0;
selp.f32 %f1553, %f5695, 0f3F800000, %p1179;
mul.rn.f32 %f1554, %f5695, %f5695;
mov.f32 %f5696, 0fB94D4153;
@%p1179 bra $L__BB0_1387;
mov.f32 %f4250, 0fBAB607ED;
mov.f32 %f4251, 0f37CBAC00;
fma.rn.f32 %f5696, %f4251, %f1554, %f4250;
$L__BB0_1387:
selp.f32 %f4252, 0f3C0885E4, 0f3D2AAABB, %p1179;
fma.rn.f32 %f4253, %f5696, %f1554, %f4252;
selp.f32 %f4254, 0fBE2AAAA8, 0fBEFFFFFF, %p1179;
fma.rn.f32 %f4255, %f4253, %f1554, %f4254;
mov.f32 %f4256, 0f00000000;
fma.rn.f32 %f4257, %f1554, %f1553, %f4256;
fma.rn.f32 %f5697, %f4255, %f4257, %f1553;
and.b32 %r6415, %r8597, 2;
setp.eq.s32 %p1181, %r6415, 0;
@%p1181 bra $L__BB0_1389;
mov.f32 %f4259, 0fBF800000;
fma.rn.f32 %f5697, %f5697, %f4259, %f4256;
$L__BB0_1389:
mul.f32 %f4260, %f1388, 0f3F22F983;
cvt.rni.s32.f32 %r8601, %f4260;
cvt.rn.f32.s32 %f4261, %r8601;
mov.f32 %f4262, 0fBFC90FDA;
fma.rn.f32 %f4263, %f4261, %f4262, %f1388;
mov.f32 %f4264, 0fB3A22168;
fma.rn.f32 %f4265, %f4261, %f4264, %f4263;
mov.f32 %f4266, 0fA7C234C5;
fma.rn.f32 %f5698, %f4261, %f4266, %f4265;
abs.f32 %f1561, %f1388;
setp.ltu.f32 %p1182, %f1561, 0f47CE4780;
@%p1182 bra $L__BB0_1397;
setp.eq.f32 %p1183, %f1561, 0f7F800000;
@%p1183 bra $L__BB0_1396;
bra.uni $L__BB0_1391;
$L__BB0_1396:
mov.f32 %f4269, 0f00000000;
mul.rn.f32 %f5698, %f1388, %f4269;
mov.u32 %r8601, 0;
bra.uni $L__BB0_1397;
$L__BB0_1391:
mov.b32 %r1854, %f1388;
shr.u32 %r6417, %r1854, 23;
and.b32 %r6418, %r6417, 255;
add.s32 %r1855, %r6418, -128;
shl.b32 %r6419, %r1854, 8;
or.b32 %r1856, %r6419, -2147483648;
shr.u32 %r1857, %r1855, 5;
mov.u64 %rd2693, 0;
mov.u32 %r8598, 0;
mov.u64 %rd2691, __cudart_i2opi_f;
mov.u64 %rd2692, %rd1;
$L__BB0_1392:
.pragma "nounroll";
ld.global.nc.u32 %r6420, [%rd2691];
mad.wide.u32 %rd2067, %r6420, %r1856, %rd2693;
shr.u64 %rd2693, %rd2067, 32;
st.local.u32 [%rd2692], %rd2067;
add.s64 %rd2692, %rd2692, 4;
add.s64 %rd2691, %rd2691, 4;
add.s32 %r8598, %r8598, 1;
setp.ne.s32 %p1184, %r8598, 6;
@%p1184 bra $L__BB0_1392;
st.local.u32 [%rd5], %rd2693;
mov.u32 %r6421, 4;
sub.s32 %r1860, %r6421, %r1857;
mov.u32 %r6422, 6;
sub.s32 %r6423, %r6422, %r1857;
mul.wide.s32 %rd2068, %r6423, 4;
add.s64 %rd2069, %rd1, %rd2068;
ld.local.u32 %r8599, [%rd2069];
ld.local.u32 %r8600, [%rd2069+-4];
and.b32 %r1863, %r1855, 31;
setp.eq.s32 %p1185, %r1863, 0;
@%p1185 bra $L__BB0_1395;
mov.u32 %r6424, 32;
sub.s32 %r6425, %r6424, %r1863;
shr.u32 %r6426, %r8600, %r6425;
shl.b32 %r6427, %r8599, %r1863;
add.s32 %r8599, %r6426, %r6427;
mul.wide.s32 %rd2070, %r1860, 4;
add.s64 %rd2071, %rd1, %rd2070;
ld.local.u32 %r6428, [%rd2071];
shr.u32 %r6429, %r6428, %r6425;
shl.b32 %r6430, %r8600, %r1863;
add.s32 %r8600, %r6429, %r6430;
$L__BB0_1395:
and.b32 %r6431, %r1854, -2147483648;
shr.u32 %r6432, %r8600, 30;
shl.b32 %r6433, %r8599, 2;
or.b32 %r6434, %r6432, %r6433;
shr.u32 %r6435, %r6434, 31;
shr.u32 %r6436, %r8599, 30;
add.s32 %r6437, %r6435, %r6436;
neg.s32 %r6438, %r6437;
setp.eq.s32 %p1186, %r6431, 0;
selp.b32 %r8601, %r6437, %r6438, %p1186;
setp.ne.s32 %p1187, %r6435, 0;
xor.b32 %r6439, %r6431, -2147483648;
selp.b32 %r6440, %r6439, %r6431, %p1187;
selp.b32 %r6441, -1, 0, %p1187;
xor.b32 %r6442, %r6434, %r6441;
shl.b32 %r6443, %r8600, 2;
xor.b32 %r6444, %r6443, %r6441;
cvt.u64.u32 %rd2072, %r6442;
cvt.u64.u32 %rd2073, %r6444;
bfi.b64 %rd2074, %rd2072, %rd2073, 32, 32;
cvt.rn.f64.s64 %fd187, %rd2074;
mul.f64 %fd188, %fd187, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4267, %fd188;
setp.eq.s32 %p1188, %r6440, 0;
neg.f32 %f4268, %f4267;
selp.f32 %f5698, %f4267, %f4268, %p1188;
$L__BB0_1397:
add.s32 %r1870, %r8601, 1;
and.b32 %r1871, %r1870, 1;
setp.eq.s32 %p1189, %r1871, 0;
selp.f32 %f1565, %f5698, 0f3F800000, %p1189;
mul.rn.f32 %f1566, %f5698, %f5698;
mov.f32 %f5699, 0fB94D4153;
@%p1189 bra $L__BB0_1399;
mov.f32 %f4271, 0fBAB607ED;
mov.f32 %f4272, 0f37CBAC00;
fma.rn.f32 %f5699, %f4272, %f1566, %f4271;
$L__BB0_1399:
selp.f32 %f4273, 0f3C0885E4, 0f3D2AAABB, %p1189;
fma.rn.f32 %f4274, %f5699, %f1566, %f4273;
selp.f32 %f4275, 0fBE2AAAA8, 0fBEFFFFFF, %p1189;
fma.rn.f32 %f4276, %f4274, %f1566, %f4275;
mov.f32 %f4277, 0f00000000;
fma.rn.f32 %f4278, %f1566, %f1565, %f4277;
fma.rn.f32 %f5700, %f4276, %f4278, %f1565;
and.b32 %r6446, %r1870, 2;
setp.eq.s32 %p1191, %r6446, 0;
@%p1191 bra $L__BB0_1401;
mov.f32 %f4280, 0fBF800000;
fma.rn.f32 %f5700, %f5700, %f4280, %f4277;
$L__BB0_1401:
add.f32 %f5708, %f5697, %f5700;
mul.f32 %f4281, %f1397, 0f3F22F983;
cvt.rni.s32.f32 %r8605, %f4281;
cvt.rn.f32.s32 %f4282, %r8605;
mov.f32 %f4283, 0fBFC90FDA;
fma.rn.f32 %f4284, %f4282, %f4283, %f1397;
mov.f32 %f4285, 0fB3A22168;
fma.rn.f32 %f4286, %f4282, %f4285, %f4284;
mov.f32 %f4287, 0fA7C234C5;
fma.rn.f32 %f5701, %f4282, %f4287, %f4286;
abs.f32 %f1574, %f1397;
setp.ltu.f32 %p1192, %f1574, 0f47CE4780;
@%p1192 bra $L__BB0_1409;
setp.eq.f32 %p1193, %f1574, 0f7F800000;
@%p1193 bra $L__BB0_1408;
bra.uni $L__BB0_1403;
$L__BB0_1408:
mov.f32 %f4290, 0f00000000;
mul.rn.f32 %f5701, %f1397, %f4290;
mov.u32 %r8605, 0;
bra.uni $L__BB0_1409;
$L__BB0_1403:
mov.b32 %r1873, %f1397;
shr.u32 %r6448, %r1873, 23;
and.b32 %r6449, %r6448, 255;
add.s32 %r1874, %r6449, -128;
shl.b32 %r6450, %r1873, 8;
or.b32 %r1875, %r6450, -2147483648;
shr.u32 %r1876, %r1874, 5;
mov.u64 %rd2696, 0;
mov.u32 %r8602, 0;
mov.u64 %rd2694, __cudart_i2opi_f;
mov.u64 %rd2695, %rd1;
$L__BB0_1404:
.pragma "nounroll";
ld.global.nc.u32 %r6451, [%rd2694];
mad.wide.u32 %rd2077, %r6451, %r1875, %rd2696;
shr.u64 %rd2696, %rd2077, 32;
st.local.u32 [%rd2695], %rd2077;
add.s64 %rd2695, %rd2695, 4;
add.s64 %rd2694, %rd2694, 4;
add.s32 %r8602, %r8602, 1;
setp.ne.s32 %p1194, %r8602, 6;
@%p1194 bra $L__BB0_1404;
st.local.u32 [%rd5], %rd2696;
mov.u32 %r6452, 4;
sub.s32 %r1879, %r6452, %r1876;
mov.u32 %r6453, 6;
sub.s32 %r6454, %r6453, %r1876;
mul.wide.s32 %rd2078, %r6454, 4;
add.s64 %rd2079, %rd1, %rd2078;
ld.local.u32 %r8603, [%rd2079];
ld.local.u32 %r8604, [%rd2079+-4];
and.b32 %r1882, %r1874, 31;
setp.eq.s32 %p1195, %r1882, 0;
@%p1195 bra $L__BB0_1407;
mov.u32 %r6455, 32;
sub.s32 %r6456, %r6455, %r1882;
shr.u32 %r6457, %r8604, %r6456;
shl.b32 %r6458, %r8603, %r1882;
add.s32 %r8603, %r6457, %r6458;
mul.wide.s32 %rd2080, %r1879, 4;
add.s64 %rd2081, %rd1, %rd2080;
ld.local.u32 %r6459, [%rd2081];
shr.u32 %r6460, %r6459, %r6456;
shl.b32 %r6461, %r8604, %r1882;
add.s32 %r8604, %r6460, %r6461;
$L__BB0_1407:
and.b32 %r6462, %r1873, -2147483648;
shr.u32 %r6463, %r8604, 30;
shl.b32 %r6464, %r8603, 2;
or.b32 %r6465, %r6463, %r6464;
shr.u32 %r6466, %r6465, 31;
shr.u32 %r6467, %r8603, 30;
add.s32 %r6468, %r6466, %r6467;
neg.s32 %r6469, %r6468;
setp.eq.s32 %p1196, %r6462, 0;
selp.b32 %r8605, %r6468, %r6469, %p1196;
setp.ne.s32 %p1197, %r6466, 0;
xor.b32 %r6470, %r6462, -2147483648;
selp.b32 %r6471, %r6470, %r6462, %p1197;
selp.b32 %r6472, -1, 0, %p1197;
xor.b32 %r6473, %r6465, %r6472;
shl.b32 %r6474, %r8604, 2;
xor.b32 %r6475, %r6474, %r6472;
cvt.u64.u32 %rd2082, %r6473;
cvt.u64.u32 %rd2083, %r6475;
bfi.b64 %rd2084, %rd2082, %rd2083, 32, 32;
cvt.rn.f64.s64 %fd189, %rd2084;
mul.f64 %fd190, %fd189, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4288, %fd190;
setp.eq.s32 %p1198, %r6471, 0;
neg.f32 %f4289, %f4288;
selp.f32 %f5701, %f4288, %f4289, %p1198;
$L__BB0_1409:
and.b32 %r1889, %r8605, 1;
setp.eq.s32 %p1199, %r1889, 0;
selp.f32 %f1578, %f5701, 0f3F800000, %p1199;
mul.rn.f32 %f1579, %f5701, %f5701;
mov.f32 %f5702, 0fB94D4153;
@%p1199 bra $L__BB0_1411;
mov.f32 %f4292, 0fBAB607ED;
mov.f32 %f4293, 0f37CBAC00;
fma.rn.f32 %f5702, %f4293, %f1579, %f4292;
$L__BB0_1411:
selp.f32 %f4294, 0f3C0885E4, 0f3D2AAABB, %p1199;
fma.rn.f32 %f4295, %f5702, %f1579, %f4294;
selp.f32 %f4296, 0fBE2AAAA8, 0fBEFFFFFF, %p1199;
fma.rn.f32 %f4297, %f4295, %f1579, %f4296;
mov.f32 %f4298, 0f00000000;
fma.rn.f32 %f4299, %f1579, %f1578, %f4298;
fma.rn.f32 %f5703, %f4297, %f4299, %f1578;
and.b32 %r6477, %r8605, 2;
setp.eq.s32 %p1201, %r6477, 0;
@%p1201 bra $L__BB0_1413;
mov.f32 %f4301, 0fBF800000;
fma.rn.f32 %f5703, %f5703, %f4301, %f4298;
$L__BB0_1413:
mul.f32 %f4302, %f1389, 0f3F22F983;
cvt.rni.s32.f32 %r8609, %f4302;
cvt.rn.f32.s32 %f4303, %r8609;
mov.f32 %f4304, 0fBFC90FDA;
fma.rn.f32 %f4305, %f4303, %f4304, %f1389;
mov.f32 %f4306, 0fB3A22168;
fma.rn.f32 %f4307, %f4303, %f4306, %f4305;
mov.f32 %f4308, 0fA7C234C5;
fma.rn.f32 %f5704, %f4303, %f4308, %f4307;
abs.f32 %f1586, %f1389;
setp.ltu.f32 %p1202, %f1586, 0f47CE4780;
@%p1202 bra $L__BB0_1421;
setp.eq.f32 %p1203, %f1586, 0f7F800000;
@%p1203 bra $L__BB0_1420;
bra.uni $L__BB0_1415;
$L__BB0_1420:
mov.f32 %f4311, 0f00000000;
mul.rn.f32 %f5704, %f1389, %f4311;
mov.u32 %r8609, 0;
bra.uni $L__BB0_1421;
$L__BB0_1415:
mov.b32 %r1891, %f1389;
shr.u32 %r6479, %r1891, 23;
and.b32 %r6480, %r6479, 255;
add.s32 %r1892, %r6480, -128;
shl.b32 %r6481, %r1891, 8;
or.b32 %r1893, %r6481, -2147483648;
shr.u32 %r1894, %r1892, 5;
mov.u64 %rd2699, 0;
mov.u32 %r8606, 0;
mov.u64 %rd2697, __cudart_i2opi_f;
mov.u64 %rd2698, %rd1;
$L__BB0_1416:
.pragma "nounroll";
ld.global.nc.u32 %r6482, [%rd2697];
mad.wide.u32 %rd2087, %r6482, %r1893, %rd2699;
shr.u64 %rd2699, %rd2087, 32;
st.local.u32 [%rd2698], %rd2087;
add.s64 %rd2698, %rd2698, 4;
add.s64 %rd2697, %rd2697, 4;
add.s32 %r8606, %r8606, 1;
setp.ne.s32 %p1204, %r8606, 6;
@%p1204 bra $L__BB0_1416;
st.local.u32 [%rd5], %rd2699;
mov.u32 %r6483, 4;
sub.s32 %r1897, %r6483, %r1894;
mov.u32 %r6484, 6;
sub.s32 %r6485, %r6484, %r1894;
mul.wide.s32 %rd2088, %r6485, 4;
add.s64 %rd2089, %rd1, %rd2088;
ld.local.u32 %r8607, [%rd2089];
ld.local.u32 %r8608, [%rd2089+-4];
and.b32 %r1900, %r1892, 31;
setp.eq.s32 %p1205, %r1900, 0;
@%p1205 bra $L__BB0_1419;
mov.u32 %r6486, 32;
sub.s32 %r6487, %r6486, %r1900;
shr.u32 %r6488, %r8608, %r6487;
shl.b32 %r6489, %r8607, %r1900;
add.s32 %r8607, %r6488, %r6489;
mul.wide.s32 %rd2090, %r1897, 4;
add.s64 %rd2091, %rd1, %rd2090;
ld.local.u32 %r6490, [%rd2091];
shr.u32 %r6491, %r6490, %r6487;
shl.b32 %r6492, %r8608, %r1900;
add.s32 %r8608, %r6491, %r6492;
$L__BB0_1419:
and.b32 %r6493, %r1891, -2147483648;
shr.u32 %r6494, %r8608, 30;
shl.b32 %r6495, %r8607, 2;
or.b32 %r6496, %r6494, %r6495;
shr.u32 %r6497, %r6496, 31;
shr.u32 %r6498, %r8607, 30;
add.s32 %r6499, %r6497, %r6498;
neg.s32 %r6500, %r6499;
setp.eq.s32 %p1206, %r6493, 0;
selp.b32 %r8609, %r6499, %r6500, %p1206;
setp.ne.s32 %p1207, %r6497, 0;
xor.b32 %r6501, %r6493, -2147483648;
selp.b32 %r6502, %r6501, %r6493, %p1207;
selp.b32 %r6503, -1, 0, %p1207;
xor.b32 %r6504, %r6496, %r6503;
shl.b32 %r6505, %r8608, 2;
xor.b32 %r6506, %r6505, %r6503;
cvt.u64.u32 %rd2092, %r6504;
cvt.u64.u32 %rd2093, %r6506;
bfi.b64 %rd2094, %rd2092, %rd2093, 32, 32;
cvt.rn.f64.s64 %fd191, %rd2094;
mul.f64 %fd192, %fd191, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4309, %fd192;
setp.eq.s32 %p1208, %r6502, 0;
neg.f32 %f4310, %f4309;
selp.f32 %f5704, %f4309, %f4310, %p1208;
$L__BB0_1421:
add.s32 %r1907, %r8609, 1;
and.b32 %r1908, %r1907, 1;
setp.eq.s32 %p1209, %r1908, 0;
selp.f32 %f1590, %f5704, 0f3F800000, %p1209;
mul.rn.f32 %f1591, %f5704, %f5704;
mov.f32 %f5705, 0fB94D4153;
@%p1209 bra $L__BB0_1423;
mov.f32 %f4313, 0fBAB607ED;
mov.f32 %f4314, 0f37CBAC00;
fma.rn.f32 %f5705, %f4314, %f1591, %f4313;
$L__BB0_1423:
selp.f32 %f4315, 0f3C0885E4, 0f3D2AAABB, %p1209;
fma.rn.f32 %f4316, %f5705, %f1591, %f4315;
selp.f32 %f4317, 0fBE2AAAA8, 0fBEFFFFFF, %p1209;
fma.rn.f32 %f4318, %f4316, %f1591, %f4317;
mov.f32 %f4319, 0f00000000;
fma.rn.f32 %f4320, %f1591, %f1590, %f4319;
fma.rn.f32 %f5706, %f4318, %f4320, %f1590;
and.b32 %r6508, %r1907, 2;
setp.eq.s32 %p1211, %r6508, 0;
@%p1211 bra $L__BB0_1425;
mov.f32 %f4322, 0fBF800000;
fma.rn.f32 %f5706, %f5706, %f4322, %f4319;
$L__BB0_1425:
add.f32 %f5707, %f5703, %f5706;
bra.uni $L__BB0_1426;
$L__BB0_1005:
mov.b32 %r1311, %f5348;
shr.u32 %r5425, %r1311, 23;
and.b32 %r5426, %r5425, 255;
add.s32 %r1312, %r5426, -128;
shl.b32 %r5427, %r1311, 8;
or.b32 %r1313, %r5427, -2147483648;
shr.u32 %r1314, %r1312, 5;
mov.u64 %rd2626, 0;
mov.u32 %r8482, 0;
mov.u64 %rd1661, __cudart_i2opi_f;
mov.u64 %rd2627, %rd2626;
$L__BB0_1006:
.pragma "nounroll";
shl.b64 %rd1660, %rd2626, 2;
add.s64 %rd1662, %rd1661, %rd1660;
ld.global.nc.u32 %r5428, [%rd1662];
mad.wide.u32 %rd1663, %r5428, %r1313, %rd2627;
shr.u64 %rd2627, %rd1663, 32;
add.s64 %rd1664, %rd1, %rd1660;
st.local.u32 [%rd1664], %rd1663;
add.s32 %r8482, %r8482, 1;
cvt.s64.s32 %rd2626, %r8482;
setp.ne.s32 %p856, %r8482, 6;
@%p856 bra $L__BB0_1006;
st.local.u32 [%rd5], %rd2627;
mov.u32 %r5429, 4;
sub.s32 %r1317, %r5429, %r1314;
mov.u32 %r5430, 6;
sub.s32 %r5431, %r5430, %r1314;
mul.wide.s32 %rd1665, %r5431, 4;
add.s64 %rd1666, %rd1, %rd1665;
ld.local.u32 %r8483, [%rd1666];
ld.local.u32 %r8484, [%rd1666+-4];
and.b32 %r1320, %r1312, 31;
setp.eq.s32 %p857, %r1320, 0;
@%p857 bra $L__BB0_1009;
mov.u32 %r5432, 32;
sub.s32 %r5433, %r5432, %r1320;
shr.u32 %r5434, %r8484, %r5433;
shl.b32 %r5435, %r8483, %r1320;
add.s32 %r8483, %r5434, %r5435;
mul.wide.s32 %rd1667, %r1317, 4;
add.s64 %rd1668, %rd1, %rd1667;
ld.local.u32 %r5436, [%rd1668];
shr.u32 %r5437, %r5436, %r5433;
shl.b32 %r5438, %r8484, %r1320;
add.s32 %r8484, %r5437, %r5438;
$L__BB0_1009:
and.b32 %r5439, %r1311, -2147483648;
shr.u32 %r5440, %r8484, 30;
shl.b32 %r5441, %r8483, 2;
or.b32 %r5442, %r5440, %r5441;
shr.u32 %r5443, %r5442, 31;
shr.u32 %r5444, %r8483, 30;
add.s32 %r5445, %r5443, %r5444;
neg.s32 %r5446, %r5445;
setp.eq.s32 %p858, %r5439, 0;
selp.b32 %r8485, %r5445, %r5446, %p858;
setp.ne.s32 %p859, %r5443, 0;
xor.b32 %r5447, %r5439, -2147483648;
selp.b32 %r5448, %r5447, %r5439, %p859;
selp.b32 %r5449, -1, 0, %p859;
xor.b32 %r5450, %r5442, %r5449;
shl.b32 %r5451, %r8484, 2;
xor.b32 %r5452, %r5451, %r5449;
cvt.u64.u32 %rd1669, %r5450;
cvt.u64.u32 %rd1670, %r5452;
bfi.b64 %rd1671, %rd1669, %rd1670, 32, 32;
cvt.rn.f64.s64 %fd129, %rd1671;
mul.f64 %fd130, %fd129, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3643, %fd130;
setp.eq.s32 %p860, %r5448, 0;
neg.f32 %f3644, %f3643;
selp.f32 %f5560, %f3643, %f3644, %p860;
$L__BB0_1011:
and.b32 %r1327, %r8485, 1;
setp.eq.s32 %p861, %r1327, 0;
selp.f32 %f1120, %f5560, 0f3F800000, %p861;
mul.rn.f32 %f1121, %f5560, %f5560;
mov.f32 %f5561, 0fB94D4153;
@%p861 bra $L__BB0_1013;
mov.f32 %f3647, 0fBAB607ED;
mov.f32 %f3648, 0f37CBAC00;
fma.rn.f32 %f5561, %f3648, %f1121, %f3647;
$L__BB0_1013:
selp.f32 %f3649, 0f3C0885E4, 0f3D2AAABB, %p861;
fma.rn.f32 %f3650, %f5561, %f1121, %f3649;
selp.f32 %f3651, 0fBE2AAAA8, 0fBEFFFFFF, %p861;
fma.rn.f32 %f3652, %f3650, %f1121, %f3651;
mov.f32 %f3653, 0f00000000;
fma.rn.f32 %f3654, %f1121, %f1120, %f3653;
fma.rn.f32 %f5213, %f3652, %f3654, %f1120;
and.b32 %r5454, %r8485, 2;
setp.eq.s32 %p863, %r5454, 0;
@%p863 bra $L__BB0_1015;
mov.f32 %f3656, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3656, %f3653;
$L__BB0_1015:
setp.lt.s32 %p16, %r14, %r1309;
@%p853 bra $L__BB0_1028;
mul.f32 %f3657, %f5531, 0f3F22F983;
cvt.rni.s32.f32 %r8489, %f3657;
cvt.rn.f32.s32 %f3658, %r8489;
mov.f32 %f3659, 0fBFC90FDA;
fma.rn.f32 %f3660, %f3658, %f3659, %f5531;
mov.f32 %f3661, 0fB3A22168;
fma.rn.f32 %f3662, %f3658, %f3661, %f3660;
mov.f32 %f3663, 0fA7C234C5;
fma.rn.f32 %f5564, %f3658, %f3663, %f3662;
abs.f32 %f1129, %f5531;
setp.ltu.f32 %p865, %f1129, 0f47CE4780;
@%p865 bra $L__BB0_1024;
setp.eq.f32 %p866, %f1129, 0f7F800000;
@%p866 bra $L__BB0_1023;
bra.uni $L__BB0_1018;
$L__BB0_1023:
mov.f32 %f3666, 0f00000000;
mul.rn.f32 %f5564, %f5531, %f3666;
mov.u32 %r8489, 0;
bra.uni $L__BB0_1024;
$L__BB0_1018:
mov.b32 %r1329, %f5531;
shr.u32 %r5456, %r1329, 23;
and.b32 %r5457, %r5456, 255;
add.s32 %r1330, %r5457, -128;
shl.b32 %r5458, %r1329, 8;
or.b32 %r1331, %r5458, -2147483648;
shr.u32 %r1332, %r1330, 5;
mov.u64 %rd2628, 0;
mov.u32 %r8486, 0;
mov.u64 %rd1675, __cudart_i2opi_f;
mov.u64 %rd2629, %rd2628;
$L__BB0_1019:
.pragma "nounroll";
shl.b64 %rd1674, %rd2628, 2;
add.s64 %rd1676, %rd1675, %rd1674;
ld.global.nc.u32 %r5459, [%rd1676];
mad.wide.u32 %rd1677, %r5459, %r1331, %rd2629;
shr.u64 %rd2629, %rd1677, 32;
add.s64 %rd1678, %rd1, %rd1674;
st.local.u32 [%rd1678], %rd1677;
add.s32 %r8486, %r8486, 1;
cvt.s64.s32 %rd2628, %r8486;
setp.ne.s32 %p867, %r8486, 6;
@%p867 bra $L__BB0_1019;
st.local.u32 [%rd5], %rd2629;
mov.u32 %r5460, 4;
sub.s32 %r1335, %r5460, %r1332;
mov.u32 %r5461, 6;
sub.s32 %r5462, %r5461, %r1332;
mul.wide.s32 %rd1679, %r5462, 4;
add.s64 %rd1680, %rd1, %rd1679;
ld.local.u32 %r8487, [%rd1680];
ld.local.u32 %r8488, [%rd1680+-4];
and.b32 %r1338, %r1330, 31;
setp.eq.s32 %p868, %r1338, 0;
@%p868 bra $L__BB0_1022;
mov.u32 %r5463, 32;
sub.s32 %r5464, %r5463, %r1338;
shr.u32 %r5465, %r8488, %r5464;
shl.b32 %r5466, %r8487, %r1338;
add.s32 %r8487, %r5465, %r5466;
mul.wide.s32 %rd1681, %r1335, 4;
add.s64 %rd1682, %rd1, %rd1681;
ld.local.u32 %r5467, [%rd1682];
shr.u32 %r5468, %r5467, %r5464;
shl.b32 %r5469, %r8488, %r1338;
add.s32 %r8488, %r5468, %r5469;
$L__BB0_1022:
and.b32 %r5470, %r1329, -2147483648;
shr.u32 %r5471, %r8488, 30;
shl.b32 %r5472, %r8487, 2;
or.b32 %r5473, %r5471, %r5472;
shr.u32 %r5474, %r5473, 31;
shr.u32 %r5475, %r8487, 30;
add.s32 %r5476, %r5474, %r5475;
neg.s32 %r5477, %r5476;
setp.eq.s32 %p869, %r5470, 0;
selp.b32 %r8489, %r5476, %r5477, %p869;
setp.ne.s32 %p870, %r5474, 0;
xor.b32 %r5478, %r5470, -2147483648;
selp.b32 %r5479, %r5478, %r5470, %p870;
selp.b32 %r5480, -1, 0, %p870;
xor.b32 %r5481, %r5473, %r5480;
shl.b32 %r5482, %r8488, 2;
xor.b32 %r5483, %r5482, %r5480;
cvt.u64.u32 %rd1683, %r5481;
cvt.u64.u32 %rd1684, %r5483;
bfi.b64 %rd1685, %rd1683, %rd1684, 32, 32;
cvt.rn.f64.s64 %fd131, %rd1685;
mul.f64 %fd132, %fd131, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3664, %fd132;
setp.eq.s32 %p871, %r5479, 0;
neg.f32 %f3665, %f3664;
selp.f32 %f5564, %f3664, %f3665, %p871;
$L__BB0_1024:
add.s32 %r1345, %r8489, 1;
and.b32 %r1346, %r1345, 1;
setp.eq.s32 %p872, %r1346, 0;
selp.f32 %f1133, %f5564, 0f3F800000, %p872;
mul.rn.f32 %f1134, %f5564, %f5564;
mov.f32 %f5565, 0fB94D4153;
@%p872 bra $L__BB0_1026;
mov.f32 %f3668, 0fBAB607ED;
mov.f32 %f3669, 0f37CBAC00;
fma.rn.f32 %f5565, %f3669, %f1134, %f3668;
$L__BB0_1026:
selp.f32 %f3670, 0f3C0885E4, 0f3D2AAABB, %p872;
fma.rn.f32 %f3671, %f5565, %f1134, %f3670;
selp.f32 %f3672, 0fBE2AAAA8, 0fBEFFFFFF, %p872;
fma.rn.f32 %f3673, %f3671, %f1134, %f3672;
mov.f32 %f3674, 0f00000000;
fma.rn.f32 %f3675, %f1134, %f1133, %f3674;
fma.rn.f32 %f5215, %f3673, %f3675, %f1133;
and.b32 %r5485, %r1345, 2;
setp.eq.s32 %p874, %r5485, 0;
@%p874 bra $L__BB0_1028;
mov.f32 %f3677, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3677, %f3674;
$L__BB0_1028:
selp.f32 %f1141, %f5215, %f5216, %p16;
selp.f32 %f1142, %f5213, %f5214, %p16;
@%p853 bra $L__BB0_1030;
add.f32 %f5714, %f1142, %f1141;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1030:
@%p820 bra $L__BB0_1059;
shl.b32 %r5487, %r12, 5;
mov.u32 %r5488, -32;
sub.s32 %r1347, %r5488, %r5487;
setp.ge.s32 %p878, %r14, %r1347;
@%p878 bra $L__BB0_1044;
mul.f32 %f3680, %f5347, 0f3F22F983;
cvt.rni.s32.f32 %r8493, %f3680;
cvt.rn.f32.s32 %f3681, %r8493;
mov.f32 %f3682, 0fBFC90FDA;
fma.rn.f32 %f3683, %f3681, %f3682, %f5347;
mov.f32 %f3684, 0fB3A22168;
fma.rn.f32 %f3685, %f3681, %f3684, %f3683;
mov.f32 %f3686, 0fA7C234C5;
fma.rn.f32 %f5573, %f3681, %f3686, %f3685;
abs.f32 %f1150, %f5347;
setp.ltu.f32 %p879, %f1150, 0f47CE4780;
@%p879 bra $L__BB0_1040;
setp.eq.f32 %p880, %f1150, 0f7F800000;
@%p880 bra $L__BB0_1039;
bra.uni $L__BB0_1034;
$L__BB0_1039:
mov.f32 %f3689, 0f00000000;
mul.rn.f32 %f5573, %f5347, %f3689;
mov.u32 %r8493, 0;
bra.uni $L__BB0_1040;
$L__BB0_1034:
mov.b32 %r1349, %f5347;
shr.u32 %r5490, %r1349, 23;
and.b32 %r5491, %r5490, 255;
add.s32 %r1350, %r5491, -128;
shl.b32 %r5492, %r1349, 8;
or.b32 %r1351, %r5492, -2147483648;
shr.u32 %r1352, %r1350, 5;
mov.u64 %rd2630, 0;
mov.u32 %r8490, 0;
mov.u64 %rd1689, __cudart_i2opi_f;
mov.u64 %rd2631, %rd2630;
$L__BB0_1035:
.pragma "nounroll";
shl.b64 %rd1688, %rd2630, 2;
add.s64 %rd1690, %rd1689, %rd1688;
ld.global.nc.u32 %r5493, [%rd1690];
mad.wide.u32 %rd1691, %r5493, %r1351, %rd2631;
shr.u64 %rd2631, %rd1691, 32;
add.s64 %rd1692, %rd1, %rd1688;
st.local.u32 [%rd1692], %rd1691;
add.s32 %r8490, %r8490, 1;
cvt.s64.s32 %rd2630, %r8490;
setp.ne.s32 %p881, %r8490, 6;
@%p881 bra $L__BB0_1035;
st.local.u32 [%rd5], %rd2631;
mov.u32 %r5494, 4;
sub.s32 %r1355, %r5494, %r1352;
mov.u32 %r5495, 6;
sub.s32 %r5496, %r5495, %r1352;
mul.wide.s32 %rd1693, %r5496, 4;
add.s64 %rd1694, %rd1, %rd1693;
ld.local.u32 %r8491, [%rd1694];
ld.local.u32 %r8492, [%rd1694+-4];
and.b32 %r1358, %r1350, 31;
setp.eq.s32 %p882, %r1358, 0;
@%p882 bra $L__BB0_1038;
mov.u32 %r5497, 32;
sub.s32 %r5498, %r5497, %r1358;
shr.u32 %r5499, %r8492, %r5498;
shl.b32 %r5500, %r8491, %r1358;
add.s32 %r8491, %r5499, %r5500;
mul.wide.s32 %rd1695, %r1355, 4;
add.s64 %rd1696, %rd1, %rd1695;
ld.local.u32 %r5501, [%rd1696];
shr.u32 %r5502, %r5501, %r5498;
shl.b32 %r5503, %r8492, %r1358;
add.s32 %r8492, %r5502, %r5503;
$L__BB0_1038:
and.b32 %r5504, %r1349, -2147483648;
shr.u32 %r5505, %r8492, 30;
shl.b32 %r5506, %r8491, 2;
or.b32 %r5507, %r5505, %r5506;
shr.u32 %r5508, %r5507, 31;
shr.u32 %r5509, %r8491, 30;
add.s32 %r5510, %r5508, %r5509;
neg.s32 %r5511, %r5510;
setp.eq.s32 %p883, %r5504, 0;
selp.b32 %r8493, %r5510, %r5511, %p883;
setp.ne.s32 %p884, %r5508, 0;
xor.b32 %r5512, %r5504, -2147483648;
selp.b32 %r5513, %r5512, %r5504, %p884;
selp.b32 %r5514, -1, 0, %p884;
xor.b32 %r5515, %r5507, %r5514;
shl.b32 %r5516, %r8492, 2;
xor.b32 %r5517, %r5516, %r5514;
cvt.u64.u32 %rd1697, %r5515;
cvt.u64.u32 %rd1698, %r5517;
bfi.b64 %rd1699, %rd1697, %rd1698, 32, 32;
cvt.rn.f64.s64 %fd133, %rd1699;
mul.f64 %fd134, %fd133, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3687, %fd134;
setp.eq.s32 %p885, %r5513, 0;
neg.f32 %f3688, %f3687;
selp.f32 %f5573, %f3687, %f3688, %p885;
$L__BB0_1040:
and.b32 %r1365, %r8493, 1;
setp.eq.s32 %p886, %r1365, 0;
selp.f32 %f1154, %f5573, 0f3F800000, %p886;
mul.rn.f32 %f1155, %f5573, %f5573;
mov.f32 %f5574, 0fB94D4153;
@%p886 bra $L__BB0_1042;
mov.f32 %f3691, 0fBAB607ED;
mov.f32 %f3692, 0f37CBAC00;
fma.rn.f32 %f5574, %f3692, %f1155, %f3691;
$L__BB0_1042:
selp.f32 %f3693, 0f3C0885E4, 0f3D2AAABB, %p886;
fma.rn.f32 %f3694, %f5574, %f1155, %f3693;
selp.f32 %f3695, 0fBE2AAAA8, 0fBEFFFFFF, %p886;
fma.rn.f32 %f3696, %f3694, %f1155, %f3695;
mov.f32 %f3697, 0f00000000;
fma.rn.f32 %f3698, %f1155, %f1154, %f3697;
fma.rn.f32 %f5213, %f3696, %f3698, %f1154;
and.b32 %r5519, %r8493, 2;
setp.eq.s32 %p888, %r5519, 0;
@%p888 bra $L__BB0_1044;
mov.f32 %f3700, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3700, %f3697;
$L__BB0_1044:
setp.lt.s32 %p17, %r14, %r1347;
@%p878 bra $L__BB0_1057;
mul.f32 %f3701, %f5339, 0f3F22F983;
cvt.rni.s32.f32 %r8497, %f3701;
cvt.rn.f32.s32 %f3702, %r8497;
mov.f32 %f3703, 0fBFC90FDA;
fma.rn.f32 %f3704, %f3702, %f3703, %f5339;
mov.f32 %f3705, 0fB3A22168;
fma.rn.f32 %f3706, %f3702, %f3705, %f3704;
mov.f32 %f3707, 0fA7C234C5;
fma.rn.f32 %f5577, %f3702, %f3707, %f3706;
abs.f32 %f1163, %f5339;
setp.ltu.f32 %p890, %f1163, 0f47CE4780;
@%p890 bra $L__BB0_1053;
setp.eq.f32 %p891, %f1163, 0f7F800000;
@%p891 bra $L__BB0_1052;
bra.uni $L__BB0_1047;
$L__BB0_1052:
mov.f32 %f3710, 0f00000000;
mul.rn.f32 %f5577, %f5339, %f3710;
mov.u32 %r8497, 0;
bra.uni $L__BB0_1053;
$L__BB0_1047:
mov.b32 %r1367, %f5339;
shr.u32 %r5521, %r1367, 23;
and.b32 %r5522, %r5521, 255;
add.s32 %r1368, %r5522, -128;
shl.b32 %r5523, %r1367, 8;
or.b32 %r1369, %r5523, -2147483648;
shr.u32 %r1370, %r1368, 5;
mov.u64 %rd2632, 0;
mov.u32 %r8494, 0;
mov.u64 %rd1703, __cudart_i2opi_f;
mov.u64 %rd2633, %rd2632;
$L__BB0_1048:
.pragma "nounroll";
shl.b64 %rd1702, %rd2632, 2;
add.s64 %rd1704, %rd1703, %rd1702;
ld.global.nc.u32 %r5524, [%rd1704];
mad.wide.u32 %rd1705, %r5524, %r1369, %rd2633;
shr.u64 %rd2633, %rd1705, 32;
add.s64 %rd1706, %rd1, %rd1702;
st.local.u32 [%rd1706], %rd1705;
add.s32 %r8494, %r8494, 1;
cvt.s64.s32 %rd2632, %r8494;
setp.ne.s32 %p892, %r8494, 6;
@%p892 bra $L__BB0_1048;
st.local.u32 [%rd5], %rd2633;
mov.u32 %r5525, 4;
sub.s32 %r1373, %r5525, %r1370;
mov.u32 %r5526, 6;
sub.s32 %r5527, %r5526, %r1370;
mul.wide.s32 %rd1707, %r5527, 4;
add.s64 %rd1708, %rd1, %rd1707;
ld.local.u32 %r8495, [%rd1708];
ld.local.u32 %r8496, [%rd1708+-4];
and.b32 %r1376, %r1368, 31;
setp.eq.s32 %p893, %r1376, 0;
@%p893 bra $L__BB0_1051;
mov.u32 %r5528, 32;
sub.s32 %r5529, %r5528, %r1376;
shr.u32 %r5530, %r8496, %r5529;
shl.b32 %r5531, %r8495, %r1376;
add.s32 %r8495, %r5530, %r5531;
mul.wide.s32 %rd1709, %r1373, 4;
add.s64 %rd1710, %rd1, %rd1709;
ld.local.u32 %r5532, [%rd1710];
shr.u32 %r5533, %r5532, %r5529;
shl.b32 %r5534, %r8496, %r1376;
add.s32 %r8496, %r5533, %r5534;
$L__BB0_1051:
and.b32 %r5535, %r1367, -2147483648;
shr.u32 %r5536, %r8496, 30;
shl.b32 %r5537, %r8495, 2;
or.b32 %r5538, %r5536, %r5537;
shr.u32 %r5539, %r5538, 31;
shr.u32 %r5540, %r8495, 30;
add.s32 %r5541, %r5539, %r5540;
neg.s32 %r5542, %r5541;
setp.eq.s32 %p894, %r5535, 0;
selp.b32 %r8497, %r5541, %r5542, %p894;
setp.ne.s32 %p895, %r5539, 0;
xor.b32 %r5543, %r5535, -2147483648;
selp.b32 %r5544, %r5543, %r5535, %p895;
selp.b32 %r5545, -1, 0, %p895;
xor.b32 %r5546, %r5538, %r5545;
shl.b32 %r5547, %r8496, 2;
xor.b32 %r5548, %r5547, %r5545;
cvt.u64.u32 %rd1711, %r5546;
cvt.u64.u32 %rd1712, %r5548;
bfi.b64 %rd1713, %rd1711, %rd1712, 32, 32;
cvt.rn.f64.s64 %fd135, %rd1713;
mul.f64 %fd136, %fd135, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3708, %fd136;
setp.eq.s32 %p896, %r5544, 0;
neg.f32 %f3709, %f3708;
selp.f32 %f5577, %f3708, %f3709, %p896;
$L__BB0_1053:
add.s32 %r1383, %r8497, 1;
and.b32 %r1384, %r1383, 1;
setp.eq.s32 %p897, %r1384, 0;
selp.f32 %f1167, %f5577, 0f3F800000, %p897;
mul.rn.f32 %f1168, %f5577, %f5577;
mov.f32 %f5578, 0fB94D4153;
@%p897 bra $L__BB0_1055;
mov.f32 %f3712, 0fBAB607ED;
mov.f32 %f3713, 0f37CBAC00;
fma.rn.f32 %f5578, %f3713, %f1168, %f3712;
$L__BB0_1055:
selp.f32 %f3714, 0f3C0885E4, 0f3D2AAABB, %p897;
fma.rn.f32 %f3715, %f5578, %f1168, %f3714;
selp.f32 %f3716, 0fBE2AAAA8, 0fBEFFFFFF, %p897;
fma.rn.f32 %f3717, %f3715, %f1168, %f3716;
mov.f32 %f3718, 0f00000000;
fma.rn.f32 %f3719, %f1168, %f1167, %f3718;
fma.rn.f32 %f5215, %f3717, %f3719, %f1167;
and.b32 %r5550, %r1383, 2;
setp.eq.s32 %p899, %r5550, 0;
@%p899 bra $L__BB0_1057;
mov.f32 %f3721, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3721, %f3718;
$L__BB0_1057:
selp.f32 %f1175, %f5215, %f5216, %p17;
selp.f32 %f1176, %f5213, %f5214, %p17;
@%p878 bra $L__BB0_1059;
add.f32 %f5713, %f1176, %f1175;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1059:
@%p824 bra $L__BB0_1088;
shl.b32 %r5552, %r12, 5;
neg.s32 %r1385, %r5552;
setp.ge.s32 %p903, %r14, %r1385;
@%p903 bra $L__BB0_1073;
mul.f32 %f3724, %f5346, 0f3F22F983;
cvt.rni.s32.f32 %r8501, %f3724;
cvt.rn.f32.s32 %f3725, %r8501;
mov.f32 %f3726, 0fBFC90FDA;
fma.rn.f32 %f3727, %f3725, %f3726, %f5346;
mov.f32 %f3728, 0fB3A22168;
fma.rn.f32 %f3729, %f3725, %f3728, %f3727;
mov.f32 %f3730, 0fA7C234C5;
fma.rn.f32 %f5586, %f3725, %f3730, %f3729;
abs.f32 %f1184, %f5346;
setp.ltu.f32 %p904, %f1184, 0f47CE4780;
@%p904 bra $L__BB0_1069;
setp.eq.f32 %p905, %f1184, 0f7F800000;
@%p905 bra $L__BB0_1068;
bra.uni $L__BB0_1063;
$L__BB0_1068:
mov.f32 %f3733, 0f00000000;
mul.rn.f32 %f5586, %f5346, %f3733;
mov.u32 %r8501, 0;
bra.uni $L__BB0_1069;
$L__BB0_1063:
mov.b32 %r1387, %f5346;
shr.u32 %r5554, %r1387, 23;
and.b32 %r5555, %r5554, 255;
add.s32 %r1388, %r5555, -128;
shl.b32 %r5556, %r1387, 8;
or.b32 %r1389, %r5556, -2147483648;
shr.u32 %r1390, %r1388, 5;
mov.u64 %rd2634, 0;
mov.u32 %r8498, 0;
mov.u64 %rd1717, __cudart_i2opi_f;
mov.u64 %rd2635, %rd2634;
$L__BB0_1064:
.pragma "nounroll";
shl.b64 %rd1716, %rd2634, 2;
add.s64 %rd1718, %rd1717, %rd1716;
ld.global.nc.u32 %r5557, [%rd1718];
mad.wide.u32 %rd1719, %r5557, %r1389, %rd2635;
shr.u64 %rd2635, %rd1719, 32;
add.s64 %rd1720, %rd1, %rd1716;
st.local.u32 [%rd1720], %rd1719;
add.s32 %r8498, %r8498, 1;
cvt.s64.s32 %rd2634, %r8498;
setp.ne.s32 %p906, %r8498, 6;
@%p906 bra $L__BB0_1064;
st.local.u32 [%rd5], %rd2635;
mov.u32 %r5558, 4;
sub.s32 %r1393, %r5558, %r1390;
mov.u32 %r5559, 6;
sub.s32 %r5560, %r5559, %r1390;
mul.wide.s32 %rd1721, %r5560, 4;
add.s64 %rd1722, %rd1, %rd1721;
ld.local.u32 %r8499, [%rd1722];
ld.local.u32 %r8500, [%rd1722+-4];
and.b32 %r1396, %r1388, 31;
setp.eq.s32 %p907, %r1396, 0;
@%p907 bra $L__BB0_1067;
mov.u32 %r5561, 32;
sub.s32 %r5562, %r5561, %r1396;
shr.u32 %r5563, %r8500, %r5562;
shl.b32 %r5564, %r8499, %r1396;
add.s32 %r8499, %r5563, %r5564;
mul.wide.s32 %rd1723, %r1393, 4;
add.s64 %rd1724, %rd1, %rd1723;
ld.local.u32 %r5565, [%rd1724];
shr.u32 %r5566, %r5565, %r5562;
shl.b32 %r5567, %r8500, %r1396;
add.s32 %r8500, %r5566, %r5567;
$L__BB0_1067:
and.b32 %r5568, %r1387, -2147483648;
shr.u32 %r5569, %r8500, 30;
shl.b32 %r5570, %r8499, 2;
or.b32 %r5571, %r5569, %r5570;
shr.u32 %r5572, %r5571, 31;
shr.u32 %r5573, %r8499, 30;
add.s32 %r5574, %r5572, %r5573;
neg.s32 %r5575, %r5574;
setp.eq.s32 %p908, %r5568, 0;
selp.b32 %r8501, %r5574, %r5575, %p908;
setp.ne.s32 %p909, %r5572, 0;
xor.b32 %r5576, %r5568, -2147483648;
selp.b32 %r5577, %r5576, %r5568, %p909;
selp.b32 %r5578, -1, 0, %p909;
xor.b32 %r5579, %r5571, %r5578;
shl.b32 %r5580, %r8500, 2;
xor.b32 %r5581, %r5580, %r5578;
cvt.u64.u32 %rd1725, %r5579;
cvt.u64.u32 %rd1726, %r5581;
bfi.b64 %rd1727, %rd1725, %rd1726, 32, 32;
cvt.rn.f64.s64 %fd137, %rd1727;
mul.f64 %fd138, %fd137, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3731, %fd138;
setp.eq.s32 %p910, %r5577, 0;
neg.f32 %f3732, %f3731;
selp.f32 %f5586, %f3731, %f3732, %p910;
$L__BB0_1069:
and.b32 %r1403, %r8501, 1;
setp.eq.s32 %p911, %r1403, 0;
selp.f32 %f1188, %f5586, 0f3F800000, %p911;
mul.rn.f32 %f1189, %f5586, %f5586;
mov.f32 %f5587, 0fB94D4153;
@%p911 bra $L__BB0_1071;
mov.f32 %f3735, 0fBAB607ED;
mov.f32 %f3736, 0f37CBAC00;
fma.rn.f32 %f5587, %f3736, %f1189, %f3735;
$L__BB0_1071:
selp.f32 %f3737, 0f3C0885E4, 0f3D2AAABB, %p911;
fma.rn.f32 %f3738, %f5587, %f1189, %f3737;
selp.f32 %f3739, 0fBE2AAAA8, 0fBEFFFFFF, %p911;
fma.rn.f32 %f3740, %f3738, %f1189, %f3739;
mov.f32 %f3741, 0f00000000;
fma.rn.f32 %f3742, %f1189, %f1188, %f3741;
fma.rn.f32 %f5213, %f3740, %f3742, %f1188;
and.b32 %r5583, %r8501, 2;
setp.eq.s32 %p913, %r5583, 0;
@%p913 bra $L__BB0_1073;
mov.f32 %f3744, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3744, %f3741;
$L__BB0_1073:
setp.lt.s32 %p18, %r14, %r1385;
@%p903 bra $L__BB0_1086;
mul.f32 %f3745, %f5338, 0f3F22F983;
cvt.rni.s32.f32 %r8505, %f3745;
cvt.rn.f32.s32 %f3746, %r8505;
mov.f32 %f3747, 0fBFC90FDA;
fma.rn.f32 %f3748, %f3746, %f3747, %f5338;
mov.f32 %f3749, 0fB3A22168;
fma.rn.f32 %f3750, %f3746, %f3749, %f3748;
mov.f32 %f3751, 0fA7C234C5;
fma.rn.f32 %f5590, %f3746, %f3751, %f3750;
abs.f32 %f1197, %f5338;
setp.ltu.f32 %p915, %f1197, 0f47CE4780;
@%p915 bra $L__BB0_1082;
setp.eq.f32 %p916, %f1197, 0f7F800000;
@%p916 bra $L__BB0_1081;
bra.uni $L__BB0_1076;
$L__BB0_1081:
mov.f32 %f3754, 0f00000000;
mul.rn.f32 %f5590, %f5338, %f3754;
mov.u32 %r8505, 0;
bra.uni $L__BB0_1082;
$L__BB0_1076:
mov.b32 %r1405, %f5338;
shr.u32 %r5585, %r1405, 23;
and.b32 %r5586, %r5585, 255;
add.s32 %r1406, %r5586, -128;
shl.b32 %r5587, %r1405, 8;
or.b32 %r1407, %r5587, -2147483648;
shr.u32 %r1408, %r1406, 5;
mov.u64 %rd2636, 0;
mov.u32 %r8502, 0;
mov.u64 %rd1731, __cudart_i2opi_f;
mov.u64 %rd2637, %rd2636;
$L__BB0_1077:
.pragma "nounroll";
shl.b64 %rd1730, %rd2636, 2;
add.s64 %rd1732, %rd1731, %rd1730;
ld.global.nc.u32 %r5588, [%rd1732];
mad.wide.u32 %rd1733, %r5588, %r1407, %rd2637;
shr.u64 %rd2637, %rd1733, 32;
add.s64 %rd1734, %rd1, %rd1730;
st.local.u32 [%rd1734], %rd1733;
add.s32 %r8502, %r8502, 1;
cvt.s64.s32 %rd2636, %r8502;
setp.ne.s32 %p917, %r8502, 6;
@%p917 bra $L__BB0_1077;
st.local.u32 [%rd5], %rd2637;
mov.u32 %r5589, 4;
sub.s32 %r1411, %r5589, %r1408;
mov.u32 %r5590, 6;
sub.s32 %r5591, %r5590, %r1408;
mul.wide.s32 %rd1735, %r5591, 4;
add.s64 %rd1736, %rd1, %rd1735;
ld.local.u32 %r8503, [%rd1736];
ld.local.u32 %r8504, [%rd1736+-4];
and.b32 %r1414, %r1406, 31;
setp.eq.s32 %p918, %r1414, 0;
@%p918 bra $L__BB0_1080;
mov.u32 %r5592, 32;
sub.s32 %r5593, %r5592, %r1414;
shr.u32 %r5594, %r8504, %r5593;
shl.b32 %r5595, %r8503, %r1414;
add.s32 %r8503, %r5594, %r5595;
mul.wide.s32 %rd1737, %r1411, 4;
add.s64 %rd1738, %rd1, %rd1737;
ld.local.u32 %r5596, [%rd1738];
shr.u32 %r5597, %r5596, %r5593;
shl.b32 %r5598, %r8504, %r1414;
add.s32 %r8504, %r5597, %r5598;
$L__BB0_1080:
and.b32 %r5599, %r1405, -2147483648;
shr.u32 %r5600, %r8504, 30;
shl.b32 %r5601, %r8503, 2;
or.b32 %r5602, %r5600, %r5601;
shr.u32 %r5603, %r5602, 31;
shr.u32 %r5604, %r8503, 30;
add.s32 %r5605, %r5603, %r5604;
neg.s32 %r5606, %r5605;
setp.eq.s32 %p919, %r5599, 0;
selp.b32 %r8505, %r5605, %r5606, %p919;
setp.ne.s32 %p920, %r5603, 0;
xor.b32 %r5607, %r5599, -2147483648;
selp.b32 %r5608, %r5607, %r5599, %p920;
selp.b32 %r5609, -1, 0, %p920;
xor.b32 %r5610, %r5602, %r5609;
shl.b32 %r5611, %r8504, 2;
xor.b32 %r5612, %r5611, %r5609;
cvt.u64.u32 %rd1739, %r5610;
cvt.u64.u32 %rd1740, %r5612;
bfi.b64 %rd1741, %rd1739, %rd1740, 32, 32;
cvt.rn.f64.s64 %fd139, %rd1741;
mul.f64 %fd140, %fd139, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3752, %fd140;
setp.eq.s32 %p921, %r5608, 0;
neg.f32 %f3753, %f3752;
selp.f32 %f5590, %f3752, %f3753, %p921;
$L__BB0_1082:
add.s32 %r1421, %r8505, 1;
and.b32 %r1422, %r1421, 1;
setp.eq.s32 %p922, %r1422, 0;
selp.f32 %f1201, %f5590, 0f3F800000, %p922;
mul.rn.f32 %f1202, %f5590, %f5590;
mov.f32 %f5591, 0fB94D4153;
@%p922 bra $L__BB0_1084;
mov.f32 %f3756, 0fBAB607ED;
mov.f32 %f3757, 0f37CBAC00;
fma.rn.f32 %f5591, %f3757, %f1202, %f3756;
$L__BB0_1084:
selp.f32 %f3758, 0f3C0885E4, 0f3D2AAABB, %p922;
fma.rn.f32 %f3759, %f5591, %f1202, %f3758;
selp.f32 %f3760, 0fBE2AAAA8, 0fBEFFFFFF, %p922;
fma.rn.f32 %f3761, %f3759, %f1202, %f3760;
mov.f32 %f3762, 0f00000000;
fma.rn.f32 %f3763, %f1202, %f1201, %f3762;
fma.rn.f32 %f5215, %f3761, %f3763, %f1201;
and.b32 %r5614, %r1421, 2;
setp.eq.s32 %p924, %r5614, 0;
@%p924 bra $L__BB0_1086;
mov.f32 %f3765, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3765, %f3762;
$L__BB0_1086:
selp.f32 %f1209, %f5215, %f5216, %p18;
selp.f32 %f1210, %f5213, %f5214, %p18;
@%p903 bra $L__BB0_1088;
add.f32 %f5712, %f1210, %f1209;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1088:
@%p824 bra $L__BB0_1117;
shl.b32 %r5616, %r12, 5;
mov.u32 %r5617, -32;
sub.s32 %r1423, %r5617, %r5616;
setp.ge.s32 %p928, %r14, %r1423;
@%p928 bra $L__BB0_1102;
mul.f32 %f3768, %f5345, 0f3F22F983;
cvt.rni.s32.f32 %r8509, %f3768;
cvt.rn.f32.s32 %f3769, %r8509;
mov.f32 %f3770, 0fBFC90FDA;
fma.rn.f32 %f3771, %f3769, %f3770, %f5345;
mov.f32 %f3772, 0fB3A22168;
fma.rn.f32 %f3773, %f3769, %f3772, %f3771;
mov.f32 %f3774, 0fA7C234C5;
fma.rn.f32 %f5599, %f3769, %f3774, %f3773;
abs.f32 %f1218, %f5345;
setp.ltu.f32 %p929, %f1218, 0f47CE4780;
@%p929 bra $L__BB0_1098;
setp.eq.f32 %p930, %f1218, 0f7F800000;
@%p930 bra $L__BB0_1097;
bra.uni $L__BB0_1092;
$L__BB0_1097:
mov.f32 %f3777, 0f00000000;
mul.rn.f32 %f5599, %f5345, %f3777;
mov.u32 %r8509, 0;
bra.uni $L__BB0_1098;
$L__BB0_1092:
mov.b32 %r1425, %f5345;
shr.u32 %r5619, %r1425, 23;
and.b32 %r5620, %r5619, 255;
add.s32 %r1426, %r5620, -128;
shl.b32 %r5621, %r1425, 8;
or.b32 %r1427, %r5621, -2147483648;
shr.u32 %r1428, %r1426, 5;
mov.u64 %rd2638, 0;
mov.u32 %r8506, 0;
mov.u64 %rd1745, __cudart_i2opi_f;
mov.u64 %rd2639, %rd2638;
$L__BB0_1093:
.pragma "nounroll";
shl.b64 %rd1744, %rd2638, 2;
add.s64 %rd1746, %rd1745, %rd1744;
ld.global.nc.u32 %r5622, [%rd1746];
mad.wide.u32 %rd1747, %r5622, %r1427, %rd2639;
shr.u64 %rd2639, %rd1747, 32;
add.s64 %rd1748, %rd1, %rd1744;
st.local.u32 [%rd1748], %rd1747;
add.s32 %r8506, %r8506, 1;
cvt.s64.s32 %rd2638, %r8506;
setp.ne.s32 %p931, %r8506, 6;
@%p931 bra $L__BB0_1093;
st.local.u32 [%rd5], %rd2639;
mov.u32 %r5623, 4;
sub.s32 %r1431, %r5623, %r1428;
mov.u32 %r5624, 6;
sub.s32 %r5625, %r5624, %r1428;
mul.wide.s32 %rd1749, %r5625, 4;
add.s64 %rd1750, %rd1, %rd1749;
ld.local.u32 %r8507, [%rd1750];
ld.local.u32 %r8508, [%rd1750+-4];
and.b32 %r1434, %r1426, 31;
setp.eq.s32 %p932, %r1434, 0;
@%p932 bra $L__BB0_1096;
mov.u32 %r5626, 32;
sub.s32 %r5627, %r5626, %r1434;
shr.u32 %r5628, %r8508, %r5627;
shl.b32 %r5629, %r8507, %r1434;
add.s32 %r8507, %r5628, %r5629;
mul.wide.s32 %rd1751, %r1431, 4;
add.s64 %rd1752, %rd1, %rd1751;
ld.local.u32 %r5630, [%rd1752];
shr.u32 %r5631, %r5630, %r5627;
shl.b32 %r5632, %r8508, %r1434;
add.s32 %r8508, %r5631, %r5632;
$L__BB0_1096:
and.b32 %r5633, %r1425, -2147483648;
shr.u32 %r5634, %r8508, 30;
shl.b32 %r5635, %r8507, 2;
or.b32 %r5636, %r5634, %r5635;
shr.u32 %r5637, %r5636, 31;
shr.u32 %r5638, %r8507, 30;
add.s32 %r5639, %r5637, %r5638;
neg.s32 %r5640, %r5639;
setp.eq.s32 %p933, %r5633, 0;
selp.b32 %r8509, %r5639, %r5640, %p933;
setp.ne.s32 %p934, %r5637, 0;
xor.b32 %r5641, %r5633, -2147483648;
selp.b32 %r5642, %r5641, %r5633, %p934;
selp.b32 %r5643, -1, 0, %p934;
xor.b32 %r5644, %r5636, %r5643;
shl.b32 %r5645, %r8508, 2;
xor.b32 %r5646, %r5645, %r5643;
cvt.u64.u32 %rd1753, %r5644;
cvt.u64.u32 %rd1754, %r5646;
bfi.b64 %rd1755, %rd1753, %rd1754, 32, 32;
cvt.rn.f64.s64 %fd141, %rd1755;
mul.f64 %fd142, %fd141, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3775, %fd142;
setp.eq.s32 %p935, %r5642, 0;
neg.f32 %f3776, %f3775;
selp.f32 %f5599, %f3775, %f3776, %p935;
$L__BB0_1098:
and.b32 %r1441, %r8509, 1;
setp.eq.s32 %p936, %r1441, 0;
selp.f32 %f1222, %f5599, 0f3F800000, %p936;
mul.rn.f32 %f1223, %f5599, %f5599;
mov.f32 %f5600, 0fB94D4153;
@%p936 bra $L__BB0_1100;
mov.f32 %f3779, 0fBAB607ED;
mov.f32 %f3780, 0f37CBAC00;
fma.rn.f32 %f5600, %f3780, %f1223, %f3779;
$L__BB0_1100:
selp.f32 %f3781, 0f3C0885E4, 0f3D2AAABB, %p936;
fma.rn.f32 %f3782, %f5600, %f1223, %f3781;
selp.f32 %f3783, 0fBE2AAAA8, 0fBEFFFFFF, %p936;
fma.rn.f32 %f3784, %f3782, %f1223, %f3783;
mov.f32 %f3785, 0f00000000;
fma.rn.f32 %f3786, %f1223, %f1222, %f3785;
fma.rn.f32 %f5213, %f3784, %f3786, %f1222;
and.b32 %r5648, %r8509, 2;
setp.eq.s32 %p938, %r5648, 0;
@%p938 bra $L__BB0_1102;
mov.f32 %f3788, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3788, %f3785;
$L__BB0_1102:
setp.lt.s32 %p19, %r14, %r1423;
@%p928 bra $L__BB0_1115;
mul.f32 %f3789, %f5337, 0f3F22F983;
cvt.rni.s32.f32 %r8513, %f3789;
cvt.rn.f32.s32 %f3790, %r8513;
mov.f32 %f3791, 0fBFC90FDA;
fma.rn.f32 %f3792, %f3790, %f3791, %f5337;
mov.f32 %f3793, 0fB3A22168;
fma.rn.f32 %f3794, %f3790, %f3793, %f3792;
mov.f32 %f3795, 0fA7C234C5;
fma.rn.f32 %f5603, %f3790, %f3795, %f3794;
abs.f32 %f1231, %f5337;
setp.ltu.f32 %p940, %f1231, 0f47CE4780;
@%p940 bra $L__BB0_1111;
setp.eq.f32 %p941, %f1231, 0f7F800000;
@%p941 bra $L__BB0_1110;
bra.uni $L__BB0_1105;
$L__BB0_1110:
mov.f32 %f3798, 0f00000000;
mul.rn.f32 %f5603, %f5337, %f3798;
mov.u32 %r8513, 0;
bra.uni $L__BB0_1111;
$L__BB0_1105:
mov.b32 %r1443, %f5337;
shr.u32 %r5650, %r1443, 23;
and.b32 %r5651, %r5650, 255;
add.s32 %r1444, %r5651, -128;
shl.b32 %r5652, %r1443, 8;
or.b32 %r1445, %r5652, -2147483648;
shr.u32 %r1446, %r1444, 5;
mov.u64 %rd2640, 0;
mov.u32 %r8510, 0;
mov.u64 %rd1759, __cudart_i2opi_f;
mov.u64 %rd2641, %rd2640;
$L__BB0_1106:
.pragma "nounroll";
shl.b64 %rd1758, %rd2640, 2;
add.s64 %rd1760, %rd1759, %rd1758;
ld.global.nc.u32 %r5653, [%rd1760];
mad.wide.u32 %rd1761, %r5653, %r1445, %rd2641;
shr.u64 %rd2641, %rd1761, 32;
add.s64 %rd1762, %rd1, %rd1758;
st.local.u32 [%rd1762], %rd1761;
add.s32 %r8510, %r8510, 1;
cvt.s64.s32 %rd2640, %r8510;
setp.ne.s32 %p942, %r8510, 6;
@%p942 bra $L__BB0_1106;
st.local.u32 [%rd5], %rd2641;
mov.u32 %r5654, 4;
sub.s32 %r1449, %r5654, %r1446;
mov.u32 %r5655, 6;
sub.s32 %r5656, %r5655, %r1446;
mul.wide.s32 %rd1763, %r5656, 4;
add.s64 %rd1764, %rd1, %rd1763;
ld.local.u32 %r8511, [%rd1764];
ld.local.u32 %r8512, [%rd1764+-4];
and.b32 %r1452, %r1444, 31;
setp.eq.s32 %p943, %r1452, 0;
@%p943 bra $L__BB0_1109;
mov.u32 %r5657, 32;
sub.s32 %r5658, %r5657, %r1452;
shr.u32 %r5659, %r8512, %r5658;
shl.b32 %r5660, %r8511, %r1452;
add.s32 %r8511, %r5659, %r5660;
mul.wide.s32 %rd1765, %r1449, 4;
add.s64 %rd1766, %rd1, %rd1765;
ld.local.u32 %r5661, [%rd1766];
shr.u32 %r5662, %r5661, %r5658;
shl.b32 %r5663, %r8512, %r1452;
add.s32 %r8512, %r5662, %r5663;
$L__BB0_1109:
and.b32 %r5664, %r1443, -2147483648;
shr.u32 %r5665, %r8512, 30;
shl.b32 %r5666, %r8511, 2;
or.b32 %r5667, %r5665, %r5666;
shr.u32 %r5668, %r5667, 31;
shr.u32 %r5669, %r8511, 30;
add.s32 %r5670, %r5668, %r5669;
neg.s32 %r5671, %r5670;
setp.eq.s32 %p944, %r5664, 0;
selp.b32 %r8513, %r5670, %r5671, %p944;
setp.ne.s32 %p945, %r5668, 0;
xor.b32 %r5672, %r5664, -2147483648;
selp.b32 %r5673, %r5672, %r5664, %p945;
selp.b32 %r5674, -1, 0, %p945;
xor.b32 %r5675, %r5667, %r5674;
shl.b32 %r5676, %r8512, 2;
xor.b32 %r5677, %r5676, %r5674;
cvt.u64.u32 %rd1767, %r5675;
cvt.u64.u32 %rd1768, %r5677;
bfi.b64 %rd1769, %rd1767, %rd1768, 32, 32;
cvt.rn.f64.s64 %fd143, %rd1769;
mul.f64 %fd144, %fd143, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3796, %fd144;
setp.eq.s32 %p946, %r5673, 0;
neg.f32 %f3797, %f3796;
selp.f32 %f5603, %f3796, %f3797, %p946;
$L__BB0_1111:
add.s32 %r1459, %r8513, 1;
and.b32 %r1460, %r1459, 1;
setp.eq.s32 %p947, %r1460, 0;
selp.f32 %f1235, %f5603, 0f3F800000, %p947;
mul.rn.f32 %f1236, %f5603, %f5603;
mov.f32 %f5604, 0fB94D4153;
@%p947 bra $L__BB0_1113;
mov.f32 %f3800, 0fBAB607ED;
mov.f32 %f3801, 0f37CBAC00;
fma.rn.f32 %f5604, %f3801, %f1236, %f3800;
$L__BB0_1113:
selp.f32 %f3802, 0f3C0885E4, 0f3D2AAABB, %p947;
fma.rn.f32 %f3803, %f5604, %f1236, %f3802;
selp.f32 %f3804, 0fBE2AAAA8, 0fBEFFFFFF, %p947;
fma.rn.f32 %f3805, %f3803, %f1236, %f3804;
mov.f32 %f3806, 0f00000000;
fma.rn.f32 %f3807, %f1236, %f1235, %f3806;
fma.rn.f32 %f5215, %f3805, %f3807, %f1235;
and.b32 %r5679, %r1459, 2;
setp.eq.s32 %p949, %r5679, 0;
@%p949 bra $L__BB0_1115;
mov.f32 %f3809, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3809, %f3806;
$L__BB0_1115:
selp.f32 %f1243, %f5215, %f5216, %p19;
selp.f32 %f1244, %f5213, %f5214, %p19;
@%p928 bra $L__BB0_1117;
add.f32 %f5711, %f1244, %f1243;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1117:
@%p828 bra $L__BB0_1146;
shl.b32 %r5681, %r12, 5;
neg.s32 %r1461, %r5681;
setp.ge.s32 %p953, %r14, %r1461;
@%p953 bra $L__BB0_1131;
mul.f32 %f3812, %f5344, 0f3F22F983;
cvt.rni.s32.f32 %r8517, %f3812;
cvt.rn.f32.s32 %f3813, %r8517;
mov.f32 %f3814, 0fBFC90FDA;
fma.rn.f32 %f3815, %f3813, %f3814, %f5344;
mov.f32 %f3816, 0fB3A22168;
fma.rn.f32 %f3817, %f3813, %f3816, %f3815;
mov.f32 %f3818, 0fA7C234C5;
fma.rn.f32 %f5612, %f3813, %f3818, %f3817;
abs.f32 %f1252, %f5344;
setp.ltu.f32 %p954, %f1252, 0f47CE4780;
@%p954 bra $L__BB0_1127;
setp.eq.f32 %p955, %f1252, 0f7F800000;
@%p955 bra $L__BB0_1126;
bra.uni $L__BB0_1121;
$L__BB0_1126:
mov.f32 %f3821, 0f00000000;
mul.rn.f32 %f5612, %f5344, %f3821;
mov.u32 %r8517, 0;
bra.uni $L__BB0_1127;
$L__BB0_1121:
mov.b32 %r1463, %f5344;
shr.u32 %r5683, %r1463, 23;
and.b32 %r5684, %r5683, 255;
add.s32 %r1464, %r5684, -128;
shl.b32 %r5685, %r1463, 8;
or.b32 %r1465, %r5685, -2147483648;
shr.u32 %r1466, %r1464, 5;
mov.u64 %rd2642, 0;
mov.u32 %r8514, 0;
mov.u64 %rd1773, __cudart_i2opi_f;
mov.u64 %rd2643, %rd2642;
$L__BB0_1122:
.pragma "nounroll";
shl.b64 %rd1772, %rd2642, 2;
add.s64 %rd1774, %rd1773, %rd1772;
ld.global.nc.u32 %r5686, [%rd1774];
mad.wide.u32 %rd1775, %r5686, %r1465, %rd2643;
shr.u64 %rd2643, %rd1775, 32;
add.s64 %rd1776, %rd1, %rd1772;
st.local.u32 [%rd1776], %rd1775;
add.s32 %r8514, %r8514, 1;
cvt.s64.s32 %rd2642, %r8514;
setp.ne.s32 %p956, %r8514, 6;
@%p956 bra $L__BB0_1122;
st.local.u32 [%rd5], %rd2643;
mov.u32 %r5687, 4;
sub.s32 %r1469, %r5687, %r1466;
mov.u32 %r5688, 6;
sub.s32 %r5689, %r5688, %r1466;
mul.wide.s32 %rd1777, %r5689, 4;
add.s64 %rd1778, %rd1, %rd1777;
ld.local.u32 %r8515, [%rd1778];
ld.local.u32 %r8516, [%rd1778+-4];
and.b32 %r1472, %r1464, 31;
setp.eq.s32 %p957, %r1472, 0;
@%p957 bra $L__BB0_1125;
mov.u32 %r5690, 32;
sub.s32 %r5691, %r5690, %r1472;
shr.u32 %r5692, %r8516, %r5691;
shl.b32 %r5693, %r8515, %r1472;
add.s32 %r8515, %r5692, %r5693;
mul.wide.s32 %rd1779, %r1469, 4;
add.s64 %rd1780, %rd1, %rd1779;
ld.local.u32 %r5694, [%rd1780];
shr.u32 %r5695, %r5694, %r5691;
shl.b32 %r5696, %r8516, %r1472;
add.s32 %r8516, %r5695, %r5696;
$L__BB0_1125:
and.b32 %r5697, %r1463, -2147483648;
shr.u32 %r5698, %r8516, 30;
shl.b32 %r5699, %r8515, 2;
or.b32 %r5700, %r5698, %r5699;
shr.u32 %r5701, %r5700, 31;
shr.u32 %r5702, %r8515, 30;
add.s32 %r5703, %r5701, %r5702;
neg.s32 %r5704, %r5703;
setp.eq.s32 %p958, %r5697, 0;
selp.b32 %r8517, %r5703, %r5704, %p958;
setp.ne.s32 %p959, %r5701, 0;
xor.b32 %r5705, %r5697, -2147483648;
selp.b32 %r5706, %r5705, %r5697, %p959;
selp.b32 %r5707, -1, 0, %p959;
xor.b32 %r5708, %r5700, %r5707;
shl.b32 %r5709, %r8516, 2;
xor.b32 %r5710, %r5709, %r5707;
cvt.u64.u32 %rd1781, %r5708;
cvt.u64.u32 %rd1782, %r5710;
bfi.b64 %rd1783, %rd1781, %rd1782, 32, 32;
cvt.rn.f64.s64 %fd145, %rd1783;
mul.f64 %fd146, %fd145, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3819, %fd146;
setp.eq.s32 %p960, %r5706, 0;
neg.f32 %f3820, %f3819;
selp.f32 %f5612, %f3819, %f3820, %p960;
$L__BB0_1127:
and.b32 %r1479, %r8517, 1;
setp.eq.s32 %p961, %r1479, 0;
selp.f32 %f1256, %f5612, 0f3F800000, %p961;
mul.rn.f32 %f1257, %f5612, %f5612;
mov.f32 %f5613, 0fB94D4153;
@%p961 bra $L__BB0_1129;
mov.f32 %f3823, 0fBAB607ED;
mov.f32 %f3824, 0f37CBAC00;
fma.rn.f32 %f5613, %f3824, %f1257, %f3823;
$L__BB0_1129:
selp.f32 %f3825, 0f3C0885E4, 0f3D2AAABB, %p961;
fma.rn.f32 %f3826, %f5613, %f1257, %f3825;
selp.f32 %f3827, 0fBE2AAAA8, 0fBEFFFFFF, %p961;
fma.rn.f32 %f3828, %f3826, %f1257, %f3827;
mov.f32 %f3829, 0f00000000;
fma.rn.f32 %f3830, %f1257, %f1256, %f3829;
fma.rn.f32 %f5213, %f3828, %f3830, %f1256;
and.b32 %r5712, %r8517, 2;
setp.eq.s32 %p963, %r5712, 0;
@%p963 bra $L__BB0_1131;
mov.f32 %f3832, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3832, %f3829;
$L__BB0_1131:
setp.lt.s32 %p20, %r14, %r1461;
@%p953 bra $L__BB0_1144;
mul.f32 %f3833, %f5336, 0f3F22F983;
cvt.rni.s32.f32 %r8521, %f3833;
cvt.rn.f32.s32 %f3834, %r8521;
mov.f32 %f3835, 0fBFC90FDA;
fma.rn.f32 %f3836, %f3834, %f3835, %f5336;
mov.f32 %f3837, 0fB3A22168;
fma.rn.f32 %f3838, %f3834, %f3837, %f3836;
mov.f32 %f3839, 0fA7C234C5;
fma.rn.f32 %f5616, %f3834, %f3839, %f3838;
abs.f32 %f1265, %f5336;
setp.ltu.f32 %p965, %f1265, 0f47CE4780;
@%p965 bra $L__BB0_1140;
setp.eq.f32 %p966, %f1265, 0f7F800000;
@%p966 bra $L__BB0_1139;
bra.uni $L__BB0_1134;
$L__BB0_1139:
mov.f32 %f3842, 0f00000000;
mul.rn.f32 %f5616, %f5336, %f3842;
mov.u32 %r8521, 0;
bra.uni $L__BB0_1140;
$L__BB0_1134:
mov.b32 %r1481, %f5336;
shr.u32 %r5714, %r1481, 23;
and.b32 %r5715, %r5714, 255;
add.s32 %r1482, %r5715, -128;
shl.b32 %r5716, %r1481, 8;
or.b32 %r1483, %r5716, -2147483648;
shr.u32 %r1484, %r1482, 5;
mov.u64 %rd2644, 0;
mov.u32 %r8518, 0;
mov.u64 %rd1787, __cudart_i2opi_f;
mov.u64 %rd2645, %rd2644;
$L__BB0_1135:
.pragma "nounroll";
shl.b64 %rd1786, %rd2644, 2;
add.s64 %rd1788, %rd1787, %rd1786;
ld.global.nc.u32 %r5717, [%rd1788];
mad.wide.u32 %rd1789, %r5717, %r1483, %rd2645;
shr.u64 %rd2645, %rd1789, 32;
add.s64 %rd1790, %rd1, %rd1786;
st.local.u32 [%rd1790], %rd1789;
add.s32 %r8518, %r8518, 1;
cvt.s64.s32 %rd2644, %r8518;
setp.ne.s32 %p967, %r8518, 6;
@%p967 bra $L__BB0_1135;
st.local.u32 [%rd5], %rd2645;
mov.u32 %r5718, 4;
sub.s32 %r1487, %r5718, %r1484;
mov.u32 %r5719, 6;
sub.s32 %r5720, %r5719, %r1484;
mul.wide.s32 %rd1791, %r5720, 4;
add.s64 %rd1792, %rd1, %rd1791;
ld.local.u32 %r8519, [%rd1792];
ld.local.u32 %r8520, [%rd1792+-4];
and.b32 %r1490, %r1482, 31;
setp.eq.s32 %p968, %r1490, 0;
@%p968 bra $L__BB0_1138;
mov.u32 %r5721, 32;
sub.s32 %r5722, %r5721, %r1490;
shr.u32 %r5723, %r8520, %r5722;
shl.b32 %r5724, %r8519, %r1490;
add.s32 %r8519, %r5723, %r5724;
mul.wide.s32 %rd1793, %r1487, 4;
add.s64 %rd1794, %rd1, %rd1793;
ld.local.u32 %r5725, [%rd1794];
shr.u32 %r5726, %r5725, %r5722;
shl.b32 %r5727, %r8520, %r1490;
add.s32 %r8520, %r5726, %r5727;
$L__BB0_1138:
and.b32 %r5728, %r1481, -2147483648;
shr.u32 %r5729, %r8520, 30;
shl.b32 %r5730, %r8519, 2;
or.b32 %r5731, %r5729, %r5730;
shr.u32 %r5732, %r5731, 31;
shr.u32 %r5733, %r8519, 30;
add.s32 %r5734, %r5732, %r5733;
neg.s32 %r5735, %r5734;
setp.eq.s32 %p969, %r5728, 0;
selp.b32 %r8521, %r5734, %r5735, %p969;
setp.ne.s32 %p970, %r5732, 0;
xor.b32 %r5736, %r5728, -2147483648;
selp.b32 %r5737, %r5736, %r5728, %p970;
selp.b32 %r5738, -1, 0, %p970;
xor.b32 %r5739, %r5731, %r5738;
shl.b32 %r5740, %r8520, 2;
xor.b32 %r5741, %r5740, %r5738;
cvt.u64.u32 %rd1795, %r5739;
cvt.u64.u32 %rd1796, %r5741;
bfi.b64 %rd1797, %rd1795, %rd1796, 32, 32;
cvt.rn.f64.s64 %fd147, %rd1797;
mul.f64 %fd148, %fd147, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3840, %fd148;
setp.eq.s32 %p971, %r5737, 0;
neg.f32 %f3841, %f3840;
selp.f32 %f5616, %f3840, %f3841, %p971;
$L__BB0_1140:
add.s32 %r1497, %r8521, 1;
and.b32 %r1498, %r1497, 1;
setp.eq.s32 %p972, %r1498, 0;
selp.f32 %f1269, %f5616, 0f3F800000, %p972;
mul.rn.f32 %f1270, %f5616, %f5616;
mov.f32 %f5617, 0fB94D4153;
@%p972 bra $L__BB0_1142;
mov.f32 %f3844, 0fBAB607ED;
mov.f32 %f3845, 0f37CBAC00;
fma.rn.f32 %f5617, %f3845, %f1270, %f3844;
$L__BB0_1142:
selp.f32 %f3846, 0f3C0885E4, 0f3D2AAABB, %p972;
fma.rn.f32 %f3847, %f5617, %f1270, %f3846;
selp.f32 %f3848, 0fBE2AAAA8, 0fBEFFFFFF, %p972;
fma.rn.f32 %f3849, %f3847, %f1270, %f3848;
mov.f32 %f3850, 0f00000000;
fma.rn.f32 %f3851, %f1270, %f1269, %f3850;
fma.rn.f32 %f5215, %f3849, %f3851, %f1269;
and.b32 %r5743, %r1497, 2;
setp.eq.s32 %p974, %r5743, 0;
@%p974 bra $L__BB0_1144;
mov.f32 %f3853, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3853, %f3850;
$L__BB0_1144:
selp.f32 %f1277, %f5215, %f5216, %p20;
selp.f32 %f1278, %f5213, %f5214, %p20;
@%p953 bra $L__BB0_1146;
add.f32 %f5710, %f1278, %f1277;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1146:
@%p828 bra $L__BB0_1175;
shl.b32 %r5745, %r12, 5;
mov.u32 %r5746, -32;
sub.s32 %r1499, %r5746, %r5745;
setp.ge.s32 %p978, %r14, %r1499;
@%p978 bra $L__BB0_1160;
mul.f32 %f3856, %f5343, 0f3F22F983;
cvt.rni.s32.f32 %r8525, %f3856;
cvt.rn.f32.s32 %f3857, %r8525;
mov.f32 %f3858, 0fBFC90FDA;
fma.rn.f32 %f3859, %f3857, %f3858, %f5343;
mov.f32 %f3860, 0fB3A22168;
fma.rn.f32 %f3861, %f3857, %f3860, %f3859;
mov.f32 %f3862, 0fA7C234C5;
fma.rn.f32 %f5625, %f3857, %f3862, %f3861;
abs.f32 %f1286, %f5343;
setp.ltu.f32 %p979, %f1286, 0f47CE4780;
@%p979 bra $L__BB0_1156;
setp.eq.f32 %p980, %f1286, 0f7F800000;
@%p980 bra $L__BB0_1155;
bra.uni $L__BB0_1150;
$L__BB0_1155:
mov.f32 %f3865, 0f00000000;
mul.rn.f32 %f5625, %f5343, %f3865;
mov.u32 %r8525, 0;
bra.uni $L__BB0_1156;
$L__BB0_1150:
mov.b32 %r1501, %f5343;
shr.u32 %r5748, %r1501, 23;
and.b32 %r5749, %r5748, 255;
add.s32 %r1502, %r5749, -128;
shl.b32 %r5750, %r1501, 8;
or.b32 %r1503, %r5750, -2147483648;
shr.u32 %r1504, %r1502, 5;
mov.u64 %rd2646, 0;
mov.u32 %r8522, 0;
mov.u64 %rd1801, __cudart_i2opi_f;
mov.u64 %rd2647, %rd2646;
$L__BB0_1151:
.pragma "nounroll";
shl.b64 %rd1800, %rd2646, 2;
add.s64 %rd1802, %rd1801, %rd1800;
ld.global.nc.u32 %r5751, [%rd1802];
mad.wide.u32 %rd1803, %r5751, %r1503, %rd2647;
shr.u64 %rd2647, %rd1803, 32;
add.s64 %rd1804, %rd1, %rd1800;
st.local.u32 [%rd1804], %rd1803;
add.s32 %r8522, %r8522, 1;
cvt.s64.s32 %rd2646, %r8522;
setp.ne.s32 %p981, %r8522, 6;
@%p981 bra $L__BB0_1151;
st.local.u32 [%rd5], %rd2647;
mov.u32 %r5752, 4;
sub.s32 %r1507, %r5752, %r1504;
mov.u32 %r5753, 6;
sub.s32 %r5754, %r5753, %r1504;
mul.wide.s32 %rd1805, %r5754, 4;
add.s64 %rd1806, %rd1, %rd1805;
ld.local.u32 %r8523, [%rd1806];
ld.local.u32 %r8524, [%rd1806+-4];
and.b32 %r1510, %r1502, 31;
setp.eq.s32 %p982, %r1510, 0;
@%p982 bra $L__BB0_1154;
mov.u32 %r5755, 32;
sub.s32 %r5756, %r5755, %r1510;
shr.u32 %r5757, %r8524, %r5756;
shl.b32 %r5758, %r8523, %r1510;
add.s32 %r8523, %r5757, %r5758;
mul.wide.s32 %rd1807, %r1507, 4;
add.s64 %rd1808, %rd1, %rd1807;
ld.local.u32 %r5759, [%rd1808];
shr.u32 %r5760, %r5759, %r5756;
shl.b32 %r5761, %r8524, %r1510;
add.s32 %r8524, %r5760, %r5761;
$L__BB0_1154:
and.b32 %r5762, %r1501, -2147483648;
shr.u32 %r5763, %r8524, 30;
shl.b32 %r5764, %r8523, 2;
or.b32 %r5765, %r5763, %r5764;
shr.u32 %r5766, %r5765, 31;
shr.u32 %r5767, %r8523, 30;
add.s32 %r5768, %r5766, %r5767;
neg.s32 %r5769, %r5768;
setp.eq.s32 %p983, %r5762, 0;
selp.b32 %r8525, %r5768, %r5769, %p983;
setp.ne.s32 %p984, %r5766, 0;
xor.b32 %r5770, %r5762, -2147483648;
selp.b32 %r5771, %r5770, %r5762, %p984;
selp.b32 %r5772, -1, 0, %p984;
xor.b32 %r5773, %r5765, %r5772;
shl.b32 %r5774, %r8524, 2;
xor.b32 %r5775, %r5774, %r5772;
cvt.u64.u32 %rd1809, %r5773;
cvt.u64.u32 %rd1810, %r5775;
bfi.b64 %rd1811, %rd1809, %rd1810, 32, 32;
cvt.rn.f64.s64 %fd149, %rd1811;
mul.f64 %fd150, %fd149, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3863, %fd150;
setp.eq.s32 %p985, %r5771, 0;
neg.f32 %f3864, %f3863;
selp.f32 %f5625, %f3863, %f3864, %p985;
$L__BB0_1156:
and.b32 %r1517, %r8525, 1;
setp.eq.s32 %p986, %r1517, 0;
selp.f32 %f1290, %f5625, 0f3F800000, %p986;
mul.rn.f32 %f1291, %f5625, %f5625;
mov.f32 %f5626, 0fB94D4153;
@%p986 bra $L__BB0_1158;
mov.f32 %f3867, 0fBAB607ED;
mov.f32 %f3868, 0f37CBAC00;
fma.rn.f32 %f5626, %f3868, %f1291, %f3867;
$L__BB0_1158:
selp.f32 %f3869, 0f3C0885E4, 0f3D2AAABB, %p986;
fma.rn.f32 %f3870, %f5626, %f1291, %f3869;
selp.f32 %f3871, 0fBE2AAAA8, 0fBEFFFFFF, %p986;
fma.rn.f32 %f3872, %f3870, %f1291, %f3871;
mov.f32 %f3873, 0f00000000;
fma.rn.f32 %f3874, %f1291, %f1290, %f3873;
fma.rn.f32 %f5213, %f3872, %f3874, %f1290;
and.b32 %r5777, %r8525, 2;
setp.eq.s32 %p988, %r5777, 0;
@%p988 bra $L__BB0_1160;
mov.f32 %f3876, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3876, %f3873;
$L__BB0_1160:
setp.lt.s32 %p21, %r14, %r1499;
@%p978 bra $L__BB0_1173;
mul.f32 %f3877, %f5335, 0f3F22F983;
cvt.rni.s32.f32 %r8529, %f3877;
cvt.rn.f32.s32 %f3878, %r8529;
mov.f32 %f3879, 0fBFC90FDA;
fma.rn.f32 %f3880, %f3878, %f3879, %f5335;
mov.f32 %f3881, 0fB3A22168;
fma.rn.f32 %f3882, %f3878, %f3881, %f3880;
mov.f32 %f3883, 0fA7C234C5;
fma.rn.f32 %f5629, %f3878, %f3883, %f3882;
abs.f32 %f1299, %f5335;
setp.ltu.f32 %p990, %f1299, 0f47CE4780;
@%p990 bra $L__BB0_1169;
setp.eq.f32 %p991, %f1299, 0f7F800000;
@%p991 bra $L__BB0_1168;
bra.uni $L__BB0_1163;
$L__BB0_1168:
mov.f32 %f3886, 0f00000000;
mul.rn.f32 %f5629, %f5335, %f3886;
mov.u32 %r8529, 0;
bra.uni $L__BB0_1169;
$L__BB0_1163:
mov.b32 %r1519, %f5335;
shr.u32 %r5779, %r1519, 23;
and.b32 %r5780, %r5779, 255;
add.s32 %r1520, %r5780, -128;
shl.b32 %r5781, %r1519, 8;
or.b32 %r1521, %r5781, -2147483648;
shr.u32 %r1522, %r1520, 5;
mov.u64 %rd2648, 0;
mov.u32 %r8526, 0;
mov.u64 %rd1815, __cudart_i2opi_f;
mov.u64 %rd2649, %rd2648;
$L__BB0_1164:
.pragma "nounroll";
shl.b64 %rd1814, %rd2648, 2;
add.s64 %rd1816, %rd1815, %rd1814;
ld.global.nc.u32 %r5782, [%rd1816];
mad.wide.u32 %rd1817, %r5782, %r1521, %rd2649;
shr.u64 %rd2649, %rd1817, 32;
add.s64 %rd1818, %rd1, %rd1814;
st.local.u32 [%rd1818], %rd1817;
add.s32 %r8526, %r8526, 1;
cvt.s64.s32 %rd2648, %r8526;
setp.ne.s32 %p992, %r8526, 6;
@%p992 bra $L__BB0_1164;
st.local.u32 [%rd5], %rd2649;
mov.u32 %r5783, 4;
sub.s32 %r1525, %r5783, %r1522;
mov.u32 %r5784, 6;
sub.s32 %r5785, %r5784, %r1522;
mul.wide.s32 %rd1819, %r5785, 4;
add.s64 %rd1820, %rd1, %rd1819;
ld.local.u32 %r8527, [%rd1820];
ld.local.u32 %r8528, [%rd1820+-4];
and.b32 %r1528, %r1520, 31;
setp.eq.s32 %p993, %r1528, 0;
@%p993 bra $L__BB0_1167;
mov.u32 %r5786, 32;
sub.s32 %r5787, %r5786, %r1528;
shr.u32 %r5788, %r8528, %r5787;
shl.b32 %r5789, %r8527, %r1528;
add.s32 %r8527, %r5788, %r5789;
mul.wide.s32 %rd1821, %r1525, 4;
add.s64 %rd1822, %rd1, %rd1821;
ld.local.u32 %r5790, [%rd1822];
shr.u32 %r5791, %r5790, %r5787;
shl.b32 %r5792, %r8528, %r1528;
add.s32 %r8528, %r5791, %r5792;
$L__BB0_1167:
and.b32 %r5793, %r1519, -2147483648;
shr.u32 %r5794, %r8528, 30;
shl.b32 %r5795, %r8527, 2;
or.b32 %r5796, %r5794, %r5795;
shr.u32 %r5797, %r5796, 31;
shr.u32 %r5798, %r8527, 30;
add.s32 %r5799, %r5797, %r5798;
neg.s32 %r5800, %r5799;
setp.eq.s32 %p994, %r5793, 0;
selp.b32 %r8529, %r5799, %r5800, %p994;
setp.ne.s32 %p995, %r5797, 0;
xor.b32 %r5801, %r5793, -2147483648;
selp.b32 %r5802, %r5801, %r5793, %p995;
selp.b32 %r5803, -1, 0, %p995;
xor.b32 %r5804, %r5796, %r5803;
shl.b32 %r5805, %r8528, 2;
xor.b32 %r5806, %r5805, %r5803;
cvt.u64.u32 %rd1823, %r5804;
cvt.u64.u32 %rd1824, %r5806;
bfi.b64 %rd1825, %rd1823, %rd1824, 32, 32;
cvt.rn.f64.s64 %fd151, %rd1825;
mul.f64 %fd152, %fd151, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3884, %fd152;
setp.eq.s32 %p996, %r5802, 0;
neg.f32 %f3885, %f3884;
selp.f32 %f5629, %f3884, %f3885, %p996;
$L__BB0_1169:
add.s32 %r1535, %r8529, 1;
and.b32 %r1536, %r1535, 1;
setp.eq.s32 %p997, %r1536, 0;
selp.f32 %f1303, %f5629, 0f3F800000, %p997;
mul.rn.f32 %f1304, %f5629, %f5629;
mov.f32 %f5630, 0fB94D4153;
@%p997 bra $L__BB0_1171;
mov.f32 %f3888, 0fBAB607ED;
mov.f32 %f3889, 0f37CBAC00;
fma.rn.f32 %f5630, %f3889, %f1304, %f3888;
$L__BB0_1171:
selp.f32 %f3890, 0f3C0885E4, 0f3D2AAABB, %p997;
fma.rn.f32 %f3891, %f5630, %f1304, %f3890;
selp.f32 %f3892, 0fBE2AAAA8, 0fBEFFFFFF, %p997;
fma.rn.f32 %f3893, %f3891, %f1304, %f3892;
mov.f32 %f3894, 0f00000000;
fma.rn.f32 %f3895, %f1304, %f1303, %f3894;
fma.rn.f32 %f5215, %f3893, %f3895, %f1303;
and.b32 %r5808, %r1535, 2;
setp.eq.s32 %p999, %r5808, 0;
@%p999 bra $L__BB0_1173;
mov.f32 %f3897, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3897, %f3894;
$L__BB0_1173:
selp.f32 %f1311, %f5215, %f5216, %p21;
selp.f32 %f1312, %f5213, %f5214, %p21;
@%p978 bra $L__BB0_1175;
add.f32 %f5709, %f1312, %f1311;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1175:
@%p832 bra $L__BB0_1204;
shl.b32 %r5810, %r12, 5;
neg.s32 %r1537, %r5810;
setp.ge.s32 %p1003, %r14, %r1537;
@%p1003 bra $L__BB0_1189;
mul.f32 %f3900, %f5342, 0f3F22F983;
cvt.rni.s32.f32 %r8533, %f3900;
cvt.rn.f32.s32 %f3901, %r8533;
mov.f32 %f3902, 0fBFC90FDA;
fma.rn.f32 %f3903, %f3901, %f3902, %f5342;
mov.f32 %f3904, 0fB3A22168;
fma.rn.f32 %f3905, %f3901, %f3904, %f3903;
mov.f32 %f3906, 0fA7C234C5;
fma.rn.f32 %f5638, %f3901, %f3906, %f3905;
abs.f32 %f1320, %f5342;
setp.ltu.f32 %p1004, %f1320, 0f47CE4780;
@%p1004 bra $L__BB0_1185;
setp.eq.f32 %p1005, %f1320, 0f7F800000;
@%p1005 bra $L__BB0_1184;
bra.uni $L__BB0_1179;
$L__BB0_1184:
mov.f32 %f3909, 0f00000000;
mul.rn.f32 %f5638, %f5342, %f3909;
mov.u32 %r8533, 0;
bra.uni $L__BB0_1185;
$L__BB0_1179:
mov.b32 %r1539, %f5342;
shr.u32 %r5812, %r1539, 23;
and.b32 %r5813, %r5812, 255;
add.s32 %r1540, %r5813, -128;
shl.b32 %r5814, %r1539, 8;
or.b32 %r1541, %r5814, -2147483648;
shr.u32 %r1542, %r1540, 5;
mov.u64 %rd2650, 0;
mov.u32 %r8530, 0;
mov.u64 %rd1829, __cudart_i2opi_f;
mov.u64 %rd2651, %rd2650;
$L__BB0_1180:
.pragma "nounroll";
shl.b64 %rd1828, %rd2650, 2;
add.s64 %rd1830, %rd1829, %rd1828;
ld.global.nc.u32 %r5815, [%rd1830];
mad.wide.u32 %rd1831, %r5815, %r1541, %rd2651;
shr.u64 %rd2651, %rd1831, 32;
add.s64 %rd1832, %rd1, %rd1828;
st.local.u32 [%rd1832], %rd1831;
add.s32 %r8530, %r8530, 1;
cvt.s64.s32 %rd2650, %r8530;
setp.ne.s32 %p1006, %r8530, 6;
@%p1006 bra $L__BB0_1180;
st.local.u32 [%rd5], %rd2651;
mov.u32 %r5816, 4;
sub.s32 %r1545, %r5816, %r1542;
mov.u32 %r5817, 6;
sub.s32 %r5818, %r5817, %r1542;
mul.wide.s32 %rd1833, %r5818, 4;
add.s64 %rd1834, %rd1, %rd1833;
ld.local.u32 %r8531, [%rd1834];
ld.local.u32 %r8532, [%rd1834+-4];
and.b32 %r1548, %r1540, 31;
setp.eq.s32 %p1007, %r1548, 0;
@%p1007 bra $L__BB0_1183;
mov.u32 %r5819, 32;
sub.s32 %r5820, %r5819, %r1548;
shr.u32 %r5821, %r8532, %r5820;
shl.b32 %r5822, %r8531, %r1548;
add.s32 %r8531, %r5821, %r5822;
mul.wide.s32 %rd1835, %r1545, 4;
add.s64 %rd1836, %rd1, %rd1835;
ld.local.u32 %r5823, [%rd1836];
shr.u32 %r5824, %r5823, %r5820;
shl.b32 %r5825, %r8532, %r1548;
add.s32 %r8532, %r5824, %r5825;
$L__BB0_1183:
and.b32 %r5826, %r1539, -2147483648;
shr.u32 %r5827, %r8532, 30;
shl.b32 %r5828, %r8531, 2;
or.b32 %r5829, %r5827, %r5828;
shr.u32 %r5830, %r5829, 31;
shr.u32 %r5831, %r8531, 30;
add.s32 %r5832, %r5830, %r5831;
neg.s32 %r5833, %r5832;
setp.eq.s32 %p1008, %r5826, 0;
selp.b32 %r8533, %r5832, %r5833, %p1008;
setp.ne.s32 %p1009, %r5830, 0;
xor.b32 %r5834, %r5826, -2147483648;
selp.b32 %r5835, %r5834, %r5826, %p1009;
selp.b32 %r5836, -1, 0, %p1009;
xor.b32 %r5837, %r5829, %r5836;
shl.b32 %r5838, %r8532, 2;
xor.b32 %r5839, %r5838, %r5836;
cvt.u64.u32 %rd1837, %r5837;
cvt.u64.u32 %rd1838, %r5839;
bfi.b64 %rd1839, %rd1837, %rd1838, 32, 32;
cvt.rn.f64.s64 %fd153, %rd1839;
mul.f64 %fd154, %fd153, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3907, %fd154;
setp.eq.s32 %p1010, %r5835, 0;
neg.f32 %f3908, %f3907;
selp.f32 %f5638, %f3907, %f3908, %p1010;
$L__BB0_1185:
and.b32 %r1555, %r8533, 1;
setp.eq.s32 %p1011, %r1555, 0;
selp.f32 %f1324, %f5638, 0f3F800000, %p1011;
mul.rn.f32 %f1325, %f5638, %f5638;
mov.f32 %f5639, 0fB94D4153;
@%p1011 bra $L__BB0_1187;
mov.f32 %f3911, 0fBAB607ED;
mov.f32 %f3912, 0f37CBAC00;
fma.rn.f32 %f5639, %f3912, %f1325, %f3911;
$L__BB0_1187:
selp.f32 %f3913, 0f3C0885E4, 0f3D2AAABB, %p1011;
fma.rn.f32 %f3914, %f5639, %f1325, %f3913;
selp.f32 %f3915, 0fBE2AAAA8, 0fBEFFFFFF, %p1011;
fma.rn.f32 %f3916, %f3914, %f1325, %f3915;
mov.f32 %f3917, 0f00000000;
fma.rn.f32 %f3918, %f1325, %f1324, %f3917;
fma.rn.f32 %f5213, %f3916, %f3918, %f1324;
and.b32 %r5841, %r8533, 2;
setp.eq.s32 %p1013, %r5841, 0;
@%p1013 bra $L__BB0_1189;
mov.f32 %f3920, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3920, %f3917;
$L__BB0_1189:
setp.lt.s32 %p22, %r14, %r1537;
@%p1003 bra $L__BB0_1202;
mul.f32 %f3921, %f5334, 0f3F22F983;
cvt.rni.s32.f32 %r8537, %f3921;
cvt.rn.f32.s32 %f3922, %r8537;
mov.f32 %f3923, 0fBFC90FDA;
fma.rn.f32 %f3924, %f3922, %f3923, %f5334;
mov.f32 %f3925, 0fB3A22168;
fma.rn.f32 %f3926, %f3922, %f3925, %f3924;
mov.f32 %f3927, 0fA7C234C5;
fma.rn.f32 %f5642, %f3922, %f3927, %f3926;
abs.f32 %f1333, %f5334;
setp.ltu.f32 %p1015, %f1333, 0f47CE4780;
@%p1015 bra $L__BB0_1198;
setp.eq.f32 %p1016, %f1333, 0f7F800000;
@%p1016 bra $L__BB0_1197;
bra.uni $L__BB0_1192;
$L__BB0_1197:
mov.f32 %f3930, 0f00000000;
mul.rn.f32 %f5642, %f5334, %f3930;
mov.u32 %r8537, 0;
bra.uni $L__BB0_1198;
$L__BB0_1192:
mov.b32 %r1557, %f5334;
shr.u32 %r5843, %r1557, 23;
and.b32 %r5844, %r5843, 255;
add.s32 %r1558, %r5844, -128;
shl.b32 %r5845, %r1557, 8;
or.b32 %r1559, %r5845, -2147483648;
shr.u32 %r1560, %r1558, 5;
mov.u64 %rd2652, 0;
mov.u32 %r8534, 0;
mov.u64 %rd1843, __cudart_i2opi_f;
mov.u64 %rd2653, %rd2652;
$L__BB0_1193:
.pragma "nounroll";
shl.b64 %rd1842, %rd2652, 2;
add.s64 %rd1844, %rd1843, %rd1842;
ld.global.nc.u32 %r5846, [%rd1844];
mad.wide.u32 %rd1845, %r5846, %r1559, %rd2653;
shr.u64 %rd2653, %rd1845, 32;
add.s64 %rd1846, %rd1, %rd1842;
st.local.u32 [%rd1846], %rd1845;
add.s32 %r8534, %r8534, 1;
cvt.s64.s32 %rd2652, %r8534;
setp.ne.s32 %p1017, %r8534, 6;
@%p1017 bra $L__BB0_1193;
st.local.u32 [%rd5], %rd2653;
mov.u32 %r5847, 4;
sub.s32 %r1563, %r5847, %r1560;
mov.u32 %r5848, 6;
sub.s32 %r5849, %r5848, %r1560;
mul.wide.s32 %rd1847, %r5849, 4;
add.s64 %rd1848, %rd1, %rd1847;
ld.local.u32 %r8535, [%rd1848];
ld.local.u32 %r8536, [%rd1848+-4];
and.b32 %r1566, %r1558, 31;
setp.eq.s32 %p1018, %r1566, 0;
@%p1018 bra $L__BB0_1196;
mov.u32 %r5850, 32;
sub.s32 %r5851, %r5850, %r1566;
shr.u32 %r5852, %r8536, %r5851;
shl.b32 %r5853, %r8535, %r1566;
add.s32 %r8535, %r5852, %r5853;
mul.wide.s32 %rd1849, %r1563, 4;
add.s64 %rd1850, %rd1, %rd1849;
ld.local.u32 %r5854, [%rd1850];
shr.u32 %r5855, %r5854, %r5851;
shl.b32 %r5856, %r8536, %r1566;
add.s32 %r8536, %r5855, %r5856;
$L__BB0_1196:
and.b32 %r5857, %r1557, -2147483648;
shr.u32 %r5858, %r8536, 30;
shl.b32 %r5859, %r8535, 2;
or.b32 %r5860, %r5858, %r5859;
shr.u32 %r5861, %r5860, 31;
shr.u32 %r5862, %r8535, 30;
add.s32 %r5863, %r5861, %r5862;
neg.s32 %r5864, %r5863;
setp.eq.s32 %p1019, %r5857, 0;
selp.b32 %r8537, %r5863, %r5864, %p1019;
setp.ne.s32 %p1020, %r5861, 0;
xor.b32 %r5865, %r5857, -2147483648;
selp.b32 %r5866, %r5865, %r5857, %p1020;
selp.b32 %r5867, -1, 0, %p1020;
xor.b32 %r5868, %r5860, %r5867;
shl.b32 %r5869, %r8536, 2;
xor.b32 %r5870, %r5869, %r5867;
cvt.u64.u32 %rd1851, %r5868;
cvt.u64.u32 %rd1852, %r5870;
bfi.b64 %rd1853, %rd1851, %rd1852, 32, 32;
cvt.rn.f64.s64 %fd155, %rd1853;
mul.f64 %fd156, %fd155, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3928, %fd156;
setp.eq.s32 %p1021, %r5866, 0;
neg.f32 %f3929, %f3928;
selp.f32 %f5642, %f3928, %f3929, %p1021;
$L__BB0_1198:
add.s32 %r1573, %r8537, 1;
and.b32 %r1574, %r1573, 1;
setp.eq.s32 %p1022, %r1574, 0;
selp.f32 %f1337, %f5642, 0f3F800000, %p1022;
mul.rn.f32 %f1338, %f5642, %f5642;
mov.f32 %f5643, 0fB94D4153;
@%p1022 bra $L__BB0_1200;
mov.f32 %f3932, 0fBAB607ED;
mov.f32 %f3933, 0f37CBAC00;
fma.rn.f32 %f5643, %f3933, %f1338, %f3932;
$L__BB0_1200:
selp.f32 %f3934, 0f3C0885E4, 0f3D2AAABB, %p1022;
fma.rn.f32 %f3935, %f5643, %f1338, %f3934;
selp.f32 %f3936, 0fBE2AAAA8, 0fBEFFFFFF, %p1022;
fma.rn.f32 %f3937, %f3935, %f1338, %f3936;
mov.f32 %f3938, 0f00000000;
fma.rn.f32 %f3939, %f1338, %f1337, %f3938;
fma.rn.f32 %f5215, %f3937, %f3939, %f1337;
and.b32 %r5872, %r1573, 2;
setp.eq.s32 %p1024, %r5872, 0;
@%p1024 bra $L__BB0_1202;
mov.f32 %f3941, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3941, %f3938;
$L__BB0_1202:
selp.f32 %f1345, %f5215, %f5216, %p22;
selp.f32 %f1346, %f5213, %f5214, %p22;
@%p1003 bra $L__BB0_1204;
add.f32 %f5708, %f1346, %f1345;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1204:
@%p832 bra $L__BB0_1426;
shl.b32 %r5874, %r12, 5;
mov.u32 %r5875, -32;
sub.s32 %r1575, %r5875, %r5874;
setp.ge.s32 %p1028, %r14, %r1575;
@%p1028 bra $L__BB0_1218;
mul.f32 %f3944, %f5341, 0f3F22F983;
cvt.rni.s32.f32 %r8541, %f3944;
cvt.rn.f32.s32 %f3945, %r8541;
mov.f32 %f3946, 0fBFC90FDA;
fma.rn.f32 %f3947, %f3945, %f3946, %f5341;
mov.f32 %f3948, 0fB3A22168;
fma.rn.f32 %f3949, %f3945, %f3948, %f3947;
mov.f32 %f3950, 0fA7C234C5;
fma.rn.f32 %f5651, %f3945, %f3950, %f3949;
abs.f32 %f1354, %f5341;
setp.ltu.f32 %p1029, %f1354, 0f47CE4780;
@%p1029 bra $L__BB0_1214;
setp.eq.f32 %p1030, %f1354, 0f7F800000;
@%p1030 bra $L__BB0_1213;
bra.uni $L__BB0_1208;
$L__BB0_1213:
mov.f32 %f3953, 0f00000000;
mul.rn.f32 %f5651, %f5341, %f3953;
mov.u32 %r8541, 0;
bra.uni $L__BB0_1214;
$L__BB0_1208:
mov.b32 %r1577, %f5341;
shr.u32 %r5877, %r1577, 23;
and.b32 %r5878, %r5877, 255;
add.s32 %r1578, %r5878, -128;
shl.b32 %r5879, %r1577, 8;
or.b32 %r1579, %r5879, -2147483648;
shr.u32 %r1580, %r1578, 5;
mov.u64 %rd2656, 0;
mov.u32 %r8538, 0;
mov.u64 %rd2654, __cudart_i2opi_f;
mov.u64 %rd2655, %rd1;
$L__BB0_1209:
.pragma "nounroll";
ld.global.nc.u32 %r5880, [%rd2654];
mad.wide.u32 %rd1856, %r5880, %r1579, %rd2656;
shr.u64 %rd2656, %rd1856, 32;
st.local.u32 [%rd2655], %rd1856;
add.s64 %rd2655, %rd2655, 4;
add.s64 %rd2654, %rd2654, 4;
add.s32 %r8538, %r8538, 1;
setp.ne.s32 %p1031, %r8538, 6;
@%p1031 bra $L__BB0_1209;
st.local.u32 [%rd5], %rd2656;
mov.u32 %r5881, 4;
sub.s32 %r1583, %r5881, %r1580;
mov.u32 %r5882, 6;
sub.s32 %r5883, %r5882, %r1580;
mul.wide.s32 %rd1857, %r5883, 4;
add.s64 %rd1858, %rd1, %rd1857;
ld.local.u32 %r8539, [%rd1858];
ld.local.u32 %r8540, [%rd1858+-4];
and.b32 %r1586, %r1578, 31;
setp.eq.s32 %p1032, %r1586, 0;
@%p1032 bra $L__BB0_1212;
mov.u32 %r5884, 32;
sub.s32 %r5885, %r5884, %r1586;
shr.u32 %r5886, %r8540, %r5885;
shl.b32 %r5887, %r8539, %r1586;
add.s32 %r8539, %r5886, %r5887;
mul.wide.s32 %rd1859, %r1583, 4;
add.s64 %rd1860, %rd1, %rd1859;
ld.local.u32 %r5888, [%rd1860];
shr.u32 %r5889, %r5888, %r5885;
shl.b32 %r5890, %r8540, %r1586;
add.s32 %r8540, %r5889, %r5890;
$L__BB0_1212:
and.b32 %r5891, %r1577, -2147483648;
shr.u32 %r5892, %r8540, 30;
shl.b32 %r5893, %r8539, 2;
or.b32 %r5894, %r5892, %r5893;
shr.u32 %r5895, %r5894, 31;
shr.u32 %r5896, %r8539, 30;
add.s32 %r5897, %r5895, %r5896;
neg.s32 %r5898, %r5897;
setp.eq.s32 %p1033, %r5891, 0;
selp.b32 %r8541, %r5897, %r5898, %p1033;
setp.ne.s32 %p1034, %r5895, 0;
xor.b32 %r5899, %r5891, -2147483648;
selp.b32 %r5900, %r5899, %r5891, %p1034;
selp.b32 %r5901, -1, 0, %p1034;
xor.b32 %r5902, %r5894, %r5901;
shl.b32 %r5903, %r8540, 2;
xor.b32 %r5904, %r5903, %r5901;
cvt.u64.u32 %rd1861, %r5902;
cvt.u64.u32 %rd1862, %r5904;
bfi.b64 %rd1863, %rd1861, %rd1862, 32, 32;
cvt.rn.f64.s64 %fd157, %rd1863;
mul.f64 %fd158, %fd157, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3951, %fd158;
setp.eq.s32 %p1035, %r5900, 0;
neg.f32 %f3952, %f3951;
selp.f32 %f5651, %f3951, %f3952, %p1035;
$L__BB0_1214:
and.b32 %r1593, %r8541, 1;
setp.eq.s32 %p1036, %r1593, 0;
selp.f32 %f1358, %f5651, 0f3F800000, %p1036;
mul.rn.f32 %f1359, %f5651, %f5651;
mov.f32 %f5652, 0fB94D4153;
@%p1036 bra $L__BB0_1216;
mov.f32 %f3955, 0fBAB607ED;
mov.f32 %f3956, 0f37CBAC00;
fma.rn.f32 %f5652, %f3956, %f1359, %f3955;
$L__BB0_1216:
selp.f32 %f3957, 0f3C0885E4, 0f3D2AAABB, %p1036;
fma.rn.f32 %f3958, %f5652, %f1359, %f3957;
selp.f32 %f3959, 0fBE2AAAA8, 0fBEFFFFFF, %p1036;
fma.rn.f32 %f3960, %f3958, %f1359, %f3959;
mov.f32 %f3961, 0f00000000;
fma.rn.f32 %f3962, %f1359, %f1358, %f3961;
fma.rn.f32 %f5213, %f3960, %f3962, %f1358;
and.b32 %r5906, %r8541, 2;
setp.eq.s32 %p1038, %r5906, 0;
@%p1038 bra $L__BB0_1218;
mov.f32 %f3964, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f3964, %f3961;
$L__BB0_1218:
setp.lt.s32 %p23, %r14, %r1575;
@%p1028 bra $L__BB0_1231;
mul.f32 %f3965, %f5333, 0f3F22F983;
cvt.rni.s32.f32 %r8545, %f3965;
cvt.rn.f32.s32 %f3966, %r8545;
mov.f32 %f3967, 0fBFC90FDA;
fma.rn.f32 %f3968, %f3966, %f3967, %f5333;
mov.f32 %f3969, 0fB3A22168;
fma.rn.f32 %f3970, %f3966, %f3969, %f3968;
mov.f32 %f3971, 0fA7C234C5;
fma.rn.f32 %f5655, %f3966, %f3971, %f3970;
abs.f32 %f1367, %f5333;
setp.ltu.f32 %p1040, %f1367, 0f47CE4780;
@%p1040 bra $L__BB0_1227;
setp.eq.f32 %p1041, %f1367, 0f7F800000;
@%p1041 bra $L__BB0_1226;
bra.uni $L__BB0_1221;
$L__BB0_1226:
mov.f32 %f3974, 0f00000000;
mul.rn.f32 %f5655, %f5333, %f3974;
mov.u32 %r8545, 0;
bra.uni $L__BB0_1227;
$L__BB0_1221:
mov.b32 %r1595, %f5333;
shr.u32 %r5908, %r1595, 23;
and.b32 %r5909, %r5908, 255;
add.s32 %r1596, %r5909, -128;
shl.b32 %r5910, %r1595, 8;
or.b32 %r1597, %r5910, -2147483648;
shr.u32 %r1598, %r1596, 5;
mov.u64 %rd2659, 0;
mov.u32 %r8542, 0;
mov.u64 %rd2657, __cudart_i2opi_f;
mov.u64 %rd2658, %rd1;
$L__BB0_1222:
.pragma "nounroll";
ld.global.nc.u32 %r5911, [%rd2657];
mad.wide.u32 %rd1866, %r5911, %r1597, %rd2659;
shr.u64 %rd2659, %rd1866, 32;
st.local.u32 [%rd2658], %rd1866;
add.s64 %rd2658, %rd2658, 4;
add.s64 %rd2657, %rd2657, 4;
add.s32 %r8542, %r8542, 1;
setp.ne.s32 %p1042, %r8542, 6;
@%p1042 bra $L__BB0_1222;
st.local.u32 [%rd5], %rd2659;
mov.u32 %r5912, 4;
sub.s32 %r1601, %r5912, %r1598;
mov.u32 %r5913, 6;
sub.s32 %r5914, %r5913, %r1598;
mul.wide.s32 %rd1867, %r5914, 4;
add.s64 %rd1868, %rd1, %rd1867;
ld.local.u32 %r8543, [%rd1868];
ld.local.u32 %r8544, [%rd1868+-4];
and.b32 %r1604, %r1596, 31;
setp.eq.s32 %p1043, %r1604, 0;
@%p1043 bra $L__BB0_1225;
mov.u32 %r5915, 32;
sub.s32 %r5916, %r5915, %r1604;
shr.u32 %r5917, %r8544, %r5916;
shl.b32 %r5918, %r8543, %r1604;
add.s32 %r8543, %r5917, %r5918;
mul.wide.s32 %rd1869, %r1601, 4;
add.s64 %rd1870, %rd1, %rd1869;
ld.local.u32 %r5919, [%rd1870];
shr.u32 %r5920, %r5919, %r5916;
shl.b32 %r5921, %r8544, %r1604;
add.s32 %r8544, %r5920, %r5921;
$L__BB0_1225:
and.b32 %r5922, %r1595, -2147483648;
shr.u32 %r5923, %r8544, 30;
shl.b32 %r5924, %r8543, 2;
or.b32 %r5925, %r5923, %r5924;
shr.u32 %r5926, %r5925, 31;
shr.u32 %r5927, %r8543, 30;
add.s32 %r5928, %r5926, %r5927;
neg.s32 %r5929, %r5928;
setp.eq.s32 %p1044, %r5922, 0;
selp.b32 %r8545, %r5928, %r5929, %p1044;
setp.ne.s32 %p1045, %r5926, 0;
xor.b32 %r5930, %r5922, -2147483648;
selp.b32 %r5931, %r5930, %r5922, %p1045;
selp.b32 %r5932, -1, 0, %p1045;
xor.b32 %r5933, %r5925, %r5932;
shl.b32 %r5934, %r8544, 2;
xor.b32 %r5935, %r5934, %r5932;
cvt.u64.u32 %rd1871, %r5933;
cvt.u64.u32 %rd1872, %r5935;
bfi.b64 %rd1873, %rd1871, %rd1872, 32, 32;
cvt.rn.f64.s64 %fd159, %rd1873;
mul.f64 %fd160, %fd159, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3972, %fd160;
setp.eq.s32 %p1046, %r5931, 0;
neg.f32 %f3973, %f3972;
selp.f32 %f5655, %f3972, %f3973, %p1046;
$L__BB0_1227:
add.s32 %r1611, %r8545, 1;
and.b32 %r1612, %r1611, 1;
setp.eq.s32 %p1047, %r1612, 0;
selp.f32 %f1371, %f5655, 0f3F800000, %p1047;
mul.rn.f32 %f1372, %f5655, %f5655;
mov.f32 %f5656, 0fB94D4153;
@%p1047 bra $L__BB0_1229;
mov.f32 %f3976, 0fBAB607ED;
mov.f32 %f3977, 0f37CBAC00;
fma.rn.f32 %f5656, %f3977, %f1372, %f3976;
$L__BB0_1229:
selp.f32 %f3978, 0f3C0885E4, 0f3D2AAABB, %p1047;
fma.rn.f32 %f3979, %f5656, %f1372, %f3978;
selp.f32 %f3980, 0fBE2AAAA8, 0fBEFFFFFF, %p1047;
fma.rn.f32 %f3981, %f3979, %f1372, %f3980;
mov.f32 %f3982, 0f00000000;
fma.rn.f32 %f3983, %f1372, %f1371, %f3982;
fma.rn.f32 %f5215, %f3981, %f3983, %f1371;
and.b32 %r5937, %r1611, 2;
setp.eq.s32 %p1049, %r5937, 0;
@%p1049 bra $L__BB0_1231;
mov.f32 %f3985, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f3985, %f3982;
$L__BB0_1231:
selp.f32 %f1379, %f5215, %f5216, %p23;
selp.f32 %f1380, %f5213, %f5214, %p23;
@%p1028 bra $L__BB0_1426;
add.f32 %f5707, %f1380, %f1379;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1426:
@%p32 bra $L__BB0_1428;
shl.b32 %r6509, %r12, 2;
mov.u32 %r6510, -12;
sub.s32 %r6511, %r6510, %r6509;
add.s32 %r6512, %r13, -12;
setp.lt.s32 %p1213, %r6512, %r6511;
@%p1213 bra $L__BB0_1708;
bra.uni $L__BB0_1428;
$L__BB0_1708:
mov.u32 %r7240, %ctaid.x;
mul.lo.s32 %r7241, %r2614, %r7240;
shl.b32 %r7242, %r12, 5;
add.s32 %r7243, %r7242, %r1;
mul.hi.s32 %r7244, %r7243, -1840700269;
add.s32 %r7245, %r7244, %r7243;
shr.u32 %r7246, %r7245, 31;
shr.s32 %r7247, %r7245, 2;
add.s32 %r7248, %r7247, %r7246;
mul.lo.s32 %r7249, %r7248, %r2615;
mul.lo.s32 %r7250, %r7248, 7;
sub.s32 %r7251, %r7243, %r7250;
mul.lo.s32 %r7252, %r7251, %r2616;
add.s32 %r7253, %r13, 2;
shl.b32 %r7254, %r2612, 1;
add.s32 %r7255, %r7254, %r7241;
mad.lo.s32 %r7256, %r7253, %r2613, %r7255;
add.s32 %r7257, %r7256, %r7249;
add.s32 %r7258, %r7257, %r7252;
mul.wide.s32 %rd2287, %r7258, 4;
add.s64 %rd2288, %rd3, %rd2287;
ld.global.f32 %f1925, [%rd2288];
add.s32 %r7259, %r7243, 32;
mul.hi.s32 %r7260, %r7259, -1840700269;
add.s32 %r7261, %r7260, %r7259;
shr.u32 %r7262, %r7261, 31;
shr.s32 %r7263, %r7261, 2;
add.s32 %r7264, %r7263, %r7262;
mul.lo.s32 %r7265, %r7264, %r2615;
mul.lo.s32 %r7266, %r7264, 7;
sub.s32 %r7267, %r7259, %r7266;
mul.lo.s32 %r7268, %r7267, %r2616;
add.s32 %r7269, %r7256, %r7265;
add.s32 %r7270, %r7269, %r7268;
mul.wide.s32 %rd2289, %r7270, 4;
add.s64 %rd2290, %rd3, %rd2289;
ld.global.f32 %f1926, [%rd2290];
mul.wide.s32 %rd2291, %r2613, 4;
add.s64 %rd2292, %rd2288, %rd2291;
ld.global.f32 %f1927, [%rd2292];
add.s64 %rd2293, %rd2290, %rd2291;
ld.global.f32 %f1928, [%rd2293];
add.s64 %rd2294, %rd2292, %rd2291;
ld.global.f32 %f1929, [%rd2294];
add.s64 %rd2295, %rd2293, %rd2291;
ld.global.f32 %f1930, [%rd2295];
mad.lo.s32 %r7271, %r2612, 3, %r7241;
mad.lo.s32 %r7272, %r13, %r2613, %r7271;
add.s32 %r7273, %r7272, %r7249;
add.s32 %r7274, %r7273, %r7252;
mul.wide.s32 %rd2296, %r7274, 4;
add.s64 %rd2297, %rd3, %rd2296;
ld.global.f32 %f1931, [%rd2297];
add.s32 %r7275, %r7272, %r7265;
add.s32 %r7276, %r7275, %r7268;
mul.wide.s32 %rd2298, %r7276, 4;
add.s64 %rd2299, %rd3, %rd2298;
ld.global.f32 %f1932, [%rd2299];
mul.lo.s32 %r7277, %r2604, %r7240;
mul.hi.s32 %r7278, %r7243, 954437177;
shr.u32 %r7279, %r7278, 31;
shr.s32 %r7280, %r7278, 1;
add.s32 %r7281, %r7280, %r7279;
mul.lo.s32 %r7282, %r7281, %r2605;
mul.lo.s32 %r7283, %r7281, 9;
sub.s32 %r7284, %r7243, %r7283;
mul.lo.s32 %r7285, %r7284, %r2606;
shl.b32 %r7286, %r2602, 2;
add.s32 %r7287, %r7286, %r7277;
mul.lo.s32 %r7288, %r13, %r2603;
add.s32 %r7289, %r7287, %r7288;
add.s32 %r7290, %r7289, %r7282;
add.s32 %r7291, %r7290, %r7285;
mul.wide.s32 %rd2300, %r7291, 4;
add.s64 %rd2301, %rd2, %rd2300;
ld.global.f32 %f1933, [%rd2301];
mul.hi.s32 %r7292, %r7259, 954437177;
shr.u32 %r7293, %r7292, 31;
shr.s32 %r7294, %r7292, 1;
add.s32 %r7295, %r7294, %r7293;
mul.lo.s32 %r7296, %r7295, %r2605;
mul.lo.s32 %r7297, %r7295, 9;
sub.s32 %r7298, %r7259, %r7297;
mul.lo.s32 %r7299, %r7298, %r2606;
add.s32 %r7300, %r7289, %r7296;
add.s32 %r7301, %r7300, %r7299;
mul.wide.s32 %rd2302, %r7301, 4;
add.s64 %rd2303, %rd2, %rd2302;
ld.global.f32 %f1934, [%rd2303];
mul.wide.s32 %rd2304, %r2603, 4;
add.s64 %rd2305, %rd2301, %rd2304;
ld.global.f32 %f1935, [%rd2305];
add.s64 %rd2306, %rd2303, %rd2304;
ld.global.f32 %f1936, [%rd2306];
add.s64 %rd2307, %rd2305, %rd2304;
ld.global.f32 %f1937, [%rd2307];
add.s64 %rd2308, %rd2306, %rd2304;
ld.global.f32 %f1938, [%rd2308];
mad.lo.s32 %r7302, %r2602, 5, %r7277;
add.s32 %r7303, %r7302, %r7288;
add.s32 %r7304, %r7303, %r7282;
add.s32 %r7305, %r7304, %r7285;
mul.wide.s32 %rd2309, %r7305, 4;
add.s64 %rd2310, %rd2, %rd2309;
ld.global.f32 %f1939, [%rd2310];
add.s32 %r7306, %r7303, %r7296;
add.s32 %r7307, %r7306, %r7299;
mul.wide.s32 %rd2311, %r7307, 4;
add.s64 %rd2312, %rd2, %rd2311;
ld.global.f32 %f1940, [%rd2312];
mul.f32 %f4675, %f1933, 0f3F22F983;
cvt.rni.s32.f32 %r8677, %f4675;
cvt.rn.f32.s32 %f4676, %r8677;
mov.f32 %f4677, 0fBFC90FDA;
fma.rn.f32 %f4678, %f4676, %f4677, %f1933;
mov.f32 %f4679, 0fB3A22168;
fma.rn.f32 %f4680, %f4676, %f4679, %f4678;
mov.f32 %f4681, 0fA7C234C5;
fma.rn.f32 %f5850, %f4676, %f4681, %f4680;
abs.f32 %f1942, %f1933;
setp.ltu.f32 %p1447, %f1942, 0f47CE4780;
@%p1447 bra $L__BB0_1716;
setp.eq.f32 %p1448, %f1942, 0f7F800000;
@%p1448 bra $L__BB0_1715;
bra.uni $L__BB0_1710;
$L__BB0_1715:
mov.f32 %f4684, 0f00000000;
mul.rn.f32 %f5850, %f1933, %f4684;
mov.u32 %r8677, 0;
bra.uni $L__BB0_1716;
$L__BB0_1428:
mov.u32 %r1909, %ctaid.x;
mul.lo.s32 %r1910, %r2614, %r1909;
add.s32 %r6513, %r13, -15;
mov.u32 %r6514, -12;
sub.s32 %r1911, %r6514, %r12;
setp.ge.s32 %p1214, %r6513, %r1911;
add.s32 %r6515, %r13, 2;
shl.b32 %r6516, %r2612, 1;
add.s32 %r6517, %r6516, %r1910;
mad.lo.s32 %r1912, %r6515, %r2613, %r6517;
@%p1214 bra $L__BB0_1431;
shl.b32 %r1913, %r12, 5;
neg.s32 %r6518, %r1913;
setp.ge.s32 %p1215, %r14, %r6518;
@%p1215 bra $L__BB0_1431;
add.s32 %r6519, %r1913, %r1;
mul.hi.s32 %r6520, %r6519, -1840700269;
add.s32 %r6521, %r6520, %r6519;
shr.u32 %r6522, %r6521, 31;
shr.s32 %r6523, %r6521, 2;
add.s32 %r6524, %r6523, %r6522;
mul.lo.s32 %r6525, %r6524, 7;
sub.s32 %r6526, %r6519, %r6525;
mad.lo.s32 %r6527, %r6524, %r2615, %r1912;
mad.lo.s32 %r6528, %r6526, %r2616, %r6527;
mul.wide.s32 %rd2095, %r6528, 4;
add.s64 %rd2096, %rd3, %rd2095;
ld.global.f32 %f5531, [%rd2096];
$L__BB0_1431:
@%p1214 bra $L__BB0_1434;
shl.b32 %r1914, %r12, 5;
mov.u32 %r6530, -32;
sub.s32 %r6531, %r6530, %r1914;
setp.ge.s32 %p1217, %r14, %r6531;
@%p1217 bra $L__BB0_1434;
add.s32 %r6532, %r1914, %r1;
add.s32 %r6533, %r6532, 32;
mul.hi.s32 %r6534, %r6533, -1840700269;
add.s32 %r6535, %r6534, %r6533;
shr.u32 %r6536, %r6535, 31;
shr.s32 %r6537, %r6535, 2;
add.s32 %r6538, %r6537, %r6536;
mul.lo.s32 %r6539, %r6538, 7;
sub.s32 %r6540, %r6533, %r6539;
mad.lo.s32 %r6541, %r6538, %r2615, %r1912;
mad.lo.s32 %r6542, %r6540, %r2616, %r6541;
mul.wide.s32 %rd2097, %r6542, 4;
add.s64 %rd2098, %rd3, %rd2097;
ld.global.f32 %f5339, [%rd2098];
$L__BB0_1434:
mov.u32 %r6544, -13;
sub.s32 %r1915, %r6544, %r12;
setp.ge.s32 %p1218, %r6513, %r1915;
add.s32 %r1916, %r1912, %r2613;
@%p1218 bra $L__BB0_1437;
shl.b32 %r1917, %r12, 5;
neg.s32 %r6545, %r1917;
setp.ge.s32 %p1219, %r14, %r6545;
@%p1219 bra $L__BB0_1437;
add.s32 %r6546, %r1917, %r1;
mul.hi.s32 %r6547, %r6546, -1840700269;
add.s32 %r6548, %r6547, %r6546;
shr.u32 %r6549, %r6548, 31;
shr.s32 %r6550, %r6548, 2;
add.s32 %r6551, %r6550, %r6549;
mul.lo.s32 %r6552, %r6551, 7;
sub.s32 %r6553, %r6546, %r6552;
mad.lo.s32 %r6554, %r6551, %r2615, %r1916;
mad.lo.s32 %r6555, %r6553, %r2616, %r6554;
mul.wide.s32 %rd2099, %r6555, 4;
add.s64 %rd2100, %rd3, %rd2099;
ld.global.f32 %f5338, [%rd2100];
$L__BB0_1437:
@%p1218 bra $L__BB0_1440;
shl.b32 %r1918, %r12, 5;
mov.u32 %r6557, -32;
sub.s32 %r6558, %r6557, %r1918;
setp.ge.s32 %p1221, %r14, %r6558;
@%p1221 bra $L__BB0_1440;
add.s32 %r6559, %r1918, %r1;
add.s32 %r6560, %r6559, 32;
mul.hi.s32 %r6561, %r6560, -1840700269;
add.s32 %r6562, %r6561, %r6560;
shr.u32 %r6563, %r6562, 31;
shr.s32 %r6564, %r6562, 2;
add.s32 %r6565, %r6564, %r6563;
mul.lo.s32 %r6566, %r6565, 7;
sub.s32 %r6567, %r6560, %r6566;
mad.lo.s32 %r6568, %r6565, %r2615, %r1916;
mad.lo.s32 %r6569, %r6567, %r2616, %r6568;
mul.wide.s32 %rd2101, %r6569, 4;
add.s64 %rd2102, %rd3, %rd2101;
ld.global.f32 %f5337, [%rd2102];
$L__BB0_1440:
mov.u32 %r6571, -14;
sub.s32 %r1919, %r6571, %r12;
setp.ge.s32 %p1222, %r6513, %r1919;
add.s32 %r1920, %r1916, %r2613;
@%p1222 bra $L__BB0_1443;
shl.b32 %r1921, %r12, 5;
neg.s32 %r6572, %r1921;
setp.ge.s32 %p1223, %r14, %r6572;
@%p1223 bra $L__BB0_1443;
add.s32 %r6573, %r1921, %r1;
mul.hi.s32 %r6574, %r6573, -1840700269;
add.s32 %r6575, %r6574, %r6573;
shr.u32 %r6576, %r6575, 31;
shr.s32 %r6577, %r6575, 2;
add.s32 %r6578, %r6577, %r6576;
mul.lo.s32 %r6579, %r6578, 7;
sub.s32 %r6580, %r6573, %r6579;
mad.lo.s32 %r6581, %r6578, %r2615, %r1920;
mad.lo.s32 %r6582, %r6580, %r2616, %r6581;
mul.wide.s32 %rd2103, %r6582, 4;
add.s64 %rd2104, %rd3, %rd2103;
ld.global.f32 %f5336, [%rd2104];
$L__BB0_1443:
@%p1222 bra $L__BB0_1446;
shl.b32 %r1922, %r12, 5;
mov.u32 %r6584, -32;
sub.s32 %r6585, %r6584, %r1922;
setp.ge.s32 %p1225, %r14, %r6585;
@%p1225 bra $L__BB0_1446;
add.s32 %r6586, %r1922, %r1;
add.s32 %r6587, %r6586, 32;
mul.hi.s32 %r6588, %r6587, -1840700269;
add.s32 %r6589, %r6588, %r6587;
shr.u32 %r6590, %r6589, 31;
shr.s32 %r6591, %r6589, 2;
add.s32 %r6592, %r6591, %r6590;
mul.lo.s32 %r6593, %r6592, 7;
sub.s32 %r6594, %r6587, %r6593;
mad.lo.s32 %r6595, %r6592, %r2615, %r1920;
mad.lo.s32 %r6596, %r6594, %r2616, %r6595;
mul.wide.s32 %rd2105, %r6596, 4;
add.s64 %rd2106, %rd3, %rd2105;
ld.global.f32 %f5335, [%rd2106];
$L__BB0_1446:
mov.u32 %r6598, -15;
sub.s32 %r1923, %r6598, %r12;
setp.ge.s32 %p1226, %r6513, %r1923;
mad.lo.s32 %r6599, %r2612, 3, %r1910;
mad.lo.s32 %r1924, %r13, %r2613, %r6599;
@%p1226 bra $L__BB0_1449;
shl.b32 %r1925, %r12, 5;
neg.s32 %r6600, %r1925;
setp.ge.s32 %p1227, %r14, %r6600;
@%p1227 bra $L__BB0_1449;
add.s32 %r6601, %r1925, %r1;
mul.hi.s32 %r6602, %r6601, -1840700269;
add.s32 %r6603, %r6602, %r6601;
shr.u32 %r6604, %r6603, 31;
shr.s32 %r6605, %r6603, 2;
add.s32 %r6606, %r6605, %r6604;
mul.lo.s32 %r6607, %r6606, 7;
sub.s32 %r6608, %r6601, %r6607;
mad.lo.s32 %r6609, %r6606, %r2615, %r1924;
mad.lo.s32 %r6610, %r6608, %r2616, %r6609;
mul.wide.s32 %rd2107, %r6610, 4;
add.s64 %rd2108, %rd3, %rd2107;
ld.global.f32 %f5334, [%rd2108];
$L__BB0_1449:
@%p1226 bra $L__BB0_1452;
shl.b32 %r1926, %r12, 5;
mov.u32 %r6612, -32;
sub.s32 %r6613, %r6612, %r1926;
setp.ge.s32 %p1229, %r14, %r6613;
@%p1229 bra $L__BB0_1452;
add.s32 %r6614, %r1926, %r1;
add.s32 %r6615, %r6614, 32;
mul.hi.s32 %r6616, %r6615, -1840700269;
add.s32 %r6617, %r6616, %r6615;
shr.u32 %r6618, %r6617, 31;
shr.s32 %r6619, %r6617, 2;
add.s32 %r6620, %r6619, %r6618;
mul.lo.s32 %r6621, %r6620, 7;
sub.s32 %r6622, %r6615, %r6621;
mad.lo.s32 %r6623, %r6620, %r2615, %r1924;
mad.lo.s32 %r6624, %r6622, %r2616, %r6623;
mul.wide.s32 %rd2109, %r6624, 4;
add.s64 %rd2110, %rd3, %rd2109;
ld.global.f32 %f5333, [%rd2110];
$L__BB0_1452:
shl.b32 %r6626, %r2602, 2;
mul.lo.s32 %r1927, %r2604, %r1909;
add.s32 %r6627, %r6626, %r1927;
mul.lo.s32 %r1928, %r13, %r2603;
add.s32 %r1929, %r6627, %r1928;
@%p1214 bra $L__BB0_1455;
shl.b32 %r1930, %r12, 5;
neg.s32 %r6628, %r1930;
setp.ge.s32 %p1231, %r14, %r6628;
@%p1231 bra $L__BB0_1455;
add.s32 %r6629, %r1930, %r1;
mul.hi.s32 %r6630, %r6629, 954437177;
shr.u32 %r6631, %r6630, 31;
shr.s32 %r6632, %r6630, 1;
add.s32 %r6633, %r6632, %r6631;
mul.lo.s32 %r6634, %r6633, 9;
sub.s32 %r6635, %r6629, %r6634;
mad.lo.s32 %r6636, %r6633, %r2605, %r1929;
mad.lo.s32 %r6637, %r6635, %r2606, %r6636;
mul.wide.s32 %rd2111, %r6637, 4;
add.s64 %rd2112, %rd2, %rd2111;
ld.global.f32 %f5348, [%rd2112];
$L__BB0_1455:
@%p1214 bra $L__BB0_1458;
shl.b32 %r1931, %r12, 5;
mov.u32 %r6639, -32;
sub.s32 %r6640, %r6639, %r1931;
setp.ge.s32 %p1233, %r14, %r6640;
@%p1233 bra $L__BB0_1458;
add.s32 %r6641, %r1931, %r1;
add.s32 %r6642, %r6641, 32;
mul.hi.s32 %r6643, %r6642, 954437177;
shr.u32 %r6644, %r6643, 31;
shr.s32 %r6645, %r6643, 1;
add.s32 %r6646, %r6645, %r6644;
mul.lo.s32 %r6647, %r6646, 9;
sub.s32 %r6648, %r6642, %r6647;
mad.lo.s32 %r6649, %r6646, %r2605, %r1929;
mad.lo.s32 %r6650, %r6648, %r2606, %r6649;
mul.wide.s32 %rd2113, %r6650, 4;
add.s64 %rd2114, %rd2, %rd2113;
ld.global.f32 %f5347, [%rd2114];
$L__BB0_1458:
add.s32 %r1932, %r1929, %r2603;
@%p1218 bra $L__BB0_1461;
shl.b32 %r1933, %r12, 5;
neg.s32 %r6652, %r1933;
setp.ge.s32 %p1235, %r14, %r6652;
@%p1235 bra $L__BB0_1461;
add.s32 %r6653, %r1933, %r1;
mul.hi.s32 %r6654, %r6653, 954437177;
shr.u32 %r6655, %r6654, 31;
shr.s32 %r6656, %r6654, 1;
add.s32 %r6657, %r6656, %r6655;
mul.lo.s32 %r6658, %r6657, 9;
sub.s32 %r6659, %r6653, %r6658;
mad.lo.s32 %r6660, %r6657, %r2605, %r1932;
mad.lo.s32 %r6661, %r6659, %r2606, %r6660;
mul.wide.s32 %rd2115, %r6661, 4;
add.s64 %rd2116, %rd2, %rd2115;
ld.global.f32 %f5346, [%rd2116];
$L__BB0_1461:
@%p1218 bra $L__BB0_1464;
shl.b32 %r1934, %r12, 5;
mov.u32 %r6663, -32;
sub.s32 %r6664, %r6663, %r1934;
setp.ge.s32 %p1237, %r14, %r6664;
@%p1237 bra $L__BB0_1464;
add.s32 %r6665, %r1934, %r1;
add.s32 %r6666, %r6665, 32;
mul.hi.s32 %r6667, %r6666, 954437177;
shr.u32 %r6668, %r6667, 31;
shr.s32 %r6669, %r6667, 1;
add.s32 %r6670, %r6669, %r6668;
mul.lo.s32 %r6671, %r6670, 9;
sub.s32 %r6672, %r6666, %r6671;
mad.lo.s32 %r6673, %r6670, %r2605, %r1932;
mad.lo.s32 %r6674, %r6672, %r2606, %r6673;
mul.wide.s32 %rd2117, %r6674, 4;
add.s64 %rd2118, %rd2, %rd2117;
ld.global.f32 %f5345, [%rd2118];
$L__BB0_1464:
add.s32 %r1935, %r1932, %r2603;
@%p1222 bra $L__BB0_1467;
shl.b32 %r1936, %r12, 5;
neg.s32 %r6676, %r1936;
setp.ge.s32 %p1239, %r14, %r6676;
@%p1239 bra $L__BB0_1467;
add.s32 %r6677, %r1936, %r1;
mul.hi.s32 %r6678, %r6677, 954437177;
shr.u32 %r6679, %r6678, 31;
shr.s32 %r6680, %r6678, 1;
add.s32 %r6681, %r6680, %r6679;
mul.lo.s32 %r6682, %r6681, 9;
sub.s32 %r6683, %r6677, %r6682;
mad.lo.s32 %r6684, %r6681, %r2605, %r1935;
mad.lo.s32 %r6685, %r6683, %r2606, %r6684;
mul.wide.s32 %rd2119, %r6685, 4;
add.s64 %rd2120, %rd2, %rd2119;
ld.global.f32 %f5344, [%rd2120];
$L__BB0_1467:
@%p1222 bra $L__BB0_1470;
shl.b32 %r1937, %r12, 5;
mov.u32 %r6687, -32;
sub.s32 %r6688, %r6687, %r1937;
setp.ge.s32 %p1241, %r14, %r6688;
@%p1241 bra $L__BB0_1470;
add.s32 %r6689, %r1937, %r1;
add.s32 %r6690, %r6689, 32;
mul.hi.s32 %r6691, %r6690, 954437177;
shr.u32 %r6692, %r6691, 31;
shr.s32 %r6693, %r6691, 1;
add.s32 %r6694, %r6693, %r6692;
mul.lo.s32 %r6695, %r6694, 9;
sub.s32 %r6696, %r6690, %r6695;
mad.lo.s32 %r6697, %r6694, %r2605, %r1935;
mad.lo.s32 %r6698, %r6696, %r2606, %r6697;
mul.wide.s32 %rd2121, %r6698, 4;
add.s64 %rd2122, %rd2, %rd2121;
ld.global.f32 %f5343, [%rd2122];
$L__BB0_1470:
mad.lo.s32 %r6700, %r2602, 5, %r1927;
add.s32 %r1938, %r6700, %r1928;
@%p1226 bra $L__BB0_1473;
shl.b32 %r1939, %r12, 5;
neg.s32 %r6701, %r1939;
setp.ge.s32 %p1243, %r14, %r6701;
@%p1243 bra $L__BB0_1473;
add.s32 %r6702, %r1939, %r1;
mul.hi.s32 %r6703, %r6702, 954437177;
shr.u32 %r6704, %r6703, 31;
shr.s32 %r6705, %r6703, 1;
add.s32 %r6706, %r6705, %r6704;
mul.lo.s32 %r6707, %r6706, 9;
sub.s32 %r6708, %r6702, %r6707;
mad.lo.s32 %r6709, %r6706, %r2605, %r1938;
mad.lo.s32 %r6710, %r6708, %r2606, %r6709;
mul.wide.s32 %rd2123, %r6710, 4;
add.s64 %rd2124, %rd2, %rd2123;
ld.global.f32 %f5342, [%rd2124];
$L__BB0_1473:
@%p1226 bra $L__BB0_1476;
shl.b32 %r1940, %r12, 5;
mov.u32 %r6712, -32;
sub.s32 %r6713, %r6712, %r1940;
setp.ge.s32 %p1245, %r14, %r6713;
@%p1245 bra $L__BB0_1476;
add.s32 %r6714, %r1940, %r1;
add.s32 %r6715, %r6714, 32;
mul.hi.s32 %r6716, %r6715, 954437177;
shr.u32 %r6717, %r6716, 31;
shr.s32 %r6718, %r6716, 1;
add.s32 %r6719, %r6718, %r6717;
mul.lo.s32 %r6720, %r6719, 9;
sub.s32 %r6721, %r6715, %r6720;
mad.lo.s32 %r6722, %r6719, %r2605, %r1938;
mad.lo.s32 %r6723, %r6721, %r2606, %r6722;
mul.wide.s32 %rd2125, %r6723, 4;
add.s64 %rd2126, %rd2, %rd2125;
ld.global.f32 %f5341, [%rd2126];
$L__BB0_1476:
@%p1214 bra $L__BB0_1505;
shl.b32 %r6725, %r12, 5;
neg.s32 %r1941, %r6725;
setp.ge.s32 %p1247, %r14, %r1941;
@%p1247 bra $L__BB0_1490;
mul.f32 %f4324, %f5348, 0f3F22F983;
cvt.rni.s32.f32 %r8613, %f4324;
cvt.rn.f32.s32 %f4325, %r8613;
mov.f32 %f4326, 0fBFC90FDA;
fma.rn.f32 %f4327, %f4325, %f4326, %f5348;
mov.f32 %f4328, 0fB3A22168;
fma.rn.f32 %f4329, %f4325, %f4328, %f4327;
mov.f32 %f4330, 0fA7C234C5;
fma.rn.f32 %f5751, %f4325, %f4330, %f4329;
abs.f32 %f1659, %f5348;
setp.ltu.f32 %p1248, %f1659, 0f47CE4780;
@%p1248 bra $L__BB0_1486;
setp.eq.f32 %p1249, %f1659, 0f7F800000;
@%p1249 bra $L__BB0_1485;
bra.uni $L__BB0_1480;
$L__BB0_1485:
mov.f32 %f4333, 0f00000000;
mul.rn.f32 %f5751, %f5348, %f4333;
mov.u32 %r8613, 0;
bra.uni $L__BB0_1486;
$L__BB0_1710:
mov.b32 %r2246, %f1933;
shr.u32 %r7309, %r2246, 23;
and.b32 %r7310, %r7309, 255;
add.s32 %r2247, %r7310, -128;
shl.b32 %r7311, %r2246, 8;
or.b32 %r2248, %r7311, -2147483648;
shr.u32 %r2249, %r2247, 5;
mov.u64 %rd2750, 0;
mov.u32 %r8674, 0;
mov.u64 %rd2748, __cudart_i2opi_f;
mov.u64 %rd2749, %rd1;
$L__BB0_1711:
.pragma "nounroll";
ld.global.nc.u32 %r7312, [%rd2748];
mad.wide.u32 %rd2315, %r7312, %r2248, %rd2750;
shr.u64 %rd2750, %rd2315, 32;
st.local.u32 [%rd2749], %rd2315;
add.s64 %rd2749, %rd2749, 4;
add.s64 %rd2748, %rd2748, 4;
add.s32 %r8674, %r8674, 1;
setp.ne.s32 %p1449, %r8674, 6;
@%p1449 bra $L__BB0_1711;
st.local.u32 [%rd5], %rd2750;
mov.u32 %r7313, 4;
sub.s32 %r2252, %r7313, %r2249;
mov.u32 %r7314, 6;
sub.s32 %r7315, %r7314, %r2249;
mul.wide.s32 %rd2316, %r7315, 4;
add.s64 %rd2317, %rd1, %rd2316;
ld.local.u32 %r8675, [%rd2317];
ld.local.u32 %r8676, [%rd2317+-4];
and.b32 %r2255, %r2247, 31;
setp.eq.s32 %p1450, %r2255, 0;
@%p1450 bra $L__BB0_1714;
mov.u32 %r7316, 32;
sub.s32 %r7317, %r7316, %r2255;
shr.u32 %r7318, %r8676, %r7317;
shl.b32 %r7319, %r8675, %r2255;
add.s32 %r8675, %r7318, %r7319;
mul.wide.s32 %rd2318, %r2252, 4;
add.s64 %rd2319, %rd1, %rd2318;
ld.local.u32 %r7320, [%rd2319];
shr.u32 %r7321, %r7320, %r7317;
shl.b32 %r7322, %r8676, %r2255;
add.s32 %r8676, %r7321, %r7322;
$L__BB0_1714:
and.b32 %r7323, %r2246, -2147483648;
shr.u32 %r7324, %r8676, 30;
shl.b32 %r7325, %r8675, 2;
or.b32 %r7326, %r7324, %r7325;
shr.u32 %r7327, %r7326, 31;
shr.u32 %r7328, %r8675, 30;
add.s32 %r7329, %r7327, %r7328;
neg.s32 %r7330, %r7329;
setp.eq.s32 %p1451, %r7323, 0;
selp.b32 %r8677, %r7329, %r7330, %p1451;
setp.ne.s32 %p1452, %r7327, 0;
xor.b32 %r7331, %r7323, -2147483648;
selp.b32 %r7332, %r7331, %r7323, %p1452;
selp.b32 %r7333, -1, 0, %p1452;
xor.b32 %r7334, %r7326, %r7333;
shl.b32 %r7335, %r8676, 2;
xor.b32 %r7336, %r7335, %r7333;
cvt.u64.u32 %rd2320, %r7334;
cvt.u64.u32 %rd2321, %r7336;
bfi.b64 %rd2322, %rd2320, %rd2321, 32, 32;
cvt.rn.f64.s64 %fd225, %rd2322;
mul.f64 %fd226, %fd225, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4682, %fd226;
setp.eq.s32 %p1453, %r7332, 0;
neg.f32 %f4683, %f4682;
selp.f32 %f5850, %f4682, %f4683, %p1453;
$L__BB0_1716:
and.b32 %r2262, %r8677, 1;
setp.eq.s32 %p1454, %r2262, 0;
selp.f32 %f1946, %f5850, 0f3F800000, %p1454;
mul.rn.f32 %f1947, %f5850, %f5850;
mov.f32 %f5851, 0fB94D4153;
@%p1454 bra $L__BB0_1718;
mov.f32 %f4686, 0fBAB607ED;
mov.f32 %f4687, 0f37CBAC00;
fma.rn.f32 %f5851, %f4687, %f1947, %f4686;
$L__BB0_1718:
selp.f32 %f4688, 0f3C0885E4, 0f3D2AAABB, %p1454;
fma.rn.f32 %f4689, %f5851, %f1947, %f4688;
selp.f32 %f4690, 0fBE2AAAA8, 0fBEFFFFFF, %p1454;
fma.rn.f32 %f4691, %f4689, %f1947, %f4690;
mov.f32 %f4692, 0f00000000;
fma.rn.f32 %f4693, %f1947, %f1946, %f4692;
fma.rn.f32 %f5852, %f4691, %f4693, %f1946;
and.b32 %r7338, %r8677, 2;
setp.eq.s32 %p1456, %r7338, 0;
@%p1456 bra $L__BB0_1720;
mov.f32 %f4695, 0fBF800000;
fma.rn.f32 %f5852, %f5852, %f4695, %f4692;
$L__BB0_1720:
mul.f32 %f4696, %f1925, 0f3F22F983;
cvt.rni.s32.f32 %r8681, %f4696;
cvt.rn.f32.s32 %f4697, %r8681;
mov.f32 %f4698, 0fBFC90FDA;
fma.rn.f32 %f4699, %f4697, %f4698, %f1925;
mov.f32 %f4700, 0fB3A22168;
fma.rn.f32 %f4701, %f4697, %f4700, %f4699;
mov.f32 %f4702, 0fA7C234C5;
fma.rn.f32 %f5853, %f4697, %f4702, %f4701;
abs.f32 %f1954, %f1925;
setp.ltu.f32 %p1457, %f1954, 0f47CE4780;
@%p1457 bra $L__BB0_1728;
setp.eq.f32 %p1458, %f1954, 0f7F800000;
@%p1458 bra $L__BB0_1727;
bra.uni $L__BB0_1722;
$L__BB0_1727:
mov.f32 %f4705, 0f00000000;
mul.rn.f32 %f5853, %f1925, %f4705;
mov.u32 %r8681, 0;
bra.uni $L__BB0_1728;
$L__BB0_1722:
mov.b32 %r2264, %f1925;
shr.u32 %r7340, %r2264, 23;
and.b32 %r7341, %r7340, 255;
add.s32 %r2265, %r7341, -128;
shl.b32 %r7342, %r2264, 8;
or.b32 %r2266, %r7342, -2147483648;
shr.u32 %r2267, %r2265, 5;
mov.u64 %rd2753, 0;
mov.u32 %r8678, 0;
mov.u64 %rd2751, __cudart_i2opi_f;
mov.u64 %rd2752, %rd1;
$L__BB0_1723:
.pragma "nounroll";
ld.global.nc.u32 %r7343, [%rd2751];
mad.wide.u32 %rd2325, %r7343, %r2266, %rd2753;
shr.u64 %rd2753, %rd2325, 32;
st.local.u32 [%rd2752], %rd2325;
add.s64 %rd2752, %rd2752, 4;
add.s64 %rd2751, %rd2751, 4;
add.s32 %r8678, %r8678, 1;
setp.ne.s32 %p1459, %r8678, 6;
@%p1459 bra $L__BB0_1723;
st.local.u32 [%rd5], %rd2753;
mov.u32 %r7344, 4;
sub.s32 %r2270, %r7344, %r2267;
mov.u32 %r7345, 6;
sub.s32 %r7346, %r7345, %r2267;
mul.wide.s32 %rd2326, %r7346, 4;
add.s64 %rd2327, %rd1, %rd2326;
ld.local.u32 %r8679, [%rd2327];
ld.local.u32 %r8680, [%rd2327+-4];
and.b32 %r2273, %r2265, 31;
setp.eq.s32 %p1460, %r2273, 0;
@%p1460 bra $L__BB0_1726;
mov.u32 %r7347, 32;
sub.s32 %r7348, %r7347, %r2273;
shr.u32 %r7349, %r8680, %r7348;
shl.b32 %r7350, %r8679, %r2273;
add.s32 %r8679, %r7349, %r7350;
mul.wide.s32 %rd2328, %r2270, 4;
add.s64 %rd2329, %rd1, %rd2328;
ld.local.u32 %r7351, [%rd2329];
shr.u32 %r7352, %r7351, %r7348;
shl.b32 %r7353, %r8680, %r2273;
add.s32 %r8680, %r7352, %r7353;
$L__BB0_1726:
and.b32 %r7354, %r2264, -2147483648;
shr.u32 %r7355, %r8680, 30;
shl.b32 %r7356, %r8679, 2;
or.b32 %r7357, %r7355, %r7356;
shr.u32 %r7358, %r7357, 31;
shr.u32 %r7359, %r8679, 30;
add.s32 %r7360, %r7358, %r7359;
neg.s32 %r7361, %r7360;
setp.eq.s32 %p1461, %r7354, 0;
selp.b32 %r8681, %r7360, %r7361, %p1461;
setp.ne.s32 %p1462, %r7358, 0;
xor.b32 %r7362, %r7354, -2147483648;
selp.b32 %r7363, %r7362, %r7354, %p1462;
selp.b32 %r7364, -1, 0, %p1462;
xor.b32 %r7365, %r7357, %r7364;
shl.b32 %r7366, %r8680, 2;
xor.b32 %r7367, %r7366, %r7364;
cvt.u64.u32 %rd2330, %r7365;
cvt.u64.u32 %rd2331, %r7367;
bfi.b64 %rd2332, %rd2330, %rd2331, 32, 32;
cvt.rn.f64.s64 %fd227, %rd2332;
mul.f64 %fd228, %fd227, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4703, %fd228;
setp.eq.s32 %p1463, %r7363, 0;
neg.f32 %f4704, %f4703;
selp.f32 %f5853, %f4703, %f4704, %p1463;
$L__BB0_1728:
add.s32 %r2280, %r8681, 1;
and.b32 %r2281, %r2280, 1;
setp.eq.s32 %p1464, %r2281, 0;
selp.f32 %f1958, %f5853, 0f3F800000, %p1464;
mul.rn.f32 %f1959, %f5853, %f5853;
mov.f32 %f5854, 0fB94D4153;
@%p1464 bra $L__BB0_1730;
mov.f32 %f4707, 0fBAB607ED;
mov.f32 %f4708, 0f37CBAC00;
fma.rn.f32 %f5854, %f4708, %f1959, %f4707;
$L__BB0_1730:
selp.f32 %f4709, 0f3C0885E4, 0f3D2AAABB, %p1464;
fma.rn.f32 %f4710, %f5854, %f1959, %f4709;
selp.f32 %f4711, 0fBE2AAAA8, 0fBEFFFFFF, %p1464;
fma.rn.f32 %f4712, %f4710, %f1959, %f4711;
mov.f32 %f4713, 0f00000000;
fma.rn.f32 %f4714, %f1959, %f1958, %f4713;
fma.rn.f32 %f5855, %f4712, %f4714, %f1958;
and.b32 %r7369, %r2280, 2;
setp.eq.s32 %p1466, %r7369, 0;
@%p1466 bra $L__BB0_1732;
mov.f32 %f4716, 0fBF800000;
fma.rn.f32 %f5855, %f5855, %f4716, %f4713;
$L__BB0_1732:
add.f32 %f5905, %f5852, %f5855;
mul.f32 %f4717, %f1934, 0f3F22F983;
cvt.rni.s32.f32 %r8685, %f4717;
cvt.rn.f32.s32 %f4718, %r8685;
mov.f32 %f4719, 0fBFC90FDA;
fma.rn.f32 %f4720, %f4718, %f4719, %f1934;
mov.f32 %f4721, 0fB3A22168;
fma.rn.f32 %f4722, %f4718, %f4721, %f4720;
mov.f32 %f4723, 0fA7C234C5;
fma.rn.f32 %f5856, %f4718, %f4723, %f4722;
abs.f32 %f1967, %f1934;
setp.ltu.f32 %p1467, %f1967, 0f47CE4780;
@%p1467 bra $L__BB0_1740;
setp.eq.f32 %p1468, %f1967, 0f7F800000;
@%p1468 bra $L__BB0_1739;
bra.uni $L__BB0_1734;
$L__BB0_1739:
mov.f32 %f4726, 0f00000000;
mul.rn.f32 %f5856, %f1934, %f4726;
mov.u32 %r8685, 0;
bra.uni $L__BB0_1740;
$L__BB0_1734:
mov.b32 %r2283, %f1934;
shr.u32 %r7371, %r2283, 23;
and.b32 %r7372, %r7371, 255;
add.s32 %r2284, %r7372, -128;
shl.b32 %r7373, %r2283, 8;
or.b32 %r2285, %r7373, -2147483648;
shr.u32 %r2286, %r2284, 5;
mov.u64 %rd2756, 0;
mov.u32 %r8682, 0;
mov.u64 %rd2754, __cudart_i2opi_f;
mov.u64 %rd2755, %rd1;
$L__BB0_1735:
.pragma "nounroll";
ld.global.nc.u32 %r7374, [%rd2754];
mad.wide.u32 %rd2335, %r7374, %r2285, %rd2756;
shr.u64 %rd2756, %rd2335, 32;
st.local.u32 [%rd2755], %rd2335;
add.s64 %rd2755, %rd2755, 4;
add.s64 %rd2754, %rd2754, 4;
add.s32 %r8682, %r8682, 1;
setp.ne.s32 %p1469, %r8682, 6;
@%p1469 bra $L__BB0_1735;
st.local.u32 [%rd5], %rd2756;
mov.u32 %r7375, 4;
sub.s32 %r2289, %r7375, %r2286;
mov.u32 %r7376, 6;
sub.s32 %r7377, %r7376, %r2286;
mul.wide.s32 %rd2336, %r7377, 4;
add.s64 %rd2337, %rd1, %rd2336;
ld.local.u32 %r8683, [%rd2337];
ld.local.u32 %r8684, [%rd2337+-4];
and.b32 %r2292, %r2284, 31;
setp.eq.s32 %p1470, %r2292, 0;
@%p1470 bra $L__BB0_1738;
mov.u32 %r7378, 32;
sub.s32 %r7379, %r7378, %r2292;
shr.u32 %r7380, %r8684, %r7379;
shl.b32 %r7381, %r8683, %r2292;
add.s32 %r8683, %r7380, %r7381;
mul.wide.s32 %rd2338, %r2289, 4;
add.s64 %rd2339, %rd1, %rd2338;
ld.local.u32 %r7382, [%rd2339];
shr.u32 %r7383, %r7382, %r7379;
shl.b32 %r7384, %r8684, %r2292;
add.s32 %r8684, %r7383, %r7384;
$L__BB0_1738:
and.b32 %r7385, %r2283, -2147483648;
shr.u32 %r7386, %r8684, 30;
shl.b32 %r7387, %r8683, 2;
or.b32 %r7388, %r7386, %r7387;
shr.u32 %r7389, %r7388, 31;
shr.u32 %r7390, %r8683, 30;
add.s32 %r7391, %r7389, %r7390;
neg.s32 %r7392, %r7391;
setp.eq.s32 %p1471, %r7385, 0;
selp.b32 %r8685, %r7391, %r7392, %p1471;
setp.ne.s32 %p1472, %r7389, 0;
xor.b32 %r7393, %r7385, -2147483648;
selp.b32 %r7394, %r7393, %r7385, %p1472;
selp.b32 %r7395, -1, 0, %p1472;
xor.b32 %r7396, %r7388, %r7395;
shl.b32 %r7397, %r8684, 2;
xor.b32 %r7398, %r7397, %r7395;
cvt.u64.u32 %rd2340, %r7396;
cvt.u64.u32 %rd2341, %r7398;
bfi.b64 %rd2342, %rd2340, %rd2341, 32, 32;
cvt.rn.f64.s64 %fd229, %rd2342;
mul.f64 %fd230, %fd229, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4724, %fd230;
setp.eq.s32 %p1473, %r7394, 0;
neg.f32 %f4725, %f4724;
selp.f32 %f5856, %f4724, %f4725, %p1473;
$L__BB0_1740:
and.b32 %r2299, %r8685, 1;
setp.eq.s32 %p1474, %r2299, 0;
selp.f32 %f1971, %f5856, 0f3F800000, %p1474;
mul.rn.f32 %f1972, %f5856, %f5856;
mov.f32 %f5857, 0fB94D4153;
@%p1474 bra $L__BB0_1742;
mov.f32 %f4728, 0fBAB607ED;
mov.f32 %f4729, 0f37CBAC00;
fma.rn.f32 %f5857, %f4729, %f1972, %f4728;
$L__BB0_1742:
selp.f32 %f4730, 0f3C0885E4, 0f3D2AAABB, %p1474;
fma.rn.f32 %f4731, %f5857, %f1972, %f4730;
selp.f32 %f4732, 0fBE2AAAA8, 0fBEFFFFFF, %p1474;
fma.rn.f32 %f4733, %f4731, %f1972, %f4732;
mov.f32 %f4734, 0f00000000;
fma.rn.f32 %f4735, %f1972, %f1971, %f4734;
fma.rn.f32 %f5858, %f4733, %f4735, %f1971;
and.b32 %r7400, %r8685, 2;
setp.eq.s32 %p1476, %r7400, 0;
@%p1476 bra $L__BB0_1744;
mov.f32 %f4737, 0fBF800000;
fma.rn.f32 %f5858, %f5858, %f4737, %f4734;
$L__BB0_1744:
mul.f32 %f4738, %f1926, 0f3F22F983;
cvt.rni.s32.f32 %r8689, %f4738;
cvt.rn.f32.s32 %f4739, %r8689;
mov.f32 %f4740, 0fBFC90FDA;
fma.rn.f32 %f4741, %f4739, %f4740, %f1926;
mov.f32 %f4742, 0fB3A22168;
fma.rn.f32 %f4743, %f4739, %f4742, %f4741;
mov.f32 %f4744, 0fA7C234C5;
fma.rn.f32 %f5859, %f4739, %f4744, %f4743;
abs.f32 %f1979, %f1926;
setp.ltu.f32 %p1477, %f1979, 0f47CE4780;
@%p1477 bra $L__BB0_1752;
setp.eq.f32 %p1478, %f1979, 0f7F800000;
@%p1478 bra $L__BB0_1751;
bra.uni $L__BB0_1746;
$L__BB0_1751:
mov.f32 %f4747, 0f00000000;
mul.rn.f32 %f5859, %f1926, %f4747;
mov.u32 %r8689, 0;
bra.uni $L__BB0_1752;
$L__BB0_1746:
mov.b32 %r2301, %f1926;
shr.u32 %r7402, %r2301, 23;
and.b32 %r7403, %r7402, 255;
add.s32 %r2302, %r7403, -128;
shl.b32 %r7404, %r2301, 8;
or.b32 %r2303, %r7404, -2147483648;
shr.u32 %r2304, %r2302, 5;
mov.u64 %rd2759, 0;
mov.u32 %r8686, 0;
mov.u64 %rd2757, __cudart_i2opi_f;
mov.u64 %rd2758, %rd1;
$L__BB0_1747:
.pragma "nounroll";
ld.global.nc.u32 %r7405, [%rd2757];
mad.wide.u32 %rd2345, %r7405, %r2303, %rd2759;
shr.u64 %rd2759, %rd2345, 32;
st.local.u32 [%rd2758], %rd2345;
add.s64 %rd2758, %rd2758, 4;
add.s64 %rd2757, %rd2757, 4;
add.s32 %r8686, %r8686, 1;
setp.ne.s32 %p1479, %r8686, 6;
@%p1479 bra $L__BB0_1747;
st.local.u32 [%rd5], %rd2759;
mov.u32 %r7406, 4;
sub.s32 %r2307, %r7406, %r2304;
mov.u32 %r7407, 6;
sub.s32 %r7408, %r7407, %r2304;
mul.wide.s32 %rd2346, %r7408, 4;
add.s64 %rd2347, %rd1, %rd2346;
ld.local.u32 %r8687, [%rd2347];
ld.local.u32 %r8688, [%rd2347+-4];
and.b32 %r2310, %r2302, 31;
setp.eq.s32 %p1480, %r2310, 0;
@%p1480 bra $L__BB0_1750;
mov.u32 %r7409, 32;
sub.s32 %r7410, %r7409, %r2310;
shr.u32 %r7411, %r8688, %r7410;
shl.b32 %r7412, %r8687, %r2310;
add.s32 %r8687, %r7411, %r7412;
mul.wide.s32 %rd2348, %r2307, 4;
add.s64 %rd2349, %rd1, %rd2348;
ld.local.u32 %r7413, [%rd2349];
shr.u32 %r7414, %r7413, %r7410;
shl.b32 %r7415, %r8688, %r2310;
add.s32 %r8688, %r7414, %r7415;
$L__BB0_1750:
and.b32 %r7416, %r2301, -2147483648;
shr.u32 %r7417, %r8688, 30;
shl.b32 %r7418, %r8687, 2;
or.b32 %r7419, %r7417, %r7418;
shr.u32 %r7420, %r7419, 31;
shr.u32 %r7421, %r8687, 30;
add.s32 %r7422, %r7420, %r7421;
neg.s32 %r7423, %r7422;
setp.eq.s32 %p1481, %r7416, 0;
selp.b32 %r8689, %r7422, %r7423, %p1481;
setp.ne.s32 %p1482, %r7420, 0;
xor.b32 %r7424, %r7416, -2147483648;
selp.b32 %r7425, %r7424, %r7416, %p1482;
selp.b32 %r7426, -1, 0, %p1482;
xor.b32 %r7427, %r7419, %r7426;
shl.b32 %r7428, %r8688, 2;
xor.b32 %r7429, %r7428, %r7426;
cvt.u64.u32 %rd2350, %r7427;
cvt.u64.u32 %rd2351, %r7429;
bfi.b64 %rd2352, %rd2350, %rd2351, 32, 32;
cvt.rn.f64.s64 %fd231, %rd2352;
mul.f64 %fd232, %fd231, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4745, %fd232;
setp.eq.s32 %p1483, %r7425, 0;
neg.f32 %f4746, %f4745;
selp.f32 %f5859, %f4745, %f4746, %p1483;
$L__BB0_1752:
add.s32 %r2317, %r8689, 1;
and.b32 %r2318, %r2317, 1;
setp.eq.s32 %p1484, %r2318, 0;
selp.f32 %f1983, %f5859, 0f3F800000, %p1484;
mul.rn.f32 %f1984, %f5859, %f5859;
mov.f32 %f5860, 0fB94D4153;
@%p1484 bra $L__BB0_1754;
mov.f32 %f4749, 0fBAB607ED;
mov.f32 %f4750, 0f37CBAC00;
fma.rn.f32 %f5860, %f4750, %f1984, %f4749;
$L__BB0_1754:
selp.f32 %f4751, 0f3C0885E4, 0f3D2AAABB, %p1484;
fma.rn.f32 %f4752, %f5860, %f1984, %f4751;
selp.f32 %f4753, 0fBE2AAAA8, 0fBEFFFFFF, %p1484;
fma.rn.f32 %f4754, %f4752, %f1984, %f4753;
mov.f32 %f4755, 0f00000000;
fma.rn.f32 %f4756, %f1984, %f1983, %f4755;
fma.rn.f32 %f5861, %f4754, %f4756, %f1983;
and.b32 %r7431, %r2317, 2;
setp.eq.s32 %p1486, %r7431, 0;
@%p1486 bra $L__BB0_1756;
mov.f32 %f4758, 0fBF800000;
fma.rn.f32 %f5861, %f5861, %f4758, %f4755;
$L__BB0_1756:
add.f32 %f5904, %f5858, %f5861;
mul.f32 %f4759, %f1935, 0f3F22F983;
cvt.rni.s32.f32 %r8693, %f4759;
cvt.rn.f32.s32 %f4760, %r8693;
mov.f32 %f4761, 0fBFC90FDA;
fma.rn.f32 %f4762, %f4760, %f4761, %f1935;
mov.f32 %f4763, 0fB3A22168;
fma.rn.f32 %f4764, %f4760, %f4763, %f4762;
mov.f32 %f4765, 0fA7C234C5;
fma.rn.f32 %f5862, %f4760, %f4765, %f4764;
abs.f32 %f1992, %f1935;
setp.ltu.f32 %p1487, %f1992, 0f47CE4780;
@%p1487 bra $L__BB0_1764;
setp.eq.f32 %p1488, %f1992, 0f7F800000;
@%p1488 bra $L__BB0_1763;
bra.uni $L__BB0_1758;
$L__BB0_1763:
mov.f32 %f4768, 0f00000000;
mul.rn.f32 %f5862, %f1935, %f4768;
mov.u32 %r8693, 0;
bra.uni $L__BB0_1764;
$L__BB0_1758:
mov.b32 %r2320, %f1935;
shr.u32 %r7433, %r2320, 23;
and.b32 %r7434, %r7433, 255;
add.s32 %r2321, %r7434, -128;
shl.b32 %r7435, %r2320, 8;
or.b32 %r2322, %r7435, -2147483648;
shr.u32 %r2323, %r2321, 5;
mov.u64 %rd2762, 0;
mov.u32 %r8690, 0;
mov.u64 %rd2760, __cudart_i2opi_f;
mov.u64 %rd2761, %rd1;
$L__BB0_1759:
.pragma "nounroll";
ld.global.nc.u32 %r7436, [%rd2760];
mad.wide.u32 %rd2355, %r7436, %r2322, %rd2762;
shr.u64 %rd2762, %rd2355, 32;
st.local.u32 [%rd2761], %rd2355;
add.s64 %rd2761, %rd2761, 4;
add.s64 %rd2760, %rd2760, 4;
add.s32 %r8690, %r8690, 1;
setp.ne.s32 %p1489, %r8690, 6;
@%p1489 bra $L__BB0_1759;
st.local.u32 [%rd5], %rd2762;
mov.u32 %r7437, 4;
sub.s32 %r2326, %r7437, %r2323;
mov.u32 %r7438, 6;
sub.s32 %r7439, %r7438, %r2323;
mul.wide.s32 %rd2356, %r7439, 4;
add.s64 %rd2357, %rd1, %rd2356;
ld.local.u32 %r8691, [%rd2357];
ld.local.u32 %r8692, [%rd2357+-4];
and.b32 %r2329, %r2321, 31;
setp.eq.s32 %p1490, %r2329, 0;
@%p1490 bra $L__BB0_1762;
mov.u32 %r7440, 32;
sub.s32 %r7441, %r7440, %r2329;
shr.u32 %r7442, %r8692, %r7441;
shl.b32 %r7443, %r8691, %r2329;
add.s32 %r8691, %r7442, %r7443;
mul.wide.s32 %rd2358, %r2326, 4;
add.s64 %rd2359, %rd1, %rd2358;
ld.local.u32 %r7444, [%rd2359];
shr.u32 %r7445, %r7444, %r7441;
shl.b32 %r7446, %r8692, %r2329;
add.s32 %r8692, %r7445, %r7446;
$L__BB0_1762:
and.b32 %r7447, %r2320, -2147483648;
shr.u32 %r7448, %r8692, 30;
shl.b32 %r7449, %r8691, 2;
or.b32 %r7450, %r7448, %r7449;
shr.u32 %r7451, %r7450, 31;
shr.u32 %r7452, %r8691, 30;
add.s32 %r7453, %r7451, %r7452;
neg.s32 %r7454, %r7453;
setp.eq.s32 %p1491, %r7447, 0;
selp.b32 %r8693, %r7453, %r7454, %p1491;
setp.ne.s32 %p1492, %r7451, 0;
xor.b32 %r7455, %r7447, -2147483648;
selp.b32 %r7456, %r7455, %r7447, %p1492;
selp.b32 %r7457, -1, 0, %p1492;
xor.b32 %r7458, %r7450, %r7457;
shl.b32 %r7459, %r8692, 2;
xor.b32 %r7460, %r7459, %r7457;
cvt.u64.u32 %rd2360, %r7458;
cvt.u64.u32 %rd2361, %r7460;
bfi.b64 %rd2362, %rd2360, %rd2361, 32, 32;
cvt.rn.f64.s64 %fd233, %rd2362;
mul.f64 %fd234, %fd233, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4766, %fd234;
setp.eq.s32 %p1493, %r7456, 0;
neg.f32 %f4767, %f4766;
selp.f32 %f5862, %f4766, %f4767, %p1493;
$L__BB0_1764:
and.b32 %r2336, %r8693, 1;
setp.eq.s32 %p1494, %r2336, 0;
selp.f32 %f1996, %f5862, 0f3F800000, %p1494;
mul.rn.f32 %f1997, %f5862, %f5862;
mov.f32 %f5863, 0fB94D4153;
@%p1494 bra $L__BB0_1766;
mov.f32 %f4770, 0fBAB607ED;
mov.f32 %f4771, 0f37CBAC00;
fma.rn.f32 %f5863, %f4771, %f1997, %f4770;
$L__BB0_1766:
selp.f32 %f4772, 0f3C0885E4, 0f3D2AAABB, %p1494;
fma.rn.f32 %f4773, %f5863, %f1997, %f4772;
selp.f32 %f4774, 0fBE2AAAA8, 0fBEFFFFFF, %p1494;
fma.rn.f32 %f4775, %f4773, %f1997, %f4774;
mov.f32 %f4776, 0f00000000;
fma.rn.f32 %f4777, %f1997, %f1996, %f4776;
fma.rn.f32 %f5864, %f4775, %f4777, %f1996;
and.b32 %r7462, %r8693, 2;
setp.eq.s32 %p1496, %r7462, 0;
@%p1496 bra $L__BB0_1768;
mov.f32 %f4779, 0fBF800000;
fma.rn.f32 %f5864, %f5864, %f4779, %f4776;
$L__BB0_1768:
mul.f32 %f4780, %f1927, 0f3F22F983;
cvt.rni.s32.f32 %r8697, %f4780;
cvt.rn.f32.s32 %f4781, %r8697;
mov.f32 %f4782, 0fBFC90FDA;
fma.rn.f32 %f4783, %f4781, %f4782, %f1927;
mov.f32 %f4784, 0fB3A22168;
fma.rn.f32 %f4785, %f4781, %f4784, %f4783;
mov.f32 %f4786, 0fA7C234C5;
fma.rn.f32 %f5865, %f4781, %f4786, %f4785;
abs.f32 %f2004, %f1927;
setp.ltu.f32 %p1497, %f2004, 0f47CE4780;
@%p1497 bra $L__BB0_1776;
setp.eq.f32 %p1498, %f2004, 0f7F800000;
@%p1498 bra $L__BB0_1775;
bra.uni $L__BB0_1770;
$L__BB0_1775:
mov.f32 %f4789, 0f00000000;
mul.rn.f32 %f5865, %f1927, %f4789;
mov.u32 %r8697, 0;
bra.uni $L__BB0_1776;
$L__BB0_1770:
mov.b32 %r2338, %f1927;
shr.u32 %r7464, %r2338, 23;
and.b32 %r7465, %r7464, 255;
add.s32 %r2339, %r7465, -128;
shl.b32 %r7466, %r2338, 8;
or.b32 %r2340, %r7466, -2147483648;
shr.u32 %r2341, %r2339, 5;
mov.u64 %rd2765, 0;
mov.u32 %r8694, 0;
mov.u64 %rd2763, __cudart_i2opi_f;
mov.u64 %rd2764, %rd1;
$L__BB0_1771:
.pragma "nounroll";
ld.global.nc.u32 %r7467, [%rd2763];
mad.wide.u32 %rd2365, %r7467, %r2340, %rd2765;
shr.u64 %rd2765, %rd2365, 32;
st.local.u32 [%rd2764], %rd2365;
add.s64 %rd2764, %rd2764, 4;
add.s64 %rd2763, %rd2763, 4;
add.s32 %r8694, %r8694, 1;
setp.ne.s32 %p1499, %r8694, 6;
@%p1499 bra $L__BB0_1771;
st.local.u32 [%rd5], %rd2765;
mov.u32 %r7468, 4;
sub.s32 %r2344, %r7468, %r2341;
mov.u32 %r7469, 6;
sub.s32 %r7470, %r7469, %r2341;
mul.wide.s32 %rd2366, %r7470, 4;
add.s64 %rd2367, %rd1, %rd2366;
ld.local.u32 %r8695, [%rd2367];
ld.local.u32 %r8696, [%rd2367+-4];
and.b32 %r2347, %r2339, 31;
setp.eq.s32 %p1500, %r2347, 0;
@%p1500 bra $L__BB0_1774;
mov.u32 %r7471, 32;
sub.s32 %r7472, %r7471, %r2347;
shr.u32 %r7473, %r8696, %r7472;
shl.b32 %r7474, %r8695, %r2347;
add.s32 %r8695, %r7473, %r7474;
mul.wide.s32 %rd2368, %r2344, 4;
add.s64 %rd2369, %rd1, %rd2368;
ld.local.u32 %r7475, [%rd2369];
shr.u32 %r7476, %r7475, %r7472;
shl.b32 %r7477, %r8696, %r2347;
add.s32 %r8696, %r7476, %r7477;
$L__BB0_1774:
and.b32 %r7478, %r2338, -2147483648;
shr.u32 %r7479, %r8696, 30;
shl.b32 %r7480, %r8695, 2;
or.b32 %r7481, %r7479, %r7480;
shr.u32 %r7482, %r7481, 31;
shr.u32 %r7483, %r8695, 30;
add.s32 %r7484, %r7482, %r7483;
neg.s32 %r7485, %r7484;
setp.eq.s32 %p1501, %r7478, 0;
selp.b32 %r8697, %r7484, %r7485, %p1501;
setp.ne.s32 %p1502, %r7482, 0;
xor.b32 %r7486, %r7478, -2147483648;
selp.b32 %r7487, %r7486, %r7478, %p1502;
selp.b32 %r7488, -1, 0, %p1502;
xor.b32 %r7489, %r7481, %r7488;
shl.b32 %r7490, %r8696, 2;
xor.b32 %r7491, %r7490, %r7488;
cvt.u64.u32 %rd2370, %r7489;
cvt.u64.u32 %rd2371, %r7491;
bfi.b64 %rd2372, %rd2370, %rd2371, 32, 32;
cvt.rn.f64.s64 %fd235, %rd2372;
mul.f64 %fd236, %fd235, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4787, %fd236;
setp.eq.s32 %p1503, %r7487, 0;
neg.f32 %f4788, %f4787;
selp.f32 %f5865, %f4787, %f4788, %p1503;
$L__BB0_1776:
add.s32 %r2354, %r8697, 1;
and.b32 %r2355, %r2354, 1;
setp.eq.s32 %p1504, %r2355, 0;
selp.f32 %f2008, %f5865, 0f3F800000, %p1504;
mul.rn.f32 %f2009, %f5865, %f5865;
mov.f32 %f5866, 0fB94D4153;
@%p1504 bra $L__BB0_1778;
mov.f32 %f4791, 0fBAB607ED;
mov.f32 %f4792, 0f37CBAC00;
fma.rn.f32 %f5866, %f4792, %f2009, %f4791;
$L__BB0_1778:
selp.f32 %f4793, 0f3C0885E4, 0f3D2AAABB, %p1504;
fma.rn.f32 %f4794, %f5866, %f2009, %f4793;
selp.f32 %f4795, 0fBE2AAAA8, 0fBEFFFFFF, %p1504;
fma.rn.f32 %f4796, %f4794, %f2009, %f4795;
mov.f32 %f4797, 0f00000000;
fma.rn.f32 %f4798, %f2009, %f2008, %f4797;
fma.rn.f32 %f5867, %f4796, %f4798, %f2008;
and.b32 %r7493, %r2354, 2;
setp.eq.s32 %p1506, %r7493, 0;
@%p1506 bra $L__BB0_1780;
mov.f32 %f4800, 0fBF800000;
fma.rn.f32 %f5867, %f5867, %f4800, %f4797;
$L__BB0_1780:
add.f32 %f5903, %f5864, %f5867;
mul.f32 %f4801, %f1936, 0f3F22F983;
cvt.rni.s32.f32 %r8701, %f4801;
cvt.rn.f32.s32 %f4802, %r8701;
mov.f32 %f4803, 0fBFC90FDA;
fma.rn.f32 %f4804, %f4802, %f4803, %f1936;
mov.f32 %f4805, 0fB3A22168;
fma.rn.f32 %f4806, %f4802, %f4805, %f4804;
mov.f32 %f4807, 0fA7C234C5;
fma.rn.f32 %f5868, %f4802, %f4807, %f4806;
abs.f32 %f2017, %f1936;
setp.ltu.f32 %p1507, %f2017, 0f47CE4780;
@%p1507 bra $L__BB0_1788;
setp.eq.f32 %p1508, %f2017, 0f7F800000;
@%p1508 bra $L__BB0_1787;
bra.uni $L__BB0_1782;
$L__BB0_1787:
mov.f32 %f4810, 0f00000000;
mul.rn.f32 %f5868, %f1936, %f4810;
mov.u32 %r8701, 0;
bra.uni $L__BB0_1788;
$L__BB0_1782:
mov.b32 %r2357, %f1936;
shr.u32 %r7495, %r2357, 23;
and.b32 %r7496, %r7495, 255;
add.s32 %r2358, %r7496, -128;
shl.b32 %r7497, %r2357, 8;
or.b32 %r2359, %r7497, -2147483648;
shr.u32 %r2360, %r2358, 5;
mov.u64 %rd2768, 0;
mov.u32 %r8698, 0;
mov.u64 %rd2766, __cudart_i2opi_f;
mov.u64 %rd2767, %rd1;
$L__BB0_1783:
.pragma "nounroll";
ld.global.nc.u32 %r7498, [%rd2766];
mad.wide.u32 %rd2375, %r7498, %r2359, %rd2768;
shr.u64 %rd2768, %rd2375, 32;
st.local.u32 [%rd2767], %rd2375;
add.s64 %rd2767, %rd2767, 4;
add.s64 %rd2766, %rd2766, 4;
add.s32 %r8698, %r8698, 1;
setp.ne.s32 %p1509, %r8698, 6;
@%p1509 bra $L__BB0_1783;
st.local.u32 [%rd5], %rd2768;
mov.u32 %r7499, 4;
sub.s32 %r2363, %r7499, %r2360;
mov.u32 %r7500, 6;
sub.s32 %r7501, %r7500, %r2360;
mul.wide.s32 %rd2376, %r7501, 4;
add.s64 %rd2377, %rd1, %rd2376;
ld.local.u32 %r8699, [%rd2377];
ld.local.u32 %r8700, [%rd2377+-4];
and.b32 %r2366, %r2358, 31;
setp.eq.s32 %p1510, %r2366, 0;
@%p1510 bra $L__BB0_1786;
mov.u32 %r7502, 32;
sub.s32 %r7503, %r7502, %r2366;
shr.u32 %r7504, %r8700, %r7503;
shl.b32 %r7505, %r8699, %r2366;
add.s32 %r8699, %r7504, %r7505;
mul.wide.s32 %rd2378, %r2363, 4;
add.s64 %rd2379, %rd1, %rd2378;
ld.local.u32 %r7506, [%rd2379];
shr.u32 %r7507, %r7506, %r7503;
shl.b32 %r7508, %r8700, %r2366;
add.s32 %r8700, %r7507, %r7508;
$L__BB0_1786:
and.b32 %r7509, %r2357, -2147483648;
shr.u32 %r7510, %r8700, 30;
shl.b32 %r7511, %r8699, 2;
or.b32 %r7512, %r7510, %r7511;
shr.u32 %r7513, %r7512, 31;
shr.u32 %r7514, %r8699, 30;
add.s32 %r7515, %r7513, %r7514;
neg.s32 %r7516, %r7515;
setp.eq.s32 %p1511, %r7509, 0;
selp.b32 %r8701, %r7515, %r7516, %p1511;
setp.ne.s32 %p1512, %r7513, 0;
xor.b32 %r7517, %r7509, -2147483648;
selp.b32 %r7518, %r7517, %r7509, %p1512;
selp.b32 %r7519, -1, 0, %p1512;
xor.b32 %r7520, %r7512, %r7519;
shl.b32 %r7521, %r8700, 2;
xor.b32 %r7522, %r7521, %r7519;
cvt.u64.u32 %rd2380, %r7520;
cvt.u64.u32 %rd2381, %r7522;
bfi.b64 %rd2382, %rd2380, %rd2381, 32, 32;
cvt.rn.f64.s64 %fd237, %rd2382;
mul.f64 %fd238, %fd237, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4808, %fd238;
setp.eq.s32 %p1513, %r7518, 0;
neg.f32 %f4809, %f4808;
selp.f32 %f5868, %f4808, %f4809, %p1513;
$L__BB0_1788:
and.b32 %r2373, %r8701, 1;
setp.eq.s32 %p1514, %r2373, 0;
selp.f32 %f2021, %f5868, 0f3F800000, %p1514;
mul.rn.f32 %f2022, %f5868, %f5868;
mov.f32 %f5869, 0fB94D4153;
@%p1514 bra $L__BB0_1790;
mov.f32 %f4812, 0fBAB607ED;
mov.f32 %f4813, 0f37CBAC00;
fma.rn.f32 %f5869, %f4813, %f2022, %f4812;
$L__BB0_1790:
selp.f32 %f4814, 0f3C0885E4, 0f3D2AAABB, %p1514;
fma.rn.f32 %f4815, %f5869, %f2022, %f4814;
selp.f32 %f4816, 0fBE2AAAA8, 0fBEFFFFFF, %p1514;
fma.rn.f32 %f4817, %f4815, %f2022, %f4816;
mov.f32 %f4818, 0f00000000;
fma.rn.f32 %f4819, %f2022, %f2021, %f4818;
fma.rn.f32 %f5870, %f4817, %f4819, %f2021;
and.b32 %r7524, %r8701, 2;
setp.eq.s32 %p1516, %r7524, 0;
@%p1516 bra $L__BB0_1792;
mov.f32 %f4821, 0fBF800000;
fma.rn.f32 %f5870, %f5870, %f4821, %f4818;
$L__BB0_1792:
mul.f32 %f4822, %f1928, 0f3F22F983;
cvt.rni.s32.f32 %r8705, %f4822;
cvt.rn.f32.s32 %f4823, %r8705;
mov.f32 %f4824, 0fBFC90FDA;
fma.rn.f32 %f4825, %f4823, %f4824, %f1928;
mov.f32 %f4826, 0fB3A22168;
fma.rn.f32 %f4827, %f4823, %f4826, %f4825;
mov.f32 %f4828, 0fA7C234C5;
fma.rn.f32 %f5871, %f4823, %f4828, %f4827;
abs.f32 %f2029, %f1928;
setp.ltu.f32 %p1517, %f2029, 0f47CE4780;
@%p1517 bra $L__BB0_1800;
setp.eq.f32 %p1518, %f2029, 0f7F800000;
@%p1518 bra $L__BB0_1799;
bra.uni $L__BB0_1794;
$L__BB0_1799:
mov.f32 %f4831, 0f00000000;
mul.rn.f32 %f5871, %f1928, %f4831;
mov.u32 %r8705, 0;
bra.uni $L__BB0_1800;
$L__BB0_1794:
mov.b32 %r2375, %f1928;
shr.u32 %r7526, %r2375, 23;
and.b32 %r7527, %r7526, 255;
add.s32 %r2376, %r7527, -128;
shl.b32 %r7528, %r2375, 8;
or.b32 %r2377, %r7528, -2147483648;
shr.u32 %r2378, %r2376, 5;
mov.u64 %rd2771, 0;
mov.u32 %r8702, 0;
mov.u64 %rd2769, __cudart_i2opi_f;
mov.u64 %rd2770, %rd1;
$L__BB0_1795:
.pragma "nounroll";
ld.global.nc.u32 %r7529, [%rd2769];
mad.wide.u32 %rd2385, %r7529, %r2377, %rd2771;
shr.u64 %rd2771, %rd2385, 32;
st.local.u32 [%rd2770], %rd2385;
add.s64 %rd2770, %rd2770, 4;
add.s64 %rd2769, %rd2769, 4;
add.s32 %r8702, %r8702, 1;
setp.ne.s32 %p1519, %r8702, 6;
@%p1519 bra $L__BB0_1795;
st.local.u32 [%rd5], %rd2771;
mov.u32 %r7530, 4;
sub.s32 %r2381, %r7530, %r2378;
mov.u32 %r7531, 6;
sub.s32 %r7532, %r7531, %r2378;
mul.wide.s32 %rd2386, %r7532, 4;
add.s64 %rd2387, %rd1, %rd2386;
ld.local.u32 %r8703, [%rd2387];
ld.local.u32 %r8704, [%rd2387+-4];
and.b32 %r2384, %r2376, 31;
setp.eq.s32 %p1520, %r2384, 0;
@%p1520 bra $L__BB0_1798;
mov.u32 %r7533, 32;
sub.s32 %r7534, %r7533, %r2384;
shr.u32 %r7535, %r8704, %r7534;
shl.b32 %r7536, %r8703, %r2384;
add.s32 %r8703, %r7535, %r7536;
mul.wide.s32 %rd2388, %r2381, 4;
add.s64 %rd2389, %rd1, %rd2388;
ld.local.u32 %r7537, [%rd2389];
shr.u32 %r7538, %r7537, %r7534;
shl.b32 %r7539, %r8704, %r2384;
add.s32 %r8704, %r7538, %r7539;
$L__BB0_1798:
and.b32 %r7540, %r2375, -2147483648;
shr.u32 %r7541, %r8704, 30;
shl.b32 %r7542, %r8703, 2;
or.b32 %r7543, %r7541, %r7542;
shr.u32 %r7544, %r7543, 31;
shr.u32 %r7545, %r8703, 30;
add.s32 %r7546, %r7544, %r7545;
neg.s32 %r7547, %r7546;
setp.eq.s32 %p1521, %r7540, 0;
selp.b32 %r8705, %r7546, %r7547, %p1521;
setp.ne.s32 %p1522, %r7544, 0;
xor.b32 %r7548, %r7540, -2147483648;
selp.b32 %r7549, %r7548, %r7540, %p1522;
selp.b32 %r7550, -1, 0, %p1522;
xor.b32 %r7551, %r7543, %r7550;
shl.b32 %r7552, %r8704, 2;
xor.b32 %r7553, %r7552, %r7550;
cvt.u64.u32 %rd2390, %r7551;
cvt.u64.u32 %rd2391, %r7553;
bfi.b64 %rd2392, %rd2390, %rd2391, 32, 32;
cvt.rn.f64.s64 %fd239, %rd2392;
mul.f64 %fd240, %fd239, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4829, %fd240;
setp.eq.s32 %p1523, %r7549, 0;
neg.f32 %f4830, %f4829;
selp.f32 %f5871, %f4829, %f4830, %p1523;
$L__BB0_1800:
add.s32 %r2391, %r8705, 1;
and.b32 %r2392, %r2391, 1;
setp.eq.s32 %p1524, %r2392, 0;
selp.f32 %f2033, %f5871, 0f3F800000, %p1524;
mul.rn.f32 %f2034, %f5871, %f5871;
mov.f32 %f5872, 0fB94D4153;
@%p1524 bra $L__BB0_1802;
mov.f32 %f4833, 0fBAB607ED;
mov.f32 %f4834, 0f37CBAC00;
fma.rn.f32 %f5872, %f4834, %f2034, %f4833;
$L__BB0_1802:
selp.f32 %f4835, 0f3C0885E4, 0f3D2AAABB, %p1524;
fma.rn.f32 %f4836, %f5872, %f2034, %f4835;
selp.f32 %f4837, 0fBE2AAAA8, 0fBEFFFFFF, %p1524;
fma.rn.f32 %f4838, %f4836, %f2034, %f4837;
mov.f32 %f4839, 0f00000000;
fma.rn.f32 %f4840, %f2034, %f2033, %f4839;
fma.rn.f32 %f5873, %f4838, %f4840, %f2033;
and.b32 %r7555, %r2391, 2;
setp.eq.s32 %p1526, %r7555, 0;
@%p1526 bra $L__BB0_1804;
mov.f32 %f4842, 0fBF800000;
fma.rn.f32 %f5873, %f5873, %f4842, %f4839;
$L__BB0_1804:
add.f32 %f5902, %f5870, %f5873;
mul.f32 %f4843, %f1937, 0f3F22F983;
cvt.rni.s32.f32 %r8709, %f4843;
cvt.rn.f32.s32 %f4844, %r8709;
mov.f32 %f4845, 0fBFC90FDA;
fma.rn.f32 %f4846, %f4844, %f4845, %f1937;
mov.f32 %f4847, 0fB3A22168;
fma.rn.f32 %f4848, %f4844, %f4847, %f4846;
mov.f32 %f4849, 0fA7C234C5;
fma.rn.f32 %f5874, %f4844, %f4849, %f4848;
abs.f32 %f2042, %f1937;
setp.ltu.f32 %p1527, %f2042, 0f47CE4780;
@%p1527 bra $L__BB0_1812;
setp.eq.f32 %p1528, %f2042, 0f7F800000;
@%p1528 bra $L__BB0_1811;
bra.uni $L__BB0_1806;
$L__BB0_1811:
mov.f32 %f4852, 0f00000000;
mul.rn.f32 %f5874, %f1937, %f4852;
mov.u32 %r8709, 0;
bra.uni $L__BB0_1812;
$L__BB0_1806:
mov.b32 %r2394, %f1937;
shr.u32 %r7557, %r2394, 23;
and.b32 %r7558, %r7557, 255;
add.s32 %r2395, %r7558, -128;
shl.b32 %r7559, %r2394, 8;
or.b32 %r2396, %r7559, -2147483648;
shr.u32 %r2397, %r2395, 5;
mov.u64 %rd2774, 0;
mov.u32 %r8706, 0;
mov.u64 %rd2772, __cudart_i2opi_f;
mov.u64 %rd2773, %rd1;
$L__BB0_1807:
.pragma "nounroll";
ld.global.nc.u32 %r7560, [%rd2772];
mad.wide.u32 %rd2395, %r7560, %r2396, %rd2774;
shr.u64 %rd2774, %rd2395, 32;
st.local.u32 [%rd2773], %rd2395;
add.s64 %rd2773, %rd2773, 4;
add.s64 %rd2772, %rd2772, 4;
add.s32 %r8706, %r8706, 1;
setp.ne.s32 %p1529, %r8706, 6;
@%p1529 bra $L__BB0_1807;
st.local.u32 [%rd5], %rd2774;
mov.u32 %r7561, 4;
sub.s32 %r2400, %r7561, %r2397;
mov.u32 %r7562, 6;
sub.s32 %r7563, %r7562, %r2397;
mul.wide.s32 %rd2396, %r7563, 4;
add.s64 %rd2397, %rd1, %rd2396;
ld.local.u32 %r8707, [%rd2397];
ld.local.u32 %r8708, [%rd2397+-4];
and.b32 %r2403, %r2395, 31;
setp.eq.s32 %p1530, %r2403, 0;
@%p1530 bra $L__BB0_1810;
mov.u32 %r7564, 32;
sub.s32 %r7565, %r7564, %r2403;
shr.u32 %r7566, %r8708, %r7565;
shl.b32 %r7567, %r8707, %r2403;
add.s32 %r8707, %r7566, %r7567;
mul.wide.s32 %rd2398, %r2400, 4;
add.s64 %rd2399, %rd1, %rd2398;
ld.local.u32 %r7568, [%rd2399];
shr.u32 %r7569, %r7568, %r7565;
shl.b32 %r7570, %r8708, %r2403;
add.s32 %r8708, %r7569, %r7570;
$L__BB0_1810:
and.b32 %r7571, %r2394, -2147483648;
shr.u32 %r7572, %r8708, 30;
shl.b32 %r7573, %r8707, 2;
or.b32 %r7574, %r7572, %r7573;
shr.u32 %r7575, %r7574, 31;
shr.u32 %r7576, %r8707, 30;
add.s32 %r7577, %r7575, %r7576;
neg.s32 %r7578, %r7577;
setp.eq.s32 %p1531, %r7571, 0;
selp.b32 %r8709, %r7577, %r7578, %p1531;
setp.ne.s32 %p1532, %r7575, 0;
xor.b32 %r7579, %r7571, -2147483648;
selp.b32 %r7580, %r7579, %r7571, %p1532;
selp.b32 %r7581, -1, 0, %p1532;
xor.b32 %r7582, %r7574, %r7581;
shl.b32 %r7583, %r8708, 2;
xor.b32 %r7584, %r7583, %r7581;
cvt.u64.u32 %rd2400, %r7582;
cvt.u64.u32 %rd2401, %r7584;
bfi.b64 %rd2402, %rd2400, %rd2401, 32, 32;
cvt.rn.f64.s64 %fd241, %rd2402;
mul.f64 %fd242, %fd241, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4850, %fd242;
setp.eq.s32 %p1533, %r7580, 0;
neg.f32 %f4851, %f4850;
selp.f32 %f5874, %f4850, %f4851, %p1533;
$L__BB0_1812:
and.b32 %r2410, %r8709, 1;
setp.eq.s32 %p1534, %r2410, 0;
selp.f32 %f2046, %f5874, 0f3F800000, %p1534;
mul.rn.f32 %f2047, %f5874, %f5874;
mov.f32 %f5875, 0fB94D4153;
@%p1534 bra $L__BB0_1814;
mov.f32 %f4854, 0fBAB607ED;
mov.f32 %f4855, 0f37CBAC00;
fma.rn.f32 %f5875, %f4855, %f2047, %f4854;
$L__BB0_1814:
selp.f32 %f4856, 0f3C0885E4, 0f3D2AAABB, %p1534;
fma.rn.f32 %f4857, %f5875, %f2047, %f4856;
selp.f32 %f4858, 0fBE2AAAA8, 0fBEFFFFFF, %p1534;
fma.rn.f32 %f4859, %f4857, %f2047, %f4858;
mov.f32 %f4860, 0f00000000;
fma.rn.f32 %f4861, %f2047, %f2046, %f4860;
fma.rn.f32 %f5876, %f4859, %f4861, %f2046;
and.b32 %r7586, %r8709, 2;
setp.eq.s32 %p1536, %r7586, 0;
@%p1536 bra $L__BB0_1816;
mov.f32 %f4863, 0fBF800000;
fma.rn.f32 %f5876, %f5876, %f4863, %f4860;
$L__BB0_1816:
mul.f32 %f4864, %f1929, 0f3F22F983;
cvt.rni.s32.f32 %r8713, %f4864;
cvt.rn.f32.s32 %f4865, %r8713;
mov.f32 %f4866, 0fBFC90FDA;
fma.rn.f32 %f4867, %f4865, %f4866, %f1929;
mov.f32 %f4868, 0fB3A22168;
fma.rn.f32 %f4869, %f4865, %f4868, %f4867;
mov.f32 %f4870, 0fA7C234C5;
fma.rn.f32 %f5877, %f4865, %f4870, %f4869;
abs.f32 %f2054, %f1929;
setp.ltu.f32 %p1537, %f2054, 0f47CE4780;
@%p1537 bra $L__BB0_1824;
setp.eq.f32 %p1538, %f2054, 0f7F800000;
@%p1538 bra $L__BB0_1823;
bra.uni $L__BB0_1818;
$L__BB0_1823:
mov.f32 %f4873, 0f00000000;
mul.rn.f32 %f5877, %f1929, %f4873;
mov.u32 %r8713, 0;
bra.uni $L__BB0_1824;
$L__BB0_1818:
mov.b32 %r2412, %f1929;
shr.u32 %r7588, %r2412, 23;
and.b32 %r7589, %r7588, 255;
add.s32 %r2413, %r7589, -128;
shl.b32 %r7590, %r2412, 8;
or.b32 %r2414, %r7590, -2147483648;
shr.u32 %r2415, %r2413, 5;
mov.u64 %rd2777, 0;
mov.u32 %r8710, 0;
mov.u64 %rd2775, __cudart_i2opi_f;
mov.u64 %rd2776, %rd1;
$L__BB0_1819:
.pragma "nounroll";
ld.global.nc.u32 %r7591, [%rd2775];
mad.wide.u32 %rd2405, %r7591, %r2414, %rd2777;
shr.u64 %rd2777, %rd2405, 32;
st.local.u32 [%rd2776], %rd2405;
add.s64 %rd2776, %rd2776, 4;
add.s64 %rd2775, %rd2775, 4;
add.s32 %r8710, %r8710, 1;
setp.ne.s32 %p1539, %r8710, 6;
@%p1539 bra $L__BB0_1819;
st.local.u32 [%rd5], %rd2777;
mov.u32 %r7592, 4;
sub.s32 %r2418, %r7592, %r2415;
mov.u32 %r7593, 6;
sub.s32 %r7594, %r7593, %r2415;
mul.wide.s32 %rd2406, %r7594, 4;
add.s64 %rd2407, %rd1, %rd2406;
ld.local.u32 %r8711, [%rd2407];
ld.local.u32 %r8712, [%rd2407+-4];
and.b32 %r2421, %r2413, 31;
setp.eq.s32 %p1540, %r2421, 0;
@%p1540 bra $L__BB0_1822;
mov.u32 %r7595, 32;
sub.s32 %r7596, %r7595, %r2421;
shr.u32 %r7597, %r8712, %r7596;
shl.b32 %r7598, %r8711, %r2421;
add.s32 %r8711, %r7597, %r7598;
mul.wide.s32 %rd2408, %r2418, 4;
add.s64 %rd2409, %rd1, %rd2408;
ld.local.u32 %r7599, [%rd2409];
shr.u32 %r7600, %r7599, %r7596;
shl.b32 %r7601, %r8712, %r2421;
add.s32 %r8712, %r7600, %r7601;
$L__BB0_1822:
and.b32 %r7602, %r2412, -2147483648;
shr.u32 %r7603, %r8712, 30;
shl.b32 %r7604, %r8711, 2;
or.b32 %r7605, %r7603, %r7604;
shr.u32 %r7606, %r7605, 31;
shr.u32 %r7607, %r8711, 30;
add.s32 %r7608, %r7606, %r7607;
neg.s32 %r7609, %r7608;
setp.eq.s32 %p1541, %r7602, 0;
selp.b32 %r8713, %r7608, %r7609, %p1541;
setp.ne.s32 %p1542, %r7606, 0;
xor.b32 %r7610, %r7602, -2147483648;
selp.b32 %r7611, %r7610, %r7602, %p1542;
selp.b32 %r7612, -1, 0, %p1542;
xor.b32 %r7613, %r7605, %r7612;
shl.b32 %r7614, %r8712, 2;
xor.b32 %r7615, %r7614, %r7612;
cvt.u64.u32 %rd2410, %r7613;
cvt.u64.u32 %rd2411, %r7615;
bfi.b64 %rd2412, %rd2410, %rd2411, 32, 32;
cvt.rn.f64.s64 %fd243, %rd2412;
mul.f64 %fd244, %fd243, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4871, %fd244;
setp.eq.s32 %p1543, %r7611, 0;
neg.f32 %f4872, %f4871;
selp.f32 %f5877, %f4871, %f4872, %p1543;
$L__BB0_1824:
add.s32 %r2428, %r8713, 1;
and.b32 %r2429, %r2428, 1;
setp.eq.s32 %p1544, %r2429, 0;
selp.f32 %f2058, %f5877, 0f3F800000, %p1544;
mul.rn.f32 %f2059, %f5877, %f5877;
mov.f32 %f5878, 0fB94D4153;
@%p1544 bra $L__BB0_1826;
mov.f32 %f4875, 0fBAB607ED;
mov.f32 %f4876, 0f37CBAC00;
fma.rn.f32 %f5878, %f4876, %f2059, %f4875;
$L__BB0_1826:
selp.f32 %f4877, 0f3C0885E4, 0f3D2AAABB, %p1544;
fma.rn.f32 %f4878, %f5878, %f2059, %f4877;
selp.f32 %f4879, 0fBE2AAAA8, 0fBEFFFFFF, %p1544;
fma.rn.f32 %f4880, %f4878, %f2059, %f4879;
mov.f32 %f4881, 0f00000000;
fma.rn.f32 %f4882, %f2059, %f2058, %f4881;
fma.rn.f32 %f5879, %f4880, %f4882, %f2058;
and.b32 %r7617, %r2428, 2;
setp.eq.s32 %p1546, %r7617, 0;
@%p1546 bra $L__BB0_1828;
mov.f32 %f4884, 0fBF800000;
fma.rn.f32 %f5879, %f5879, %f4884, %f4881;
$L__BB0_1828:
add.f32 %f5901, %f5876, %f5879;
mul.f32 %f4885, %f1938, 0f3F22F983;
cvt.rni.s32.f32 %r8717, %f4885;
cvt.rn.f32.s32 %f4886, %r8717;
mov.f32 %f4887, 0fBFC90FDA;
fma.rn.f32 %f4888, %f4886, %f4887, %f1938;
mov.f32 %f4889, 0fB3A22168;
fma.rn.f32 %f4890, %f4886, %f4889, %f4888;
mov.f32 %f4891, 0fA7C234C5;
fma.rn.f32 %f5880, %f4886, %f4891, %f4890;
abs.f32 %f2067, %f1938;
setp.ltu.f32 %p1547, %f2067, 0f47CE4780;
@%p1547 bra $L__BB0_1836;
setp.eq.f32 %p1548, %f2067, 0f7F800000;
@%p1548 bra $L__BB0_1835;
bra.uni $L__BB0_1830;
$L__BB0_1835:
mov.f32 %f4894, 0f00000000;
mul.rn.f32 %f5880, %f1938, %f4894;
mov.u32 %r8717, 0;
bra.uni $L__BB0_1836;
$L__BB0_1830:
mov.b32 %r2431, %f1938;
shr.u32 %r7619, %r2431, 23;
and.b32 %r7620, %r7619, 255;
add.s32 %r2432, %r7620, -128;
shl.b32 %r7621, %r2431, 8;
or.b32 %r2433, %r7621, -2147483648;
shr.u32 %r2434, %r2432, 5;
mov.u64 %rd2780, 0;
mov.u32 %r8714, 0;
mov.u64 %rd2778, __cudart_i2opi_f;
mov.u64 %rd2779, %rd1;
$L__BB0_1831:
.pragma "nounroll";
ld.global.nc.u32 %r7622, [%rd2778];
mad.wide.u32 %rd2415, %r7622, %r2433, %rd2780;
shr.u64 %rd2780, %rd2415, 32;
st.local.u32 [%rd2779], %rd2415;
add.s64 %rd2779, %rd2779, 4;
add.s64 %rd2778, %rd2778, 4;
add.s32 %r8714, %r8714, 1;
setp.ne.s32 %p1549, %r8714, 6;
@%p1549 bra $L__BB0_1831;
st.local.u32 [%rd5], %rd2780;
mov.u32 %r7623, 4;
sub.s32 %r2437, %r7623, %r2434;
mov.u32 %r7624, 6;
sub.s32 %r7625, %r7624, %r2434;
mul.wide.s32 %rd2416, %r7625, 4;
add.s64 %rd2417, %rd1, %rd2416;
ld.local.u32 %r8715, [%rd2417];
ld.local.u32 %r8716, [%rd2417+-4];
and.b32 %r2440, %r2432, 31;
setp.eq.s32 %p1550, %r2440, 0;
@%p1550 bra $L__BB0_1834;
mov.u32 %r7626, 32;
sub.s32 %r7627, %r7626, %r2440;
shr.u32 %r7628, %r8716, %r7627;
shl.b32 %r7629, %r8715, %r2440;
add.s32 %r8715, %r7628, %r7629;
mul.wide.s32 %rd2418, %r2437, 4;
add.s64 %rd2419, %rd1, %rd2418;
ld.local.u32 %r7630, [%rd2419];
shr.u32 %r7631, %r7630, %r7627;
shl.b32 %r7632, %r8716, %r2440;
add.s32 %r8716, %r7631, %r7632;
$L__BB0_1834:
and.b32 %r7633, %r2431, -2147483648;
shr.u32 %r7634, %r8716, 30;
shl.b32 %r7635, %r8715, 2;
or.b32 %r7636, %r7634, %r7635;
shr.u32 %r7637, %r7636, 31;
shr.u32 %r7638, %r8715, 30;
add.s32 %r7639, %r7637, %r7638;
neg.s32 %r7640, %r7639;
setp.eq.s32 %p1551, %r7633, 0;
selp.b32 %r8717, %r7639, %r7640, %p1551;
setp.ne.s32 %p1552, %r7637, 0;
xor.b32 %r7641, %r7633, -2147483648;
selp.b32 %r7642, %r7641, %r7633, %p1552;
selp.b32 %r7643, -1, 0, %p1552;
xor.b32 %r7644, %r7636, %r7643;
shl.b32 %r7645, %r8716, 2;
xor.b32 %r7646, %r7645, %r7643;
cvt.u64.u32 %rd2420, %r7644;
cvt.u64.u32 %rd2421, %r7646;
bfi.b64 %rd2422, %rd2420, %rd2421, 32, 32;
cvt.rn.f64.s64 %fd245, %rd2422;
mul.f64 %fd246, %fd245, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4892, %fd246;
setp.eq.s32 %p1553, %r7642, 0;
neg.f32 %f4893, %f4892;
selp.f32 %f5880, %f4892, %f4893, %p1553;
$L__BB0_1836:
and.b32 %r2447, %r8717, 1;
setp.eq.s32 %p1554, %r2447, 0;
selp.f32 %f2071, %f5880, 0f3F800000, %p1554;
mul.rn.f32 %f2072, %f5880, %f5880;
mov.f32 %f5881, 0fB94D4153;
@%p1554 bra $L__BB0_1838;
mov.f32 %f4896, 0fBAB607ED;
mov.f32 %f4897, 0f37CBAC00;
fma.rn.f32 %f5881, %f4897, %f2072, %f4896;
$L__BB0_1838:
selp.f32 %f4898, 0f3C0885E4, 0f3D2AAABB, %p1554;
fma.rn.f32 %f4899, %f5881, %f2072, %f4898;
selp.f32 %f4900, 0fBE2AAAA8, 0fBEFFFFFF, %p1554;
fma.rn.f32 %f4901, %f4899, %f2072, %f4900;
mov.f32 %f4902, 0f00000000;
fma.rn.f32 %f4903, %f2072, %f2071, %f4902;
fma.rn.f32 %f5882, %f4901, %f4903, %f2071;
and.b32 %r7648, %r8717, 2;
setp.eq.s32 %p1556, %r7648, 0;
@%p1556 bra $L__BB0_1840;
mov.f32 %f4905, 0fBF800000;
fma.rn.f32 %f5882, %f5882, %f4905, %f4902;
$L__BB0_1840:
mul.f32 %f4906, %f1930, 0f3F22F983;
cvt.rni.s32.f32 %r8721, %f4906;
cvt.rn.f32.s32 %f4907, %r8721;
mov.f32 %f4908, 0fBFC90FDA;
fma.rn.f32 %f4909, %f4907, %f4908, %f1930;
mov.f32 %f4910, 0fB3A22168;
fma.rn.f32 %f4911, %f4907, %f4910, %f4909;
mov.f32 %f4912, 0fA7C234C5;
fma.rn.f32 %f5883, %f4907, %f4912, %f4911;
abs.f32 %f2079, %f1930;
setp.ltu.f32 %p1557, %f2079, 0f47CE4780;
@%p1557 bra $L__BB0_1848;
setp.eq.f32 %p1558, %f2079, 0f7F800000;
@%p1558 bra $L__BB0_1847;
bra.uni $L__BB0_1842;
$L__BB0_1847:
mov.f32 %f4915, 0f00000000;
mul.rn.f32 %f5883, %f1930, %f4915;
mov.u32 %r8721, 0;
bra.uni $L__BB0_1848;
$L__BB0_1842:
mov.b32 %r2449, %f1930;
shr.u32 %r7650, %r2449, 23;
and.b32 %r7651, %r7650, 255;
add.s32 %r2450, %r7651, -128;
shl.b32 %r7652, %r2449, 8;
or.b32 %r2451, %r7652, -2147483648;
shr.u32 %r2452, %r2450, 5;
mov.u64 %rd2783, 0;
mov.u32 %r8718, 0;
mov.u64 %rd2781, __cudart_i2opi_f;
mov.u64 %rd2782, %rd1;
$L__BB0_1843:
.pragma "nounroll";
ld.global.nc.u32 %r7653, [%rd2781];
mad.wide.u32 %rd2425, %r7653, %r2451, %rd2783;
shr.u64 %rd2783, %rd2425, 32;
st.local.u32 [%rd2782], %rd2425;
add.s64 %rd2782, %rd2782, 4;
add.s64 %rd2781, %rd2781, 4;
add.s32 %r8718, %r8718, 1;
setp.ne.s32 %p1559, %r8718, 6;
@%p1559 bra $L__BB0_1843;
st.local.u32 [%rd5], %rd2783;
mov.u32 %r7654, 4;
sub.s32 %r2455, %r7654, %r2452;
mov.u32 %r7655, 6;
sub.s32 %r7656, %r7655, %r2452;
mul.wide.s32 %rd2426, %r7656, 4;
add.s64 %rd2427, %rd1, %rd2426;
ld.local.u32 %r8719, [%rd2427];
ld.local.u32 %r8720, [%rd2427+-4];
and.b32 %r2458, %r2450, 31;
setp.eq.s32 %p1560, %r2458, 0;
@%p1560 bra $L__BB0_1846;
mov.u32 %r7657, 32;
sub.s32 %r7658, %r7657, %r2458;
shr.u32 %r7659, %r8720, %r7658;
shl.b32 %r7660, %r8719, %r2458;
add.s32 %r8719, %r7659, %r7660;
mul.wide.s32 %rd2428, %r2455, 4;
add.s64 %rd2429, %rd1, %rd2428;
ld.local.u32 %r7661, [%rd2429];
shr.u32 %r7662, %r7661, %r7658;
shl.b32 %r7663, %r8720, %r2458;
add.s32 %r8720, %r7662, %r7663;
$L__BB0_1846:
and.b32 %r7664, %r2449, -2147483648;
shr.u32 %r7665, %r8720, 30;
shl.b32 %r7666, %r8719, 2;
or.b32 %r7667, %r7665, %r7666;
shr.u32 %r7668, %r7667, 31;
shr.u32 %r7669, %r8719, 30;
add.s32 %r7670, %r7668, %r7669;
neg.s32 %r7671, %r7670;
setp.eq.s32 %p1561, %r7664, 0;
selp.b32 %r8721, %r7670, %r7671, %p1561;
setp.ne.s32 %p1562, %r7668, 0;
xor.b32 %r7672, %r7664, -2147483648;
selp.b32 %r7673, %r7672, %r7664, %p1562;
selp.b32 %r7674, -1, 0, %p1562;
xor.b32 %r7675, %r7667, %r7674;
shl.b32 %r7676, %r8720, 2;
xor.b32 %r7677, %r7676, %r7674;
cvt.u64.u32 %rd2430, %r7675;
cvt.u64.u32 %rd2431, %r7677;
bfi.b64 %rd2432, %rd2430, %rd2431, 32, 32;
cvt.rn.f64.s64 %fd247, %rd2432;
mul.f64 %fd248, %fd247, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4913, %fd248;
setp.eq.s32 %p1563, %r7673, 0;
neg.f32 %f4914, %f4913;
selp.f32 %f5883, %f4913, %f4914, %p1563;
$L__BB0_1848:
add.s32 %r2465, %r8721, 1;
and.b32 %r2466, %r2465, 1;
setp.eq.s32 %p1564, %r2466, 0;
selp.f32 %f2083, %f5883, 0f3F800000, %p1564;
mul.rn.f32 %f2084, %f5883, %f5883;
mov.f32 %f5884, 0fB94D4153;
@%p1564 bra $L__BB0_1850;
mov.f32 %f4917, 0fBAB607ED;
mov.f32 %f4918, 0f37CBAC00;
fma.rn.f32 %f5884, %f4918, %f2084, %f4917;
$L__BB0_1850:
selp.f32 %f4919, 0f3C0885E4, 0f3D2AAABB, %p1564;
fma.rn.f32 %f4920, %f5884, %f2084, %f4919;
selp.f32 %f4921, 0fBE2AAAA8, 0fBEFFFFFF, %p1564;
fma.rn.f32 %f4922, %f4920, %f2084, %f4921;
mov.f32 %f4923, 0f00000000;
fma.rn.f32 %f4924, %f2084, %f2083, %f4923;
fma.rn.f32 %f5885, %f4922, %f4924, %f2083;
and.b32 %r7679, %r2465, 2;
setp.eq.s32 %p1566, %r7679, 0;
@%p1566 bra $L__BB0_1852;
mov.f32 %f4926, 0fBF800000;
fma.rn.f32 %f5885, %f5885, %f4926, %f4923;
$L__BB0_1852:
add.f32 %f5900, %f5882, %f5885;
mul.f32 %f4927, %f1939, 0f3F22F983;
cvt.rni.s32.f32 %r8725, %f4927;
cvt.rn.f32.s32 %f4928, %r8725;
mov.f32 %f4929, 0fBFC90FDA;
fma.rn.f32 %f4930, %f4928, %f4929, %f1939;
mov.f32 %f4931, 0fB3A22168;
fma.rn.f32 %f4932, %f4928, %f4931, %f4930;
mov.f32 %f4933, 0fA7C234C5;
fma.rn.f32 %f5886, %f4928, %f4933, %f4932;
abs.f32 %f2092, %f1939;
setp.ltu.f32 %p1567, %f2092, 0f47CE4780;
@%p1567 bra $L__BB0_1860;
setp.eq.f32 %p1568, %f2092, 0f7F800000;
@%p1568 bra $L__BB0_1859;
bra.uni $L__BB0_1854;
$L__BB0_1859:
mov.f32 %f4936, 0f00000000;
mul.rn.f32 %f5886, %f1939, %f4936;
mov.u32 %r8725, 0;
bra.uni $L__BB0_1860;
$L__BB0_1854:
mov.b32 %r2468, %f1939;
shr.u32 %r7681, %r2468, 23;
and.b32 %r7682, %r7681, 255;
add.s32 %r2469, %r7682, -128;
shl.b32 %r7683, %r2468, 8;
or.b32 %r2470, %r7683, -2147483648;
shr.u32 %r2471, %r2469, 5;
mov.u64 %rd2786, 0;
mov.u32 %r8722, 0;
mov.u64 %rd2784, __cudart_i2opi_f;
mov.u64 %rd2785, %rd1;
$L__BB0_1855:
.pragma "nounroll";
ld.global.nc.u32 %r7684, [%rd2784];
mad.wide.u32 %rd2435, %r7684, %r2470, %rd2786;
shr.u64 %rd2786, %rd2435, 32;
st.local.u32 [%rd2785], %rd2435;
add.s64 %rd2785, %rd2785, 4;
add.s64 %rd2784, %rd2784, 4;
add.s32 %r8722, %r8722, 1;
setp.ne.s32 %p1569, %r8722, 6;
@%p1569 bra $L__BB0_1855;
st.local.u32 [%rd5], %rd2786;
mov.u32 %r7685, 4;
sub.s32 %r2474, %r7685, %r2471;
mov.u32 %r7686, 6;
sub.s32 %r7687, %r7686, %r2471;
mul.wide.s32 %rd2436, %r7687, 4;
add.s64 %rd2437, %rd1, %rd2436;
ld.local.u32 %r8723, [%rd2437];
ld.local.u32 %r8724, [%rd2437+-4];
and.b32 %r2477, %r2469, 31;
setp.eq.s32 %p1570, %r2477, 0;
@%p1570 bra $L__BB0_1858;
mov.u32 %r7688, 32;
sub.s32 %r7689, %r7688, %r2477;
shr.u32 %r7690, %r8724, %r7689;
shl.b32 %r7691, %r8723, %r2477;
add.s32 %r8723, %r7690, %r7691;
mul.wide.s32 %rd2438, %r2474, 4;
add.s64 %rd2439, %rd1, %rd2438;
ld.local.u32 %r7692, [%rd2439];
shr.u32 %r7693, %r7692, %r7689;
shl.b32 %r7694, %r8724, %r2477;
add.s32 %r8724, %r7693, %r7694;
$L__BB0_1858:
and.b32 %r7695, %r2468, -2147483648;
shr.u32 %r7696, %r8724, 30;
shl.b32 %r7697, %r8723, 2;
or.b32 %r7698, %r7696, %r7697;
shr.u32 %r7699, %r7698, 31;
shr.u32 %r7700, %r8723, 30;
add.s32 %r7701, %r7699, %r7700;
neg.s32 %r7702, %r7701;
setp.eq.s32 %p1571, %r7695, 0;
selp.b32 %r8725, %r7701, %r7702, %p1571;
setp.ne.s32 %p1572, %r7699, 0;
xor.b32 %r7703, %r7695, -2147483648;
selp.b32 %r7704, %r7703, %r7695, %p1572;
selp.b32 %r7705, -1, 0, %p1572;
xor.b32 %r7706, %r7698, %r7705;
shl.b32 %r7707, %r8724, 2;
xor.b32 %r7708, %r7707, %r7705;
cvt.u64.u32 %rd2440, %r7706;
cvt.u64.u32 %rd2441, %r7708;
bfi.b64 %rd2442, %rd2440, %rd2441, 32, 32;
cvt.rn.f64.s64 %fd249, %rd2442;
mul.f64 %fd250, %fd249, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4934, %fd250;
setp.eq.s32 %p1573, %r7704, 0;
neg.f32 %f4935, %f4934;
selp.f32 %f5886, %f4934, %f4935, %p1573;
$L__BB0_1860:
and.b32 %r2484, %r8725, 1;
setp.eq.s32 %p1574, %r2484, 0;
selp.f32 %f2096, %f5886, 0f3F800000, %p1574;
mul.rn.f32 %f2097, %f5886, %f5886;
mov.f32 %f5887, 0fB94D4153;
@%p1574 bra $L__BB0_1862;
mov.f32 %f4938, 0fBAB607ED;
mov.f32 %f4939, 0f37CBAC00;
fma.rn.f32 %f5887, %f4939, %f2097, %f4938;
$L__BB0_1862:
selp.f32 %f4940, 0f3C0885E4, 0f3D2AAABB, %p1574;
fma.rn.f32 %f4941, %f5887, %f2097, %f4940;
selp.f32 %f4942, 0fBE2AAAA8, 0fBEFFFFFF, %p1574;
fma.rn.f32 %f4943, %f4941, %f2097, %f4942;
mov.f32 %f4944, 0f00000000;
fma.rn.f32 %f4945, %f2097, %f2096, %f4944;
fma.rn.f32 %f5888, %f4943, %f4945, %f2096;
and.b32 %r7710, %r8725, 2;
setp.eq.s32 %p1576, %r7710, 0;
@%p1576 bra $L__BB0_1864;
mov.f32 %f4947, 0fBF800000;
fma.rn.f32 %f5888, %f5888, %f4947, %f4944;
$L__BB0_1864:
mul.f32 %f4948, %f1931, 0f3F22F983;
cvt.rni.s32.f32 %r8729, %f4948;
cvt.rn.f32.s32 %f4949, %r8729;
mov.f32 %f4950, 0fBFC90FDA;
fma.rn.f32 %f4951, %f4949, %f4950, %f1931;
mov.f32 %f4952, 0fB3A22168;
fma.rn.f32 %f4953, %f4949, %f4952, %f4951;
mov.f32 %f4954, 0fA7C234C5;
fma.rn.f32 %f5889, %f4949, %f4954, %f4953;
abs.f32 %f2104, %f1931;
setp.ltu.f32 %p1577, %f2104, 0f47CE4780;
@%p1577 bra $L__BB0_1872;
setp.eq.f32 %p1578, %f2104, 0f7F800000;
@%p1578 bra $L__BB0_1871;
bra.uni $L__BB0_1866;
$L__BB0_1871:
mov.f32 %f4957, 0f00000000;
mul.rn.f32 %f5889, %f1931, %f4957;
mov.u32 %r8729, 0;
bra.uni $L__BB0_1872;
$L__BB0_1866:
mov.b32 %r2486, %f1931;
shr.u32 %r7712, %r2486, 23;
and.b32 %r7713, %r7712, 255;
add.s32 %r2487, %r7713, -128;
shl.b32 %r7714, %r2486, 8;
or.b32 %r2488, %r7714, -2147483648;
shr.u32 %r2489, %r2487, 5;
mov.u64 %rd2789, 0;
mov.u32 %r8726, 0;
mov.u64 %rd2787, __cudart_i2opi_f;
mov.u64 %rd2788, %rd1;
$L__BB0_1867:
.pragma "nounroll";
ld.global.nc.u32 %r7715, [%rd2787];
mad.wide.u32 %rd2445, %r7715, %r2488, %rd2789;
shr.u64 %rd2789, %rd2445, 32;
st.local.u32 [%rd2788], %rd2445;
add.s64 %rd2788, %rd2788, 4;
add.s64 %rd2787, %rd2787, 4;
add.s32 %r8726, %r8726, 1;
setp.ne.s32 %p1579, %r8726, 6;
@%p1579 bra $L__BB0_1867;
st.local.u32 [%rd5], %rd2789;
mov.u32 %r7716, 4;
sub.s32 %r2492, %r7716, %r2489;
mov.u32 %r7717, 6;
sub.s32 %r7718, %r7717, %r2489;
mul.wide.s32 %rd2446, %r7718, 4;
add.s64 %rd2447, %rd1, %rd2446;
ld.local.u32 %r8727, [%rd2447];
ld.local.u32 %r8728, [%rd2447+-4];
and.b32 %r2495, %r2487, 31;
setp.eq.s32 %p1580, %r2495, 0;
@%p1580 bra $L__BB0_1870;
mov.u32 %r7719, 32;
sub.s32 %r7720, %r7719, %r2495;
shr.u32 %r7721, %r8728, %r7720;
shl.b32 %r7722, %r8727, %r2495;
add.s32 %r8727, %r7721, %r7722;
mul.wide.s32 %rd2448, %r2492, 4;
add.s64 %rd2449, %rd1, %rd2448;
ld.local.u32 %r7723, [%rd2449];
shr.u32 %r7724, %r7723, %r7720;
shl.b32 %r7725, %r8728, %r2495;
add.s32 %r8728, %r7724, %r7725;
$L__BB0_1870:
and.b32 %r7726, %r2486, -2147483648;
shr.u32 %r7727, %r8728, 30;
shl.b32 %r7728, %r8727, 2;
or.b32 %r7729, %r7727, %r7728;
shr.u32 %r7730, %r7729, 31;
shr.u32 %r7731, %r8727, 30;
add.s32 %r7732, %r7730, %r7731;
neg.s32 %r7733, %r7732;
setp.eq.s32 %p1581, %r7726, 0;
selp.b32 %r8729, %r7732, %r7733, %p1581;
setp.ne.s32 %p1582, %r7730, 0;
xor.b32 %r7734, %r7726, -2147483648;
selp.b32 %r7735, %r7734, %r7726, %p1582;
selp.b32 %r7736, -1, 0, %p1582;
xor.b32 %r7737, %r7729, %r7736;
shl.b32 %r7738, %r8728, 2;
xor.b32 %r7739, %r7738, %r7736;
cvt.u64.u32 %rd2450, %r7737;
cvt.u64.u32 %rd2451, %r7739;
bfi.b64 %rd2452, %rd2450, %rd2451, 32, 32;
cvt.rn.f64.s64 %fd251, %rd2452;
mul.f64 %fd252, %fd251, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4955, %fd252;
setp.eq.s32 %p1583, %r7735, 0;
neg.f32 %f4956, %f4955;
selp.f32 %f5889, %f4955, %f4956, %p1583;
$L__BB0_1872:
add.s32 %r2502, %r8729, 1;
and.b32 %r2503, %r2502, 1;
setp.eq.s32 %p1584, %r2503, 0;
selp.f32 %f2108, %f5889, 0f3F800000, %p1584;
mul.rn.f32 %f2109, %f5889, %f5889;
mov.f32 %f5890, 0fB94D4153;
@%p1584 bra $L__BB0_1874;
mov.f32 %f4959, 0fBAB607ED;
mov.f32 %f4960, 0f37CBAC00;
fma.rn.f32 %f5890, %f4960, %f2109, %f4959;
$L__BB0_1874:
selp.f32 %f4961, 0f3C0885E4, 0f3D2AAABB, %p1584;
fma.rn.f32 %f4962, %f5890, %f2109, %f4961;
selp.f32 %f4963, 0fBE2AAAA8, 0fBEFFFFFF, %p1584;
fma.rn.f32 %f4964, %f4962, %f2109, %f4963;
mov.f32 %f4965, 0f00000000;
fma.rn.f32 %f4966, %f2109, %f2108, %f4965;
fma.rn.f32 %f5891, %f4964, %f4966, %f2108;
and.b32 %r7741, %r2502, 2;
setp.eq.s32 %p1586, %r7741, 0;
@%p1586 bra $L__BB0_1876;
mov.f32 %f4968, 0fBF800000;
fma.rn.f32 %f5891, %f5891, %f4968, %f4965;
$L__BB0_1876:
add.f32 %f5899, %f5888, %f5891;
mul.f32 %f4969, %f1940, 0f3F22F983;
cvt.rni.s32.f32 %r8733, %f4969;
cvt.rn.f32.s32 %f4970, %r8733;
mov.f32 %f4971, 0fBFC90FDA;
fma.rn.f32 %f4972, %f4970, %f4971, %f1940;
mov.f32 %f4973, 0fB3A22168;
fma.rn.f32 %f4974, %f4970, %f4973, %f4972;
mov.f32 %f4975, 0fA7C234C5;
fma.rn.f32 %f5892, %f4970, %f4975, %f4974;
abs.f32 %f2117, %f1940;
setp.ltu.f32 %p1587, %f2117, 0f47CE4780;
@%p1587 bra $L__BB0_1884;
setp.eq.f32 %p1588, %f2117, 0f7F800000;
@%p1588 bra $L__BB0_1883;
bra.uni $L__BB0_1878;
$L__BB0_1883:
mov.f32 %f4978, 0f00000000;
mul.rn.f32 %f5892, %f1940, %f4978;
mov.u32 %r8733, 0;
bra.uni $L__BB0_1884;
$L__BB0_1878:
mov.b32 %r2505, %f1940;
shr.u32 %r7743, %r2505, 23;
and.b32 %r7744, %r7743, 255;
add.s32 %r2506, %r7744, -128;
shl.b32 %r7745, %r2505, 8;
or.b32 %r2507, %r7745, -2147483648;
shr.u32 %r2508, %r2506, 5;
mov.u64 %rd2792, 0;
mov.u32 %r8730, 0;
mov.u64 %rd2790, __cudart_i2opi_f;
mov.u64 %rd2791, %rd1;
$L__BB0_1879:
.pragma "nounroll";
ld.global.nc.u32 %r7746, [%rd2790];
mad.wide.u32 %rd2455, %r7746, %r2507, %rd2792;
shr.u64 %rd2792, %rd2455, 32;
st.local.u32 [%rd2791], %rd2455;
add.s64 %rd2791, %rd2791, 4;
add.s64 %rd2790, %rd2790, 4;
add.s32 %r8730, %r8730, 1;
setp.ne.s32 %p1589, %r8730, 6;
@%p1589 bra $L__BB0_1879;
st.local.u32 [%rd5], %rd2792;
mov.u32 %r7747, 4;
sub.s32 %r2511, %r7747, %r2508;
mov.u32 %r7748, 6;
sub.s32 %r7749, %r7748, %r2508;
mul.wide.s32 %rd2456, %r7749, 4;
add.s64 %rd2457, %rd1, %rd2456;
ld.local.u32 %r8731, [%rd2457];
ld.local.u32 %r8732, [%rd2457+-4];
and.b32 %r2514, %r2506, 31;
setp.eq.s32 %p1590, %r2514, 0;
@%p1590 bra $L__BB0_1882;
mov.u32 %r7750, 32;
sub.s32 %r7751, %r7750, %r2514;
shr.u32 %r7752, %r8732, %r7751;
shl.b32 %r7753, %r8731, %r2514;
add.s32 %r8731, %r7752, %r7753;
mul.wide.s32 %rd2458, %r2511, 4;
add.s64 %rd2459, %rd1, %rd2458;
ld.local.u32 %r7754, [%rd2459];
shr.u32 %r7755, %r7754, %r7751;
shl.b32 %r7756, %r8732, %r2514;
add.s32 %r8732, %r7755, %r7756;
$L__BB0_1882:
and.b32 %r7757, %r2505, -2147483648;
shr.u32 %r7758, %r8732, 30;
shl.b32 %r7759, %r8731, 2;
or.b32 %r7760, %r7758, %r7759;
shr.u32 %r7761, %r7760, 31;
shr.u32 %r7762, %r8731, 30;
add.s32 %r7763, %r7761, %r7762;
neg.s32 %r7764, %r7763;
setp.eq.s32 %p1591, %r7757, 0;
selp.b32 %r8733, %r7763, %r7764, %p1591;
setp.ne.s32 %p1592, %r7761, 0;
xor.b32 %r7765, %r7757, -2147483648;
selp.b32 %r7766, %r7765, %r7757, %p1592;
selp.b32 %r7767, -1, 0, %p1592;
xor.b32 %r7768, %r7760, %r7767;
shl.b32 %r7769, %r8732, 2;
xor.b32 %r7770, %r7769, %r7767;
cvt.u64.u32 %rd2460, %r7768;
cvt.u64.u32 %rd2461, %r7770;
bfi.b64 %rd2462, %rd2460, %rd2461, 32, 32;
cvt.rn.f64.s64 %fd253, %rd2462;
mul.f64 %fd254, %fd253, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4976, %fd254;
setp.eq.s32 %p1593, %r7766, 0;
neg.f32 %f4977, %f4976;
selp.f32 %f5892, %f4976, %f4977, %p1593;
$L__BB0_1884:
and.b32 %r2521, %r8733, 1;
setp.eq.s32 %p1594, %r2521, 0;
selp.f32 %f2121, %f5892, 0f3F800000, %p1594;
mul.rn.f32 %f2122, %f5892, %f5892;
mov.f32 %f5893, 0fB94D4153;
@%p1594 bra $L__BB0_1886;
mov.f32 %f4980, 0fBAB607ED;
mov.f32 %f4981, 0f37CBAC00;
fma.rn.f32 %f5893, %f4981, %f2122, %f4980;
$L__BB0_1886:
selp.f32 %f4982, 0f3C0885E4, 0f3D2AAABB, %p1594;
fma.rn.f32 %f4983, %f5893, %f2122, %f4982;
selp.f32 %f4984, 0fBE2AAAA8, 0fBEFFFFFF, %p1594;
fma.rn.f32 %f4985, %f4983, %f2122, %f4984;
mov.f32 %f4986, 0f00000000;
fma.rn.f32 %f4987, %f2122, %f2121, %f4986;
fma.rn.f32 %f5894, %f4985, %f4987, %f2121;
and.b32 %r7772, %r8733, 2;
setp.eq.s32 %p1596, %r7772, 0;
@%p1596 bra $L__BB0_1888;
mov.f32 %f4989, 0fBF800000;
fma.rn.f32 %f5894, %f5894, %f4989, %f4986;
$L__BB0_1888:
mul.f32 %f4990, %f1932, 0f3F22F983;
cvt.rni.s32.f32 %r8737, %f4990;
cvt.rn.f32.s32 %f4991, %r8737;
mov.f32 %f4992, 0fBFC90FDA;
fma.rn.f32 %f4993, %f4991, %f4992, %f1932;
mov.f32 %f4994, 0fB3A22168;
fma.rn.f32 %f4995, %f4991, %f4994, %f4993;
mov.f32 %f4996, 0fA7C234C5;
fma.rn.f32 %f5895, %f4991, %f4996, %f4995;
abs.f32 %f2129, %f1932;
setp.ltu.f32 %p1597, %f2129, 0f47CE4780;
@%p1597 bra $L__BB0_1896;
setp.eq.f32 %p1598, %f2129, 0f7F800000;
@%p1598 bra $L__BB0_1895;
bra.uni $L__BB0_1890;
$L__BB0_1895:
mov.f32 %f4999, 0f00000000;
mul.rn.f32 %f5895, %f1932, %f4999;
mov.u32 %r8737, 0;
bra.uni $L__BB0_1896;
$L__BB0_1890:
mov.b32 %r2523, %f1932;
shr.u32 %r7774, %r2523, 23;
and.b32 %r7775, %r7774, 255;
add.s32 %r2524, %r7775, -128;
shl.b32 %r7776, %r2523, 8;
or.b32 %r2525, %r7776, -2147483648;
shr.u32 %r2526, %r2524, 5;
mov.u64 %rd2795, 0;
mov.u32 %r8734, 0;
mov.u64 %rd2793, __cudart_i2opi_f;
mov.u64 %rd2794, %rd1;
$L__BB0_1891:
.pragma "nounroll";
ld.global.nc.u32 %r7777, [%rd2793];
mad.wide.u32 %rd2465, %r7777, %r2525, %rd2795;
shr.u64 %rd2795, %rd2465, 32;
st.local.u32 [%rd2794], %rd2465;
add.s64 %rd2794, %rd2794, 4;
add.s64 %rd2793, %rd2793, 4;
add.s32 %r8734, %r8734, 1;
setp.ne.s32 %p1599, %r8734, 6;
@%p1599 bra $L__BB0_1891;
st.local.u32 [%rd5], %rd2795;
mov.u32 %r7778, 4;
sub.s32 %r2529, %r7778, %r2526;
mov.u32 %r7779, 6;
sub.s32 %r7780, %r7779, %r2526;
mul.wide.s32 %rd2466, %r7780, 4;
add.s64 %rd2467, %rd1, %rd2466;
ld.local.u32 %r8735, [%rd2467];
ld.local.u32 %r8736, [%rd2467+-4];
and.b32 %r2532, %r2524, 31;
setp.eq.s32 %p1600, %r2532, 0;
@%p1600 bra $L__BB0_1894;
mov.u32 %r7781, 32;
sub.s32 %r7782, %r7781, %r2532;
shr.u32 %r7783, %r8736, %r7782;
shl.b32 %r7784, %r8735, %r2532;
add.s32 %r8735, %r7783, %r7784;
mul.wide.s32 %rd2468, %r2529, 4;
add.s64 %rd2469, %rd1, %rd2468;
ld.local.u32 %r7785, [%rd2469];
shr.u32 %r7786, %r7785, %r7782;
shl.b32 %r7787, %r8736, %r2532;
add.s32 %r8736, %r7786, %r7787;
$L__BB0_1894:
and.b32 %r7788, %r2523, -2147483648;
shr.u32 %r7789, %r8736, 30;
shl.b32 %r7790, %r8735, 2;
or.b32 %r7791, %r7789, %r7790;
shr.u32 %r7792, %r7791, 31;
shr.u32 %r7793, %r8735, 30;
add.s32 %r7794, %r7792, %r7793;
neg.s32 %r7795, %r7794;
setp.eq.s32 %p1601, %r7788, 0;
selp.b32 %r8737, %r7794, %r7795, %p1601;
setp.ne.s32 %p1602, %r7792, 0;
xor.b32 %r7796, %r7788, -2147483648;
selp.b32 %r7797, %r7796, %r7788, %p1602;
selp.b32 %r7798, -1, 0, %p1602;
xor.b32 %r7799, %r7791, %r7798;
shl.b32 %r7800, %r8736, 2;
xor.b32 %r7801, %r7800, %r7798;
cvt.u64.u32 %rd2470, %r7799;
cvt.u64.u32 %rd2471, %r7801;
bfi.b64 %rd2472, %rd2470, %rd2471, 32, 32;
cvt.rn.f64.s64 %fd255, %rd2472;
mul.f64 %fd256, %fd255, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4997, %fd256;
setp.eq.s32 %p1603, %r7797, 0;
neg.f32 %f4998, %f4997;
selp.f32 %f5895, %f4997, %f4998, %p1603;
$L__BB0_1896:
add.s32 %r2539, %r8737, 1;
and.b32 %r2540, %r2539, 1;
setp.eq.s32 %p1604, %r2540, 0;
selp.f32 %f2133, %f5895, 0f3F800000, %p1604;
mul.rn.f32 %f2134, %f5895, %f5895;
mov.f32 %f5896, 0fB94D4153;
@%p1604 bra $L__BB0_1898;
mov.f32 %f5001, 0fBAB607ED;
mov.f32 %f5002, 0f37CBAC00;
fma.rn.f32 %f5896, %f5002, %f2134, %f5001;
$L__BB0_1898:
selp.f32 %f5003, 0f3C0885E4, 0f3D2AAABB, %p1604;
fma.rn.f32 %f5004, %f5896, %f2134, %f5003;
selp.f32 %f5005, 0fBE2AAAA8, 0fBEFFFFFF, %p1604;
fma.rn.f32 %f5006, %f5004, %f2134, %f5005;
mov.f32 %f5007, 0f00000000;
fma.rn.f32 %f5008, %f2134, %f2133, %f5007;
fma.rn.f32 %f5897, %f5006, %f5008, %f2133;
and.b32 %r7803, %r2539, 2;
setp.eq.s32 %p1606, %r7803, 0;
@%p1606 bra $L__BB0_1900;
mov.f32 %f5010, 0fBF800000;
fma.rn.f32 %f5897, %f5897, %f5010, %f5007;
$L__BB0_1900:
add.f32 %f5898, %f5894, %f5897;
bra.uni $L__BB0_1901;
$L__BB0_1480:
mov.b32 %r1943, %f5348;
shr.u32 %r6727, %r1943, 23;
and.b32 %r6728, %r6727, 255;
add.s32 %r1944, %r6728, -128;
shl.b32 %r6729, %r1943, 8;
or.b32 %r1945, %r6729, -2147483648;
shr.u32 %r1946, %r1944, 5;
mov.u64 %rd2702, 0;
mov.u32 %r8610, 0;
mov.u64 %rd2700, __cudart_i2opi_f;
mov.u64 %rd2701, %rd1;
$L__BB0_1481:
.pragma "nounroll";
ld.global.nc.u32 %r6730, [%rd2700];
mad.wide.u32 %rd2129, %r6730, %r1945, %rd2702;
shr.u64 %rd2702, %rd2129, 32;
st.local.u32 [%rd2701], %rd2129;
add.s64 %rd2701, %rd2701, 4;
add.s64 %rd2700, %rd2700, 4;
add.s32 %r8610, %r8610, 1;
setp.ne.s32 %p1250, %r8610, 6;
@%p1250 bra $L__BB0_1481;
st.local.u32 [%rd5], %rd2702;
mov.u32 %r6731, 4;
sub.s32 %r1949, %r6731, %r1946;
mov.u32 %r6732, 6;
sub.s32 %r6733, %r6732, %r1946;
mul.wide.s32 %rd2130, %r6733, 4;
add.s64 %rd2131, %rd1, %rd2130;
ld.local.u32 %r8611, [%rd2131];
ld.local.u32 %r8612, [%rd2131+-4];
and.b32 %r1952, %r1944, 31;
setp.eq.s32 %p1251, %r1952, 0;
@%p1251 bra $L__BB0_1484;
mov.u32 %r6734, 32;
sub.s32 %r6735, %r6734, %r1952;
shr.u32 %r6736, %r8612, %r6735;
shl.b32 %r6737, %r8611, %r1952;
add.s32 %r8611, %r6736, %r6737;
mul.wide.s32 %rd2132, %r1949, 4;
add.s64 %rd2133, %rd1, %rd2132;
ld.local.u32 %r6738, [%rd2133];
shr.u32 %r6739, %r6738, %r6735;
shl.b32 %r6740, %r8612, %r1952;
add.s32 %r8612, %r6739, %r6740;
$L__BB0_1484:
and.b32 %r6741, %r1943, -2147483648;
shr.u32 %r6742, %r8612, 30;
shl.b32 %r6743, %r8611, 2;
or.b32 %r6744, %r6742, %r6743;
shr.u32 %r6745, %r6744, 31;
shr.u32 %r6746, %r8611, 30;
add.s32 %r6747, %r6745, %r6746;
neg.s32 %r6748, %r6747;
setp.eq.s32 %p1252, %r6741, 0;
selp.b32 %r8613, %r6747, %r6748, %p1252;
setp.ne.s32 %p1253, %r6745, 0;
xor.b32 %r6749, %r6741, -2147483648;
selp.b32 %r6750, %r6749, %r6741, %p1253;
selp.b32 %r6751, -1, 0, %p1253;
xor.b32 %r6752, %r6744, %r6751;
shl.b32 %r6753, %r8612, 2;
xor.b32 %r6754, %r6753, %r6751;
cvt.u64.u32 %rd2134, %r6752;
cvt.u64.u32 %rd2135, %r6754;
bfi.b64 %rd2136, %rd2134, %rd2135, 32, 32;
cvt.rn.f64.s64 %fd193, %rd2136;
mul.f64 %fd194, %fd193, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4331, %fd194;
setp.eq.s32 %p1254, %r6750, 0;
neg.f32 %f4332, %f4331;
selp.f32 %f5751, %f4331, %f4332, %p1254;
$L__BB0_1486:
and.b32 %r1959, %r8613, 1;
setp.eq.s32 %p1255, %r1959, 0;
selp.f32 %f1663, %f5751, 0f3F800000, %p1255;
mul.rn.f32 %f1664, %f5751, %f5751;
mov.f32 %f5752, 0fB94D4153;
@%p1255 bra $L__BB0_1488;
mov.f32 %f4335, 0fBAB607ED;
mov.f32 %f4336, 0f37CBAC00;
fma.rn.f32 %f5752, %f4336, %f1664, %f4335;
$L__BB0_1488:
selp.f32 %f4337, 0f3C0885E4, 0f3D2AAABB, %p1255;
fma.rn.f32 %f4338, %f5752, %f1664, %f4337;
selp.f32 %f4339, 0fBE2AAAA8, 0fBEFFFFFF, %p1255;
fma.rn.f32 %f4340, %f4338, %f1664, %f4339;
mov.f32 %f4341, 0f00000000;
fma.rn.f32 %f4342, %f1664, %f1663, %f4341;
fma.rn.f32 %f5213, %f4340, %f4342, %f1663;
and.b32 %r6756, %r8613, 2;
setp.eq.s32 %p1257, %r6756, 0;
@%p1257 bra $L__BB0_1490;
mov.f32 %f4344, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f4344, %f4341;
$L__BB0_1490:
setp.lt.s32 %p24, %r14, %r1941;
@%p1247 bra $L__BB0_1503;
mul.f32 %f4345, %f5531, 0f3F22F983;
cvt.rni.s32.f32 %r8617, %f4345;
cvt.rn.f32.s32 %f4346, %r8617;
mov.f32 %f4347, 0fBFC90FDA;
fma.rn.f32 %f4348, %f4346, %f4347, %f5531;
mov.f32 %f4349, 0fB3A22168;
fma.rn.f32 %f4350, %f4346, %f4349, %f4348;
mov.f32 %f4351, 0fA7C234C5;
fma.rn.f32 %f5755, %f4346, %f4351, %f4350;
abs.f32 %f1672, %f5531;
setp.ltu.f32 %p1259, %f1672, 0f47CE4780;
@%p1259 bra $L__BB0_1499;
setp.eq.f32 %p1260, %f1672, 0f7F800000;
@%p1260 bra $L__BB0_1498;
bra.uni $L__BB0_1493;
$L__BB0_1498:
mov.f32 %f4354, 0f00000000;
mul.rn.f32 %f5755, %f5531, %f4354;
mov.u32 %r8617, 0;
bra.uni $L__BB0_1499;
$L__BB0_1493:
mov.b32 %r1961, %f5531;
shr.u32 %r6758, %r1961, 23;
and.b32 %r6759, %r6758, 255;
add.s32 %r1962, %r6759, -128;
shl.b32 %r6760, %r1961, 8;
or.b32 %r1963, %r6760, -2147483648;
shr.u32 %r1964, %r1962, 5;
mov.u64 %rd2705, 0;
mov.u32 %r8614, 0;
mov.u64 %rd2703, __cudart_i2opi_f;
mov.u64 %rd2704, %rd1;
$L__BB0_1494:
.pragma "nounroll";
ld.global.nc.u32 %r6761, [%rd2703];
mad.wide.u32 %rd2139, %r6761, %r1963, %rd2705;
shr.u64 %rd2705, %rd2139, 32;
st.local.u32 [%rd2704], %rd2139;
add.s64 %rd2704, %rd2704, 4;
add.s64 %rd2703, %rd2703, 4;
add.s32 %r8614, %r8614, 1;
setp.ne.s32 %p1261, %r8614, 6;
@%p1261 bra $L__BB0_1494;
st.local.u32 [%rd5], %rd2705;
mov.u32 %r6762, 4;
sub.s32 %r1967, %r6762, %r1964;
mov.u32 %r6763, 6;
sub.s32 %r6764, %r6763, %r1964;
mul.wide.s32 %rd2140, %r6764, 4;
add.s64 %rd2141, %rd1, %rd2140;
ld.local.u32 %r8615, [%rd2141];
ld.local.u32 %r8616, [%rd2141+-4];
and.b32 %r1970, %r1962, 31;
setp.eq.s32 %p1262, %r1970, 0;
@%p1262 bra $L__BB0_1497;
mov.u32 %r6765, 32;
sub.s32 %r6766, %r6765, %r1970;
shr.u32 %r6767, %r8616, %r6766;
shl.b32 %r6768, %r8615, %r1970;
add.s32 %r8615, %r6767, %r6768;
mul.wide.s32 %rd2142, %r1967, 4;
add.s64 %rd2143, %rd1, %rd2142;
ld.local.u32 %r6769, [%rd2143];
shr.u32 %r6770, %r6769, %r6766;
shl.b32 %r6771, %r8616, %r1970;
add.s32 %r8616, %r6770, %r6771;
$L__BB0_1497:
and.b32 %r6772, %r1961, -2147483648;
shr.u32 %r6773, %r8616, 30;
shl.b32 %r6774, %r8615, 2;
or.b32 %r6775, %r6773, %r6774;
shr.u32 %r6776, %r6775, 31;
shr.u32 %r6777, %r8615, 30;
add.s32 %r6778, %r6776, %r6777;
neg.s32 %r6779, %r6778;
setp.eq.s32 %p1263, %r6772, 0;
selp.b32 %r8617, %r6778, %r6779, %p1263;
setp.ne.s32 %p1264, %r6776, 0;
xor.b32 %r6780, %r6772, -2147483648;
selp.b32 %r6781, %r6780, %r6772, %p1264;
selp.b32 %r6782, -1, 0, %p1264;
xor.b32 %r6783, %r6775, %r6782;
shl.b32 %r6784, %r8616, 2;
xor.b32 %r6785, %r6784, %r6782;
cvt.u64.u32 %rd2144, %r6783;
cvt.u64.u32 %rd2145, %r6785;
bfi.b64 %rd2146, %rd2144, %rd2145, 32, 32;
cvt.rn.f64.s64 %fd195, %rd2146;
mul.f64 %fd196, %fd195, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4352, %fd196;
setp.eq.s32 %p1265, %r6781, 0;
neg.f32 %f4353, %f4352;
selp.f32 %f5755, %f4352, %f4353, %p1265;
$L__BB0_1499:
add.s32 %r1977, %r8617, 1;
and.b32 %r1978, %r1977, 1;
setp.eq.s32 %p1266, %r1978, 0;
selp.f32 %f1676, %f5755, 0f3F800000, %p1266;
mul.rn.f32 %f1677, %f5755, %f5755;
mov.f32 %f5756, 0fB94D4153;
@%p1266 bra $L__BB0_1501;
mov.f32 %f4356, 0fBAB607ED;
mov.f32 %f4357, 0f37CBAC00;
fma.rn.f32 %f5756, %f4357, %f1677, %f4356;
$L__BB0_1501:
selp.f32 %f4358, 0f3C0885E4, 0f3D2AAABB, %p1266;
fma.rn.f32 %f4359, %f5756, %f1677, %f4358;
selp.f32 %f4360, 0fBE2AAAA8, 0fBEFFFFFF, %p1266;
fma.rn.f32 %f4361, %f4359, %f1677, %f4360;
mov.f32 %f4362, 0f00000000;
fma.rn.f32 %f4363, %f1677, %f1676, %f4362;
fma.rn.f32 %f5215, %f4361, %f4363, %f1676;
and.b32 %r6787, %r1977, 2;
setp.eq.s32 %p1268, %r6787, 0;
@%p1268 bra $L__BB0_1503;
mov.f32 %f4365, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f4365, %f4362;
$L__BB0_1503:
selp.f32 %f1684, %f5215, %f5216, %p24;
selp.f32 %f1685, %f5213, %f5214, %p24;
@%p1247 bra $L__BB0_1505;
add.f32 %f5905, %f1685, %f1684;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1505:
@%p1214 bra $L__BB0_1534;
shl.b32 %r6789, %r12, 5;
mov.u32 %r6790, -32;
sub.s32 %r1979, %r6790, %r6789;
setp.ge.s32 %p1272, %r14, %r1979;
@%p1272 bra $L__BB0_1519;
mul.f32 %f4368, %f5347, 0f3F22F983;
cvt.rni.s32.f32 %r8621, %f4368;
cvt.rn.f32.s32 %f4369, %r8621;
mov.f32 %f4370, 0fBFC90FDA;
fma.rn.f32 %f4371, %f4369, %f4370, %f5347;
mov.f32 %f4372, 0fB3A22168;
fma.rn.f32 %f4373, %f4369, %f4372, %f4371;
mov.f32 %f4374, 0fA7C234C5;
fma.rn.f32 %f5764, %f4369, %f4374, %f4373;
abs.f32 %f1693, %f5347;
setp.ltu.f32 %p1273, %f1693, 0f47CE4780;
@%p1273 bra $L__BB0_1515;
setp.eq.f32 %p1274, %f1693, 0f7F800000;
@%p1274 bra $L__BB0_1514;
bra.uni $L__BB0_1509;
$L__BB0_1514:
mov.f32 %f4377, 0f00000000;
mul.rn.f32 %f5764, %f5347, %f4377;
mov.u32 %r8621, 0;
bra.uni $L__BB0_1515;
$L__BB0_1509:
mov.b32 %r1981, %f5347;
shr.u32 %r6792, %r1981, 23;
and.b32 %r6793, %r6792, 255;
add.s32 %r1982, %r6793, -128;
shl.b32 %r6794, %r1981, 8;
or.b32 %r1983, %r6794, -2147483648;
shr.u32 %r1984, %r1982, 5;
mov.u64 %rd2708, 0;
mov.u32 %r8618, 0;
mov.u64 %rd2706, __cudart_i2opi_f;
mov.u64 %rd2707, %rd1;
$L__BB0_1510:
.pragma "nounroll";
ld.global.nc.u32 %r6795, [%rd2706];
mad.wide.u32 %rd2149, %r6795, %r1983, %rd2708;
shr.u64 %rd2708, %rd2149, 32;
st.local.u32 [%rd2707], %rd2149;
add.s64 %rd2707, %rd2707, 4;
add.s64 %rd2706, %rd2706, 4;
add.s32 %r8618, %r8618, 1;
setp.ne.s32 %p1275, %r8618, 6;
@%p1275 bra $L__BB0_1510;
st.local.u32 [%rd5], %rd2708;
mov.u32 %r6796, 4;
sub.s32 %r1987, %r6796, %r1984;
mov.u32 %r6797, 6;
sub.s32 %r6798, %r6797, %r1984;
mul.wide.s32 %rd2150, %r6798, 4;
add.s64 %rd2151, %rd1, %rd2150;
ld.local.u32 %r8619, [%rd2151];
ld.local.u32 %r8620, [%rd2151+-4];
and.b32 %r1990, %r1982, 31;
setp.eq.s32 %p1276, %r1990, 0;
@%p1276 bra $L__BB0_1513;
mov.u32 %r6799, 32;
sub.s32 %r6800, %r6799, %r1990;
shr.u32 %r6801, %r8620, %r6800;
shl.b32 %r6802, %r8619, %r1990;
add.s32 %r8619, %r6801, %r6802;
mul.wide.s32 %rd2152, %r1987, 4;
add.s64 %rd2153, %rd1, %rd2152;
ld.local.u32 %r6803, [%rd2153];
shr.u32 %r6804, %r6803, %r6800;
shl.b32 %r6805, %r8620, %r1990;
add.s32 %r8620, %r6804, %r6805;
$L__BB0_1513:
and.b32 %r6806, %r1981, -2147483648;
shr.u32 %r6807, %r8620, 30;
shl.b32 %r6808, %r8619, 2;
or.b32 %r6809, %r6807, %r6808;
shr.u32 %r6810, %r6809, 31;
shr.u32 %r6811, %r8619, 30;
add.s32 %r6812, %r6810, %r6811;
neg.s32 %r6813, %r6812;
setp.eq.s32 %p1277, %r6806, 0;
selp.b32 %r8621, %r6812, %r6813, %p1277;
setp.ne.s32 %p1278, %r6810, 0;
xor.b32 %r6814, %r6806, -2147483648;
selp.b32 %r6815, %r6814, %r6806, %p1278;
selp.b32 %r6816, -1, 0, %p1278;
xor.b32 %r6817, %r6809, %r6816;
shl.b32 %r6818, %r8620, 2;
xor.b32 %r6819, %r6818, %r6816;
cvt.u64.u32 %rd2154, %r6817;
cvt.u64.u32 %rd2155, %r6819;
bfi.b64 %rd2156, %rd2154, %rd2155, 32, 32;
cvt.rn.f64.s64 %fd197, %rd2156;
mul.f64 %fd198, %fd197, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4375, %fd198;
setp.eq.s32 %p1279, %r6815, 0;
neg.f32 %f4376, %f4375;
selp.f32 %f5764, %f4375, %f4376, %p1279;
$L__BB0_1515:
and.b32 %r1997, %r8621, 1;
setp.eq.s32 %p1280, %r1997, 0;
selp.f32 %f1697, %f5764, 0f3F800000, %p1280;
mul.rn.f32 %f1698, %f5764, %f5764;
mov.f32 %f5765, 0fB94D4153;
@%p1280 bra $L__BB0_1517;
mov.f32 %f4379, 0fBAB607ED;
mov.f32 %f4380, 0f37CBAC00;
fma.rn.f32 %f5765, %f4380, %f1698, %f4379;
$L__BB0_1517:
selp.f32 %f4381, 0f3C0885E4, 0f3D2AAABB, %p1280;
fma.rn.f32 %f4382, %f5765, %f1698, %f4381;
selp.f32 %f4383, 0fBE2AAAA8, 0fBEFFFFFF, %p1280;
fma.rn.f32 %f4384, %f4382, %f1698, %f4383;
mov.f32 %f4385, 0f00000000;
fma.rn.f32 %f4386, %f1698, %f1697, %f4385;
fma.rn.f32 %f5213, %f4384, %f4386, %f1697;
and.b32 %r6821, %r8621, 2;
setp.eq.s32 %p1282, %r6821, 0;
@%p1282 bra $L__BB0_1519;
mov.f32 %f4388, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f4388, %f4385;
$L__BB0_1519:
setp.lt.s32 %p25, %r14, %r1979;
@%p1272 bra $L__BB0_1532;
mul.f32 %f4389, %f5339, 0f3F22F983;
cvt.rni.s32.f32 %r8625, %f4389;
cvt.rn.f32.s32 %f4390, %r8625;
mov.f32 %f4391, 0fBFC90FDA;
fma.rn.f32 %f4392, %f4390, %f4391, %f5339;
mov.f32 %f4393, 0fB3A22168;
fma.rn.f32 %f4394, %f4390, %f4393, %f4392;
mov.f32 %f4395, 0fA7C234C5;
fma.rn.f32 %f5768, %f4390, %f4395, %f4394;
abs.f32 %f1706, %f5339;
setp.ltu.f32 %p1284, %f1706, 0f47CE4780;
@%p1284 bra $L__BB0_1528;
setp.eq.f32 %p1285, %f1706, 0f7F800000;
@%p1285 bra $L__BB0_1527;
bra.uni $L__BB0_1522;
$L__BB0_1527:
mov.f32 %f4398, 0f00000000;
mul.rn.f32 %f5768, %f5339, %f4398;
mov.u32 %r8625, 0;
bra.uni $L__BB0_1528;
$L__BB0_1522:
mov.b32 %r1999, %f5339;
shr.u32 %r6823, %r1999, 23;
and.b32 %r6824, %r6823, 255;
add.s32 %r2000, %r6824, -128;
shl.b32 %r6825, %r1999, 8;
or.b32 %r2001, %r6825, -2147483648;
shr.u32 %r2002, %r2000, 5;
mov.u64 %rd2711, 0;
mov.u32 %r8622, 0;
mov.u64 %rd2709, __cudart_i2opi_f;
mov.u64 %rd2710, %rd1;
$L__BB0_1523:
.pragma "nounroll";
ld.global.nc.u32 %r6826, [%rd2709];
mad.wide.u32 %rd2159, %r6826, %r2001, %rd2711;
shr.u64 %rd2711, %rd2159, 32;
st.local.u32 [%rd2710], %rd2159;
add.s64 %rd2710, %rd2710, 4;
add.s64 %rd2709, %rd2709, 4;
add.s32 %r8622, %r8622, 1;
setp.ne.s32 %p1286, %r8622, 6;
@%p1286 bra $L__BB0_1523;
st.local.u32 [%rd5], %rd2711;
mov.u32 %r6827, 4;
sub.s32 %r2005, %r6827, %r2002;
mov.u32 %r6828, 6;
sub.s32 %r6829, %r6828, %r2002;
mul.wide.s32 %rd2160, %r6829, 4;
add.s64 %rd2161, %rd1, %rd2160;
ld.local.u32 %r8623, [%rd2161];
ld.local.u32 %r8624, [%rd2161+-4];
and.b32 %r2008, %r2000, 31;
setp.eq.s32 %p1287, %r2008, 0;
@%p1287 bra $L__BB0_1526;
mov.u32 %r6830, 32;
sub.s32 %r6831, %r6830, %r2008;
shr.u32 %r6832, %r8624, %r6831;
shl.b32 %r6833, %r8623, %r2008;
add.s32 %r8623, %r6832, %r6833;
mul.wide.s32 %rd2162, %r2005, 4;
add.s64 %rd2163, %rd1, %rd2162;
ld.local.u32 %r6834, [%rd2163];
shr.u32 %r6835, %r6834, %r6831;
shl.b32 %r6836, %r8624, %r2008;
add.s32 %r8624, %r6835, %r6836;
$L__BB0_1526:
and.b32 %r6837, %r1999, -2147483648;
shr.u32 %r6838, %r8624, 30;
shl.b32 %r6839, %r8623, 2;
or.b32 %r6840, %r6838, %r6839;
shr.u32 %r6841, %r6840, 31;
shr.u32 %r6842, %r8623, 30;
add.s32 %r6843, %r6841, %r6842;
neg.s32 %r6844, %r6843;
setp.eq.s32 %p1288, %r6837, 0;
selp.b32 %r8625, %r6843, %r6844, %p1288;
setp.ne.s32 %p1289, %r6841, 0;
xor.b32 %r6845, %r6837, -2147483648;
selp.b32 %r6846, %r6845, %r6837, %p1289;
selp.b32 %r6847, -1, 0, %p1289;
xor.b32 %r6848, %r6840, %r6847;
shl.b32 %r6849, %r8624, 2;
xor.b32 %r6850, %r6849, %r6847;
cvt.u64.u32 %rd2164, %r6848;
cvt.u64.u32 %rd2165, %r6850;
bfi.b64 %rd2166, %rd2164, %rd2165, 32, 32;
cvt.rn.f64.s64 %fd199, %rd2166;
mul.f64 %fd200, %fd199, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4396, %fd200;
setp.eq.s32 %p1290, %r6846, 0;
neg.f32 %f4397, %f4396;
selp.f32 %f5768, %f4396, %f4397, %p1290;
$L__BB0_1528:
add.s32 %r2015, %r8625, 1;
and.b32 %r2016, %r2015, 1;
setp.eq.s32 %p1291, %r2016, 0;
selp.f32 %f1710, %f5768, 0f3F800000, %p1291;
mul.rn.f32 %f1711, %f5768, %f5768;
mov.f32 %f5769, 0fB94D4153;
@%p1291 bra $L__BB0_1530;
mov.f32 %f4400, 0fBAB607ED;
mov.f32 %f4401, 0f37CBAC00;
fma.rn.f32 %f5769, %f4401, %f1711, %f4400;
$L__BB0_1530:
selp.f32 %f4402, 0f3C0885E4, 0f3D2AAABB, %p1291;
fma.rn.f32 %f4403, %f5769, %f1711, %f4402;
selp.f32 %f4404, 0fBE2AAAA8, 0fBEFFFFFF, %p1291;
fma.rn.f32 %f4405, %f4403, %f1711, %f4404;
mov.f32 %f4406, 0f00000000;
fma.rn.f32 %f4407, %f1711, %f1710, %f4406;
fma.rn.f32 %f5215, %f4405, %f4407, %f1710;
and.b32 %r6852, %r2015, 2;
setp.eq.s32 %p1293, %r6852, 0;
@%p1293 bra $L__BB0_1532;
mov.f32 %f4409, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f4409, %f4406;
$L__BB0_1532:
selp.f32 %f1718, %f5215, %f5216, %p25;
selp.f32 %f1719, %f5213, %f5214, %p25;
@%p1272 bra $L__BB0_1534;
add.f32 %f5904, %f1719, %f1718;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1534:
@%p1218 bra $L__BB0_1563;
shl.b32 %r6854, %r12, 5;
neg.s32 %r2017, %r6854;
setp.ge.s32 %p1297, %r14, %r2017;
@%p1297 bra $L__BB0_1548;
mul.f32 %f4412, %f5346, 0f3F22F983;
cvt.rni.s32.f32 %r8629, %f4412;
cvt.rn.f32.s32 %f4413, %r8629;
mov.f32 %f4414, 0fBFC90FDA;
fma.rn.f32 %f4415, %f4413, %f4414, %f5346;
mov.f32 %f4416, 0fB3A22168;
fma.rn.f32 %f4417, %f4413, %f4416, %f4415;
mov.f32 %f4418, 0fA7C234C5;
fma.rn.f32 %f5777, %f4413, %f4418, %f4417;
abs.f32 %f1727, %f5346;
setp.ltu.f32 %p1298, %f1727, 0f47CE4780;
@%p1298 bra $L__BB0_1544;
setp.eq.f32 %p1299, %f1727, 0f7F800000;
@%p1299 bra $L__BB0_1543;
bra.uni $L__BB0_1538;
$L__BB0_1543:
mov.f32 %f4421, 0f00000000;
mul.rn.f32 %f5777, %f5346, %f4421;
mov.u32 %r8629, 0;
bra.uni $L__BB0_1544;
$L__BB0_1538:
mov.b32 %r2019, %f5346;
shr.u32 %r6856, %r2019, 23;
and.b32 %r6857, %r6856, 255;
add.s32 %r2020, %r6857, -128;
shl.b32 %r6858, %r2019, 8;
or.b32 %r2021, %r6858, -2147483648;
shr.u32 %r2022, %r2020, 5;
mov.u64 %rd2714, 0;
mov.u32 %r8626, 0;
mov.u64 %rd2712, __cudart_i2opi_f;
mov.u64 %rd2713, %rd1;
$L__BB0_1539:
.pragma "nounroll";
ld.global.nc.u32 %r6859, [%rd2712];
mad.wide.u32 %rd2169, %r6859, %r2021, %rd2714;
shr.u64 %rd2714, %rd2169, 32;
st.local.u32 [%rd2713], %rd2169;
add.s64 %rd2713, %rd2713, 4;
add.s64 %rd2712, %rd2712, 4;
add.s32 %r8626, %r8626, 1;
setp.ne.s32 %p1300, %r8626, 6;
@%p1300 bra $L__BB0_1539;
st.local.u32 [%rd5], %rd2714;
mov.u32 %r6860, 4;
sub.s32 %r2025, %r6860, %r2022;
mov.u32 %r6861, 6;
sub.s32 %r6862, %r6861, %r2022;
mul.wide.s32 %rd2170, %r6862, 4;
add.s64 %rd2171, %rd1, %rd2170;
ld.local.u32 %r8627, [%rd2171];
ld.local.u32 %r8628, [%rd2171+-4];
and.b32 %r2028, %r2020, 31;
setp.eq.s32 %p1301, %r2028, 0;
@%p1301 bra $L__BB0_1542;
mov.u32 %r6863, 32;
sub.s32 %r6864, %r6863, %r2028;
shr.u32 %r6865, %r8628, %r6864;
shl.b32 %r6866, %r8627, %r2028;
add.s32 %r8627, %r6865, %r6866;
mul.wide.s32 %rd2172, %r2025, 4;
add.s64 %rd2173, %rd1, %rd2172;
ld.local.u32 %r6867, [%rd2173];
shr.u32 %r6868, %r6867, %r6864;
shl.b32 %r6869, %r8628, %r2028;
add.s32 %r8628, %r6868, %r6869;
$L__BB0_1542:
and.b32 %r6870, %r2019, -2147483648;
shr.u32 %r6871, %r8628, 30;
shl.b32 %r6872, %r8627, 2;
or.b32 %r6873, %r6871, %r6872;
shr.u32 %r6874, %r6873, 31;
shr.u32 %r6875, %r8627, 30;
add.s32 %r6876, %r6874, %r6875;
neg.s32 %r6877, %r6876;
setp.eq.s32 %p1302, %r6870, 0;
selp.b32 %r8629, %r6876, %r6877, %p1302;
setp.ne.s32 %p1303, %r6874, 0;
xor.b32 %r6878, %r6870, -2147483648;
selp.b32 %r6879, %r6878, %r6870, %p1303;
selp.b32 %r6880, -1, 0, %p1303;
xor.b32 %r6881, %r6873, %r6880;
shl.b32 %r6882, %r8628, 2;
xor.b32 %r6883, %r6882, %r6880;
cvt.u64.u32 %rd2174, %r6881;
cvt.u64.u32 %rd2175, %r6883;
bfi.b64 %rd2176, %rd2174, %rd2175, 32, 32;
cvt.rn.f64.s64 %fd201, %rd2176;
mul.f64 %fd202, %fd201, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4419, %fd202;
setp.eq.s32 %p1304, %r6879, 0;
neg.f32 %f4420, %f4419;
selp.f32 %f5777, %f4419, %f4420, %p1304;
$L__BB0_1544:
and.b32 %r2035, %r8629, 1;
setp.eq.s32 %p1305, %r2035, 0;
selp.f32 %f1731, %f5777, 0f3F800000, %p1305;
mul.rn.f32 %f1732, %f5777, %f5777;
mov.f32 %f5778, 0fB94D4153;
@%p1305 bra $L__BB0_1546;
mov.f32 %f4423, 0fBAB607ED;
mov.f32 %f4424, 0f37CBAC00;
fma.rn.f32 %f5778, %f4424, %f1732, %f4423;
$L__BB0_1546:
selp.f32 %f4425, 0f3C0885E4, 0f3D2AAABB, %p1305;
fma.rn.f32 %f4426, %f5778, %f1732, %f4425;
selp.f32 %f4427, 0fBE2AAAA8, 0fBEFFFFFF, %p1305;
fma.rn.f32 %f4428, %f4426, %f1732, %f4427;
mov.f32 %f4429, 0f00000000;
fma.rn.f32 %f4430, %f1732, %f1731, %f4429;
fma.rn.f32 %f5213, %f4428, %f4430, %f1731;
and.b32 %r6885, %r8629, 2;
setp.eq.s32 %p1307, %r6885, 0;
@%p1307 bra $L__BB0_1548;
mov.f32 %f4432, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f4432, %f4429;
$L__BB0_1548:
setp.lt.s32 %p26, %r14, %r2017;
@%p1297 bra $L__BB0_1561;
mul.f32 %f4433, %f5338, 0f3F22F983;
cvt.rni.s32.f32 %r8633, %f4433;
cvt.rn.f32.s32 %f4434, %r8633;
mov.f32 %f4435, 0fBFC90FDA;
fma.rn.f32 %f4436, %f4434, %f4435, %f5338;
mov.f32 %f4437, 0fB3A22168;
fma.rn.f32 %f4438, %f4434, %f4437, %f4436;
mov.f32 %f4439, 0fA7C234C5;
fma.rn.f32 %f5781, %f4434, %f4439, %f4438;
abs.f32 %f1740, %f5338;
setp.ltu.f32 %p1309, %f1740, 0f47CE4780;
@%p1309 bra $L__BB0_1557;
setp.eq.f32 %p1310, %f1740, 0f7F800000;
@%p1310 bra $L__BB0_1556;
bra.uni $L__BB0_1551;
$L__BB0_1556:
mov.f32 %f4442, 0f00000000;
mul.rn.f32 %f5781, %f5338, %f4442;
mov.u32 %r8633, 0;
bra.uni $L__BB0_1557;
$L__BB0_1551:
mov.b32 %r2037, %f5338;
shr.u32 %r6887, %r2037, 23;
and.b32 %r6888, %r6887, 255;
add.s32 %r2038, %r6888, -128;
shl.b32 %r6889, %r2037, 8;
or.b32 %r2039, %r6889, -2147483648;
shr.u32 %r2040, %r2038, 5;
mov.u64 %rd2717, 0;
mov.u32 %r8630, 0;
mov.u64 %rd2715, __cudart_i2opi_f;
mov.u64 %rd2716, %rd1;
$L__BB0_1552:
.pragma "nounroll";
ld.global.nc.u32 %r6890, [%rd2715];
mad.wide.u32 %rd2179, %r6890, %r2039, %rd2717;
shr.u64 %rd2717, %rd2179, 32;
st.local.u32 [%rd2716], %rd2179;
add.s64 %rd2716, %rd2716, 4;
add.s64 %rd2715, %rd2715, 4;
add.s32 %r8630, %r8630, 1;
setp.ne.s32 %p1311, %r8630, 6;
@%p1311 bra $L__BB0_1552;
st.local.u32 [%rd5], %rd2717;
mov.u32 %r6891, 4;
sub.s32 %r2043, %r6891, %r2040;
mov.u32 %r6892, 6;
sub.s32 %r6893, %r6892, %r2040;
mul.wide.s32 %rd2180, %r6893, 4;
add.s64 %rd2181, %rd1, %rd2180;
ld.local.u32 %r8631, [%rd2181];
ld.local.u32 %r8632, [%rd2181+-4];
and.b32 %r2046, %r2038, 31;
setp.eq.s32 %p1312, %r2046, 0;
@%p1312 bra $L__BB0_1555;
mov.u32 %r6894, 32;
sub.s32 %r6895, %r6894, %r2046;
shr.u32 %r6896, %r8632, %r6895;
shl.b32 %r6897, %r8631, %r2046;
add.s32 %r8631, %r6896, %r6897;
mul.wide.s32 %rd2182, %r2043, 4;
add.s64 %rd2183, %rd1, %rd2182;
ld.local.u32 %r6898, [%rd2183];
shr.u32 %r6899, %r6898, %r6895;
shl.b32 %r6900, %r8632, %r2046;
add.s32 %r8632, %r6899, %r6900;
$L__BB0_1555:
and.b32 %r6901, %r2037, -2147483648;
shr.u32 %r6902, %r8632, 30;
shl.b32 %r6903, %r8631, 2;
or.b32 %r6904, %r6902, %r6903;
shr.u32 %r6905, %r6904, 31;
shr.u32 %r6906, %r8631, 30;
add.s32 %r6907, %r6905, %r6906;
neg.s32 %r6908, %r6907;
setp.eq.s32 %p1313, %r6901, 0;
selp.b32 %r8633, %r6907, %r6908, %p1313;
setp.ne.s32 %p1314, %r6905, 0;
xor.b32 %r6909, %r6901, -2147483648;
selp.b32 %r6910, %r6909, %r6901, %p1314;
selp.b32 %r6911, -1, 0, %p1314;
xor.b32 %r6912, %r6904, %r6911;
shl.b32 %r6913, %r8632, 2;
xor.b32 %r6914, %r6913, %r6911;
cvt.u64.u32 %rd2184, %r6912;
cvt.u64.u32 %rd2185, %r6914;
bfi.b64 %rd2186, %rd2184, %rd2185, 32, 32;
cvt.rn.f64.s64 %fd203, %rd2186;
mul.f64 %fd204, %fd203, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4440, %fd204;
setp.eq.s32 %p1315, %r6910, 0;
neg.f32 %f4441, %f4440;
selp.f32 %f5781, %f4440, %f4441, %p1315;
$L__BB0_1557:
add.s32 %r2053, %r8633, 1;
and.b32 %r2054, %r2053, 1;
setp.eq.s32 %p1316, %r2054, 0;
selp.f32 %f1744, %f5781, 0f3F800000, %p1316;
mul.rn.f32 %f1745, %f5781, %f5781;
mov.f32 %f5782, 0fB94D4153;
@%p1316 bra $L__BB0_1559;
mov.f32 %f4444, 0fBAB607ED;
mov.f32 %f4445, 0f37CBAC00;
fma.rn.f32 %f5782, %f4445, %f1745, %f4444;
$L__BB0_1559:
selp.f32 %f4446, 0f3C0885E4, 0f3D2AAABB, %p1316;
fma.rn.f32 %f4447, %f5782, %f1745, %f4446;
selp.f32 %f4448, 0fBE2AAAA8, 0fBEFFFFFF, %p1316;
fma.rn.f32 %f4449, %f4447, %f1745, %f4448;
mov.f32 %f4450, 0f00000000;
fma.rn.f32 %f4451, %f1745, %f1744, %f4450;
fma.rn.f32 %f5215, %f4449, %f4451, %f1744;
and.b32 %r6916, %r2053, 2;
setp.eq.s32 %p1318, %r6916, 0;
@%p1318 bra $L__BB0_1561;
mov.f32 %f4453, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f4453, %f4450;
$L__BB0_1561:
selp.f32 %f1752, %f5215, %f5216, %p26;
selp.f32 %f1753, %f5213, %f5214, %p26;
@%p1297 bra $L__BB0_1563;
add.f32 %f5903, %f1753, %f1752;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1563:
@%p1218 bra $L__BB0_1592;
shl.b32 %r6918, %r12, 5;
mov.u32 %r6919, -32;
sub.s32 %r2055, %r6919, %r6918;
setp.ge.s32 %p1322, %r14, %r2055;
@%p1322 bra $L__BB0_1577;
mul.f32 %f4456, %f5345, 0f3F22F983;
cvt.rni.s32.f32 %r8637, %f4456;
cvt.rn.f32.s32 %f4457, %r8637;
mov.f32 %f4458, 0fBFC90FDA;
fma.rn.f32 %f4459, %f4457, %f4458, %f5345;
mov.f32 %f4460, 0fB3A22168;
fma.rn.f32 %f4461, %f4457, %f4460, %f4459;
mov.f32 %f4462, 0fA7C234C5;
fma.rn.f32 %f5790, %f4457, %f4462, %f4461;
abs.f32 %f1761, %f5345;
setp.ltu.f32 %p1323, %f1761, 0f47CE4780;
@%p1323 bra $L__BB0_1573;
setp.eq.f32 %p1324, %f1761, 0f7F800000;
@%p1324 bra $L__BB0_1572;
bra.uni $L__BB0_1567;
$L__BB0_1572:
mov.f32 %f4465, 0f00000000;
mul.rn.f32 %f5790, %f5345, %f4465;
mov.u32 %r8637, 0;
bra.uni $L__BB0_1573;
$L__BB0_1567:
mov.b32 %r2057, %f5345;
shr.u32 %r6921, %r2057, 23;
and.b32 %r6922, %r6921, 255;
add.s32 %r2058, %r6922, -128;
shl.b32 %r6923, %r2057, 8;
or.b32 %r2059, %r6923, -2147483648;
shr.u32 %r2060, %r2058, 5;
mov.u64 %rd2720, 0;
mov.u32 %r8634, 0;
mov.u64 %rd2718, __cudart_i2opi_f;
mov.u64 %rd2719, %rd1;
$L__BB0_1568:
.pragma "nounroll";
ld.global.nc.u32 %r6924, [%rd2718];
mad.wide.u32 %rd2189, %r6924, %r2059, %rd2720;
shr.u64 %rd2720, %rd2189, 32;
st.local.u32 [%rd2719], %rd2189;
add.s64 %rd2719, %rd2719, 4;
add.s64 %rd2718, %rd2718, 4;
add.s32 %r8634, %r8634, 1;
setp.ne.s32 %p1325, %r8634, 6;
@%p1325 bra $L__BB0_1568;
st.local.u32 [%rd5], %rd2720;
mov.u32 %r6925, 4;
sub.s32 %r2063, %r6925, %r2060;
mov.u32 %r6926, 6;
sub.s32 %r6927, %r6926, %r2060;
mul.wide.s32 %rd2190, %r6927, 4;
add.s64 %rd2191, %rd1, %rd2190;
ld.local.u32 %r8635, [%rd2191];
ld.local.u32 %r8636, [%rd2191+-4];
and.b32 %r2066, %r2058, 31;
setp.eq.s32 %p1326, %r2066, 0;
@%p1326 bra $L__BB0_1571;
mov.u32 %r6928, 32;
sub.s32 %r6929, %r6928, %r2066;
shr.u32 %r6930, %r8636, %r6929;
shl.b32 %r6931, %r8635, %r2066;
add.s32 %r8635, %r6930, %r6931;
mul.wide.s32 %rd2192, %r2063, 4;
add.s64 %rd2193, %rd1, %rd2192;
ld.local.u32 %r6932, [%rd2193];
shr.u32 %r6933, %r6932, %r6929;
shl.b32 %r6934, %r8636, %r2066;
add.s32 %r8636, %r6933, %r6934;
$L__BB0_1571:
and.b32 %r6935, %r2057, -2147483648;
shr.u32 %r6936, %r8636, 30;
shl.b32 %r6937, %r8635, 2;
or.b32 %r6938, %r6936, %r6937;
shr.u32 %r6939, %r6938, 31;
shr.u32 %r6940, %r8635, 30;
add.s32 %r6941, %r6939, %r6940;
neg.s32 %r6942, %r6941;
setp.eq.s32 %p1327, %r6935, 0;
selp.b32 %r8637, %r6941, %r6942, %p1327;
setp.ne.s32 %p1328, %r6939, 0;
xor.b32 %r6943, %r6935, -2147483648;
selp.b32 %r6944, %r6943, %r6935, %p1328;
selp.b32 %r6945, -1, 0, %p1328;
xor.b32 %r6946, %r6938, %r6945;
shl.b32 %r6947, %r8636, 2;
xor.b32 %r6948, %r6947, %r6945;
cvt.u64.u32 %rd2194, %r6946;
cvt.u64.u32 %rd2195, %r6948;
bfi.b64 %rd2196, %rd2194, %rd2195, 32, 32;
cvt.rn.f64.s64 %fd205, %rd2196;
mul.f64 %fd206, %fd205, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4463, %fd206;
setp.eq.s32 %p1329, %r6944, 0;
neg.f32 %f4464, %f4463;
selp.f32 %f5790, %f4463, %f4464, %p1329;
$L__BB0_1573:
and.b32 %r2073, %r8637, 1;
setp.eq.s32 %p1330, %r2073, 0;
selp.f32 %f1765, %f5790, 0f3F800000, %p1330;
mul.rn.f32 %f1766, %f5790, %f5790;
mov.f32 %f5791, 0fB94D4153;
@%p1330 bra $L__BB0_1575;
mov.f32 %f4467, 0fBAB607ED;
mov.f32 %f4468, 0f37CBAC00;
fma.rn.f32 %f5791, %f4468, %f1766, %f4467;
$L__BB0_1575:
selp.f32 %f4469, 0f3C0885E4, 0f3D2AAABB, %p1330;
fma.rn.f32 %f4470, %f5791, %f1766, %f4469;
selp.f32 %f4471, 0fBE2AAAA8, 0fBEFFFFFF, %p1330;
fma.rn.f32 %f4472, %f4470, %f1766, %f4471;
mov.f32 %f4473, 0f00000000;
fma.rn.f32 %f4474, %f1766, %f1765, %f4473;
fma.rn.f32 %f5213, %f4472, %f4474, %f1765;
and.b32 %r6950, %r8637, 2;
setp.eq.s32 %p1332, %r6950, 0;
@%p1332 bra $L__BB0_1577;
mov.f32 %f4476, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f4476, %f4473;
$L__BB0_1577:
setp.lt.s32 %p27, %r14, %r2055;
@%p1322 bra $L__BB0_1590;
mul.f32 %f4477, %f5337, 0f3F22F983;
cvt.rni.s32.f32 %r8641, %f4477;
cvt.rn.f32.s32 %f4478, %r8641;
mov.f32 %f4479, 0fBFC90FDA;
fma.rn.f32 %f4480, %f4478, %f4479, %f5337;
mov.f32 %f4481, 0fB3A22168;
fma.rn.f32 %f4482, %f4478, %f4481, %f4480;
mov.f32 %f4483, 0fA7C234C5;
fma.rn.f32 %f5794, %f4478, %f4483, %f4482;
abs.f32 %f1774, %f5337;
setp.ltu.f32 %p1334, %f1774, 0f47CE4780;
@%p1334 bra $L__BB0_1586;
setp.eq.f32 %p1335, %f1774, 0f7F800000;
@%p1335 bra $L__BB0_1585;
bra.uni $L__BB0_1580;
$L__BB0_1585:
mov.f32 %f4486, 0f00000000;
mul.rn.f32 %f5794, %f5337, %f4486;
mov.u32 %r8641, 0;
bra.uni $L__BB0_1586;
$L__BB0_1580:
mov.b32 %r2075, %f5337;
shr.u32 %r6952, %r2075, 23;
and.b32 %r6953, %r6952, 255;
add.s32 %r2076, %r6953, -128;
shl.b32 %r6954, %r2075, 8;
or.b32 %r2077, %r6954, -2147483648;
shr.u32 %r2078, %r2076, 5;
mov.u64 %rd2723, 0;
mov.u32 %r8638, 0;
mov.u64 %rd2721, __cudart_i2opi_f;
mov.u64 %rd2722, %rd1;
$L__BB0_1581:
.pragma "nounroll";
ld.global.nc.u32 %r6955, [%rd2721];
mad.wide.u32 %rd2199, %r6955, %r2077, %rd2723;
shr.u64 %rd2723, %rd2199, 32;
st.local.u32 [%rd2722], %rd2199;
add.s64 %rd2722, %rd2722, 4;
add.s64 %rd2721, %rd2721, 4;
add.s32 %r8638, %r8638, 1;
setp.ne.s32 %p1336, %r8638, 6;
@%p1336 bra $L__BB0_1581;
st.local.u32 [%rd5], %rd2723;
mov.u32 %r6956, 4;
sub.s32 %r2081, %r6956, %r2078;
mov.u32 %r6957, 6;
sub.s32 %r6958, %r6957, %r2078;
mul.wide.s32 %rd2200, %r6958, 4;
add.s64 %rd2201, %rd1, %rd2200;
ld.local.u32 %r8639, [%rd2201];
ld.local.u32 %r8640, [%rd2201+-4];
and.b32 %r2084, %r2076, 31;
setp.eq.s32 %p1337, %r2084, 0;
@%p1337 bra $L__BB0_1584;
mov.u32 %r6959, 32;
sub.s32 %r6960, %r6959, %r2084;
shr.u32 %r6961, %r8640, %r6960;
shl.b32 %r6962, %r8639, %r2084;
add.s32 %r8639, %r6961, %r6962;
mul.wide.s32 %rd2202, %r2081, 4;
add.s64 %rd2203, %rd1, %rd2202;
ld.local.u32 %r6963, [%rd2203];
shr.u32 %r6964, %r6963, %r6960;
shl.b32 %r6965, %r8640, %r2084;
add.s32 %r8640, %r6964, %r6965;
$L__BB0_1584:
and.b32 %r6966, %r2075, -2147483648;
shr.u32 %r6967, %r8640, 30;
shl.b32 %r6968, %r8639, 2;
or.b32 %r6969, %r6967, %r6968;
shr.u32 %r6970, %r6969, 31;
shr.u32 %r6971, %r8639, 30;
add.s32 %r6972, %r6970, %r6971;
neg.s32 %r6973, %r6972;
setp.eq.s32 %p1338, %r6966, 0;
selp.b32 %r8641, %r6972, %r6973, %p1338;
setp.ne.s32 %p1339, %r6970, 0;
xor.b32 %r6974, %r6966, -2147483648;
selp.b32 %r6975, %r6974, %r6966, %p1339;
selp.b32 %r6976, -1, 0, %p1339;
xor.b32 %r6977, %r6969, %r6976;
shl.b32 %r6978, %r8640, 2;
xor.b32 %r6979, %r6978, %r6976;
cvt.u64.u32 %rd2204, %r6977;
cvt.u64.u32 %rd2205, %r6979;
bfi.b64 %rd2206, %rd2204, %rd2205, 32, 32;
cvt.rn.f64.s64 %fd207, %rd2206;
mul.f64 %fd208, %fd207, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4484, %fd208;
setp.eq.s32 %p1340, %r6975, 0;
neg.f32 %f4485, %f4484;
selp.f32 %f5794, %f4484, %f4485, %p1340;
$L__BB0_1586:
add.s32 %r2091, %r8641, 1;
and.b32 %r2092, %r2091, 1;
setp.eq.s32 %p1341, %r2092, 0;
selp.f32 %f1778, %f5794, 0f3F800000, %p1341;
mul.rn.f32 %f1779, %f5794, %f5794;
mov.f32 %f5795, 0fB94D4153;
@%p1341 bra $L__BB0_1588;
mov.f32 %f4488, 0fBAB607ED;
mov.f32 %f4489, 0f37CBAC00;
fma.rn.f32 %f5795, %f4489, %f1779, %f4488;
$L__BB0_1588:
selp.f32 %f4490, 0f3C0885E4, 0f3D2AAABB, %p1341;
fma.rn.f32 %f4491, %f5795, %f1779, %f4490;
selp.f32 %f4492, 0fBE2AAAA8, 0fBEFFFFFF, %p1341;
fma.rn.f32 %f4493, %f4491, %f1779, %f4492;
mov.f32 %f4494, 0f00000000;
fma.rn.f32 %f4495, %f1779, %f1778, %f4494;
fma.rn.f32 %f5215, %f4493, %f4495, %f1778;
and.b32 %r6981, %r2091, 2;
setp.eq.s32 %p1343, %r6981, 0;
@%p1343 bra $L__BB0_1590;
mov.f32 %f4497, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f4497, %f4494;
$L__BB0_1590:
selp.f32 %f1786, %f5215, %f5216, %p27;
selp.f32 %f1787, %f5213, %f5214, %p27;
@%p1322 bra $L__BB0_1592;
add.f32 %f5902, %f1787, %f1786;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1592:
@%p1222 bra $L__BB0_1621;
shl.b32 %r6983, %r12, 5;
neg.s32 %r2093, %r6983;
setp.ge.s32 %p1347, %r14, %r2093;
@%p1347 bra $L__BB0_1606;
mul.f32 %f4500, %f5344, 0f3F22F983;
cvt.rni.s32.f32 %r8645, %f4500;
cvt.rn.f32.s32 %f4501, %r8645;
mov.f32 %f4502, 0fBFC90FDA;
fma.rn.f32 %f4503, %f4501, %f4502, %f5344;
mov.f32 %f4504, 0fB3A22168;
fma.rn.f32 %f4505, %f4501, %f4504, %f4503;
mov.f32 %f4506, 0fA7C234C5;
fma.rn.f32 %f5803, %f4501, %f4506, %f4505;
abs.f32 %f1795, %f5344;
setp.ltu.f32 %p1348, %f1795, 0f47CE4780;
@%p1348 bra $L__BB0_1602;
setp.eq.f32 %p1349, %f1795, 0f7F800000;
@%p1349 bra $L__BB0_1601;
bra.uni $L__BB0_1596;
$L__BB0_1601:
mov.f32 %f4509, 0f00000000;
mul.rn.f32 %f5803, %f5344, %f4509;
mov.u32 %r8645, 0;
bra.uni $L__BB0_1602;
$L__BB0_1596:
mov.b32 %r2095, %f5344;
shr.u32 %r6985, %r2095, 23;
and.b32 %r6986, %r6985, 255;
add.s32 %r2096, %r6986, -128;
shl.b32 %r6987, %r2095, 8;
or.b32 %r2097, %r6987, -2147483648;
shr.u32 %r2098, %r2096, 5;
mov.u64 %rd2726, 0;
mov.u32 %r8642, 0;
mov.u64 %rd2724, __cudart_i2opi_f;
mov.u64 %rd2725, %rd1;
$L__BB0_1597:
.pragma "nounroll";
ld.global.nc.u32 %r6988, [%rd2724];
mad.wide.u32 %rd2209, %r6988, %r2097, %rd2726;
shr.u64 %rd2726, %rd2209, 32;
st.local.u32 [%rd2725], %rd2209;
add.s64 %rd2725, %rd2725, 4;
add.s64 %rd2724, %rd2724, 4;
add.s32 %r8642, %r8642, 1;
setp.ne.s32 %p1350, %r8642, 6;
@%p1350 bra $L__BB0_1597;
st.local.u32 [%rd5], %rd2726;
mov.u32 %r6989, 4;
sub.s32 %r2101, %r6989, %r2098;
mov.u32 %r6990, 6;
sub.s32 %r6991, %r6990, %r2098;
mul.wide.s32 %rd2210, %r6991, 4;
add.s64 %rd2211, %rd1, %rd2210;
ld.local.u32 %r8643, [%rd2211];
ld.local.u32 %r8644, [%rd2211+-4];
and.b32 %r2104, %r2096, 31;
setp.eq.s32 %p1351, %r2104, 0;
@%p1351 bra $L__BB0_1600;
mov.u32 %r6992, 32;
sub.s32 %r6993, %r6992, %r2104;
shr.u32 %r6994, %r8644, %r6993;
shl.b32 %r6995, %r8643, %r2104;
add.s32 %r8643, %r6994, %r6995;
mul.wide.s32 %rd2212, %r2101, 4;
add.s64 %rd2213, %rd1, %rd2212;
ld.local.u32 %r6996, [%rd2213];
shr.u32 %r6997, %r6996, %r6993;
shl.b32 %r6998, %r8644, %r2104;
add.s32 %r8644, %r6997, %r6998;
$L__BB0_1600:
and.b32 %r6999, %r2095, -2147483648;
shr.u32 %r7000, %r8644, 30;
shl.b32 %r7001, %r8643, 2;
or.b32 %r7002, %r7000, %r7001;
shr.u32 %r7003, %r7002, 31;
shr.u32 %r7004, %r8643, 30;
add.s32 %r7005, %r7003, %r7004;
neg.s32 %r7006, %r7005;
setp.eq.s32 %p1352, %r6999, 0;
selp.b32 %r8645, %r7005, %r7006, %p1352;
setp.ne.s32 %p1353, %r7003, 0;
xor.b32 %r7007, %r6999, -2147483648;
selp.b32 %r7008, %r7007, %r6999, %p1353;
selp.b32 %r7009, -1, 0, %p1353;
xor.b32 %r7010, %r7002, %r7009;
shl.b32 %r7011, %r8644, 2;
xor.b32 %r7012, %r7011, %r7009;
cvt.u64.u32 %rd2214, %r7010;
cvt.u64.u32 %rd2215, %r7012;
bfi.b64 %rd2216, %rd2214, %rd2215, 32, 32;
cvt.rn.f64.s64 %fd209, %rd2216;
mul.f64 %fd210, %fd209, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4507, %fd210;
setp.eq.s32 %p1354, %r7008, 0;
neg.f32 %f4508, %f4507;
selp.f32 %f5803, %f4507, %f4508, %p1354;
$L__BB0_1602:
and.b32 %r2111, %r8645, 1;
setp.eq.s32 %p1355, %r2111, 0;
selp.f32 %f1799, %f5803, 0f3F800000, %p1355;
mul.rn.f32 %f1800, %f5803, %f5803;
mov.f32 %f5804, 0fB94D4153;
@%p1355 bra $L__BB0_1604;
mov.f32 %f4511, 0fBAB607ED;
mov.f32 %f4512, 0f37CBAC00;
fma.rn.f32 %f5804, %f4512, %f1800, %f4511;
$L__BB0_1604:
selp.f32 %f4513, 0f3C0885E4, 0f3D2AAABB, %p1355;
fma.rn.f32 %f4514, %f5804, %f1800, %f4513;
selp.f32 %f4515, 0fBE2AAAA8, 0fBEFFFFFF, %p1355;
fma.rn.f32 %f4516, %f4514, %f1800, %f4515;
mov.f32 %f4517, 0f00000000;
fma.rn.f32 %f4518, %f1800, %f1799, %f4517;
fma.rn.f32 %f5213, %f4516, %f4518, %f1799;
and.b32 %r7014, %r8645, 2;
setp.eq.s32 %p1357, %r7014, 0;
@%p1357 bra $L__BB0_1606;
mov.f32 %f4520, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f4520, %f4517;
$L__BB0_1606:
setp.lt.s32 %p28, %r14, %r2093;
@%p1347 bra $L__BB0_1619;
mul.f32 %f4521, %f5336, 0f3F22F983;
cvt.rni.s32.f32 %r8649, %f4521;
cvt.rn.f32.s32 %f4522, %r8649;
mov.f32 %f4523, 0fBFC90FDA;
fma.rn.f32 %f4524, %f4522, %f4523, %f5336;
mov.f32 %f4525, 0fB3A22168;
fma.rn.f32 %f4526, %f4522, %f4525, %f4524;
mov.f32 %f4527, 0fA7C234C5;
fma.rn.f32 %f5807, %f4522, %f4527, %f4526;
abs.f32 %f1808, %f5336;
setp.ltu.f32 %p1359, %f1808, 0f47CE4780;
@%p1359 bra $L__BB0_1615;
setp.eq.f32 %p1360, %f1808, 0f7F800000;
@%p1360 bra $L__BB0_1614;
bra.uni $L__BB0_1609;
$L__BB0_1614:
mov.f32 %f4530, 0f00000000;
mul.rn.f32 %f5807, %f5336, %f4530;
mov.u32 %r8649, 0;
bra.uni $L__BB0_1615;
$L__BB0_1609:
mov.b32 %r2113, %f5336;
shr.u32 %r7016, %r2113, 23;
and.b32 %r7017, %r7016, 255;
add.s32 %r2114, %r7017, -128;
shl.b32 %r7018, %r2113, 8;
or.b32 %r2115, %r7018, -2147483648;
shr.u32 %r2116, %r2114, 5;
mov.u64 %rd2729, 0;
mov.u32 %r8646, 0;
mov.u64 %rd2727, __cudart_i2opi_f;
mov.u64 %rd2728, %rd1;
$L__BB0_1610:
.pragma "nounroll";
ld.global.nc.u32 %r7019, [%rd2727];
mad.wide.u32 %rd2219, %r7019, %r2115, %rd2729;
shr.u64 %rd2729, %rd2219, 32;
st.local.u32 [%rd2728], %rd2219;
add.s64 %rd2728, %rd2728, 4;
add.s64 %rd2727, %rd2727, 4;
add.s32 %r8646, %r8646, 1;
setp.ne.s32 %p1361, %r8646, 6;
@%p1361 bra $L__BB0_1610;
st.local.u32 [%rd5], %rd2729;
mov.u32 %r7020, 4;
sub.s32 %r2119, %r7020, %r2116;
mov.u32 %r7021, 6;
sub.s32 %r7022, %r7021, %r2116;
mul.wide.s32 %rd2220, %r7022, 4;
add.s64 %rd2221, %rd1, %rd2220;
ld.local.u32 %r8647, [%rd2221];
ld.local.u32 %r8648, [%rd2221+-4];
and.b32 %r2122, %r2114, 31;
setp.eq.s32 %p1362, %r2122, 0;
@%p1362 bra $L__BB0_1613;
mov.u32 %r7023, 32;
sub.s32 %r7024, %r7023, %r2122;
shr.u32 %r7025, %r8648, %r7024;
shl.b32 %r7026, %r8647, %r2122;
add.s32 %r8647, %r7025, %r7026;
mul.wide.s32 %rd2222, %r2119, 4;
add.s64 %rd2223, %rd1, %rd2222;
ld.local.u32 %r7027, [%rd2223];
shr.u32 %r7028, %r7027, %r7024;
shl.b32 %r7029, %r8648, %r2122;
add.s32 %r8648, %r7028, %r7029;
$L__BB0_1613:
and.b32 %r7030, %r2113, -2147483648;
shr.u32 %r7031, %r8648, 30;
shl.b32 %r7032, %r8647, 2;
or.b32 %r7033, %r7031, %r7032;
shr.u32 %r7034, %r7033, 31;
shr.u32 %r7035, %r8647, 30;
add.s32 %r7036, %r7034, %r7035;
neg.s32 %r7037, %r7036;
setp.eq.s32 %p1363, %r7030, 0;
selp.b32 %r8649, %r7036, %r7037, %p1363;
setp.ne.s32 %p1364, %r7034, 0;
xor.b32 %r7038, %r7030, -2147483648;
selp.b32 %r7039, %r7038, %r7030, %p1364;
selp.b32 %r7040, -1, 0, %p1364;
xor.b32 %r7041, %r7033, %r7040;
shl.b32 %r7042, %r8648, 2;
xor.b32 %r7043, %r7042, %r7040;
cvt.u64.u32 %rd2224, %r7041;
cvt.u64.u32 %rd2225, %r7043;
bfi.b64 %rd2226, %rd2224, %rd2225, 32, 32;
cvt.rn.f64.s64 %fd211, %rd2226;
mul.f64 %fd212, %fd211, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4528, %fd212;
setp.eq.s32 %p1365, %r7039, 0;
neg.f32 %f4529, %f4528;
selp.f32 %f5807, %f4528, %f4529, %p1365;
$L__BB0_1615:
add.s32 %r2129, %r8649, 1;
and.b32 %r2130, %r2129, 1;
setp.eq.s32 %p1366, %r2130, 0;
selp.f32 %f1812, %f5807, 0f3F800000, %p1366;
mul.rn.f32 %f1813, %f5807, %f5807;
mov.f32 %f5808, 0fB94D4153;
@%p1366 bra $L__BB0_1617;
mov.f32 %f4532, 0fBAB607ED;
mov.f32 %f4533, 0f37CBAC00;
fma.rn.f32 %f5808, %f4533, %f1813, %f4532;
$L__BB0_1617:
selp.f32 %f4534, 0f3C0885E4, 0f3D2AAABB, %p1366;
fma.rn.f32 %f4535, %f5808, %f1813, %f4534;
selp.f32 %f4536, 0fBE2AAAA8, 0fBEFFFFFF, %p1366;
fma.rn.f32 %f4537, %f4535, %f1813, %f4536;
mov.f32 %f4538, 0f00000000;
fma.rn.f32 %f4539, %f1813, %f1812, %f4538;
fma.rn.f32 %f5215, %f4537, %f4539, %f1812;
and.b32 %r7045, %r2129, 2;
setp.eq.s32 %p1368, %r7045, 0;
@%p1368 bra $L__BB0_1619;
mov.f32 %f4541, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f4541, %f4538;
$L__BB0_1619:
selp.f32 %f1820, %f5215, %f5216, %p28;
selp.f32 %f1821, %f5213, %f5214, %p28;
@%p1347 bra $L__BB0_1621;
add.f32 %f5901, %f1821, %f1820;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1621:
@%p1222 bra $L__BB0_1650;
shl.b32 %r7047, %r12, 5;
mov.u32 %r7048, -32;
sub.s32 %r2131, %r7048, %r7047;
setp.ge.s32 %p1372, %r14, %r2131;
@%p1372 bra $L__BB0_1635;
mul.f32 %f4544, %f5343, 0f3F22F983;
cvt.rni.s32.f32 %r8653, %f4544;
cvt.rn.f32.s32 %f4545, %r8653;
mov.f32 %f4546, 0fBFC90FDA;
fma.rn.f32 %f4547, %f4545, %f4546, %f5343;
mov.f32 %f4548, 0fB3A22168;
fma.rn.f32 %f4549, %f4545, %f4548, %f4547;
mov.f32 %f4550, 0fA7C234C5;
fma.rn.f32 %f5816, %f4545, %f4550, %f4549;
abs.f32 %f1829, %f5343;
setp.ltu.f32 %p1373, %f1829, 0f47CE4780;
@%p1373 bra $L__BB0_1631;
setp.eq.f32 %p1374, %f1829, 0f7F800000;
@%p1374 bra $L__BB0_1630;
bra.uni $L__BB0_1625;
$L__BB0_1630:
mov.f32 %f4553, 0f00000000;
mul.rn.f32 %f5816, %f5343, %f4553;
mov.u32 %r8653, 0;
bra.uni $L__BB0_1631;
$L__BB0_1625:
mov.b32 %r2133, %f5343;
shr.u32 %r7050, %r2133, 23;
and.b32 %r7051, %r7050, 255;
add.s32 %r2134, %r7051, -128;
shl.b32 %r7052, %r2133, 8;
or.b32 %r2135, %r7052, -2147483648;
shr.u32 %r2136, %r2134, 5;
mov.u64 %rd2732, 0;
mov.u32 %r8650, 0;
mov.u64 %rd2730, __cudart_i2opi_f;
mov.u64 %rd2731, %rd1;
$L__BB0_1626:
.pragma "nounroll";
ld.global.nc.u32 %r7053, [%rd2730];
mad.wide.u32 %rd2229, %r7053, %r2135, %rd2732;
shr.u64 %rd2732, %rd2229, 32;
st.local.u32 [%rd2731], %rd2229;
add.s64 %rd2731, %rd2731, 4;
add.s64 %rd2730, %rd2730, 4;
add.s32 %r8650, %r8650, 1;
setp.ne.s32 %p1375, %r8650, 6;
@%p1375 bra $L__BB0_1626;
st.local.u32 [%rd5], %rd2732;
mov.u32 %r7054, 4;
sub.s32 %r2139, %r7054, %r2136;
mov.u32 %r7055, 6;
sub.s32 %r7056, %r7055, %r2136;
mul.wide.s32 %rd2230, %r7056, 4;
add.s64 %rd2231, %rd1, %rd2230;
ld.local.u32 %r8651, [%rd2231];
ld.local.u32 %r8652, [%rd2231+-4];
and.b32 %r2142, %r2134, 31;
setp.eq.s32 %p1376, %r2142, 0;
@%p1376 bra $L__BB0_1629;
mov.u32 %r7057, 32;
sub.s32 %r7058, %r7057, %r2142;
shr.u32 %r7059, %r8652, %r7058;
shl.b32 %r7060, %r8651, %r2142;
add.s32 %r8651, %r7059, %r7060;
mul.wide.s32 %rd2232, %r2139, 4;
add.s64 %rd2233, %rd1, %rd2232;
ld.local.u32 %r7061, [%rd2233];
shr.u32 %r7062, %r7061, %r7058;
shl.b32 %r7063, %r8652, %r2142;
add.s32 %r8652, %r7062, %r7063;
$L__BB0_1629:
and.b32 %r7064, %r2133, -2147483648;
shr.u32 %r7065, %r8652, 30;
shl.b32 %r7066, %r8651, 2;
or.b32 %r7067, %r7065, %r7066;
shr.u32 %r7068, %r7067, 31;
shr.u32 %r7069, %r8651, 30;
add.s32 %r7070, %r7068, %r7069;
neg.s32 %r7071, %r7070;
setp.eq.s32 %p1377, %r7064, 0;
selp.b32 %r8653, %r7070, %r7071, %p1377;
setp.ne.s32 %p1378, %r7068, 0;
xor.b32 %r7072, %r7064, -2147483648;
selp.b32 %r7073, %r7072, %r7064, %p1378;
selp.b32 %r7074, -1, 0, %p1378;
xor.b32 %r7075, %r7067, %r7074;
shl.b32 %r7076, %r8652, 2;
xor.b32 %r7077, %r7076, %r7074;
cvt.u64.u32 %rd2234, %r7075;
cvt.u64.u32 %rd2235, %r7077;
bfi.b64 %rd2236, %rd2234, %rd2235, 32, 32;
cvt.rn.f64.s64 %fd213, %rd2236;
mul.f64 %fd214, %fd213, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4551, %fd214;
setp.eq.s32 %p1379, %r7073, 0;
neg.f32 %f4552, %f4551;
selp.f32 %f5816, %f4551, %f4552, %p1379;
$L__BB0_1631:
and.b32 %r2149, %r8653, 1;
setp.eq.s32 %p1380, %r2149, 0;
selp.f32 %f1833, %f5816, 0f3F800000, %p1380;
mul.rn.f32 %f1834, %f5816, %f5816;
mov.f32 %f5817, 0fB94D4153;
@%p1380 bra $L__BB0_1633;
mov.f32 %f4555, 0fBAB607ED;
mov.f32 %f4556, 0f37CBAC00;
fma.rn.f32 %f5817, %f4556, %f1834, %f4555;
$L__BB0_1633:
selp.f32 %f4557, 0f3C0885E4, 0f3D2AAABB, %p1380;
fma.rn.f32 %f4558, %f5817, %f1834, %f4557;
selp.f32 %f4559, 0fBE2AAAA8, 0fBEFFFFFF, %p1380;
fma.rn.f32 %f4560, %f4558, %f1834, %f4559;
mov.f32 %f4561, 0f00000000;
fma.rn.f32 %f4562, %f1834, %f1833, %f4561;
fma.rn.f32 %f5213, %f4560, %f4562, %f1833;
and.b32 %r7079, %r8653, 2;
setp.eq.s32 %p1382, %r7079, 0;
@%p1382 bra $L__BB0_1635;
mov.f32 %f4564, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f4564, %f4561;
$L__BB0_1635:
setp.lt.s32 %p29, %r14, %r2131;
@%p1372 bra $L__BB0_1648;
mul.f32 %f4565, %f5335, 0f3F22F983;
cvt.rni.s32.f32 %r8657, %f4565;
cvt.rn.f32.s32 %f4566, %r8657;
mov.f32 %f4567, 0fBFC90FDA;
fma.rn.f32 %f4568, %f4566, %f4567, %f5335;
mov.f32 %f4569, 0fB3A22168;
fma.rn.f32 %f4570, %f4566, %f4569, %f4568;
mov.f32 %f4571, 0fA7C234C5;
fma.rn.f32 %f5820, %f4566, %f4571, %f4570;
abs.f32 %f1842, %f5335;
setp.ltu.f32 %p1384, %f1842, 0f47CE4780;
@%p1384 bra $L__BB0_1644;
setp.eq.f32 %p1385, %f1842, 0f7F800000;
@%p1385 bra $L__BB0_1643;
bra.uni $L__BB0_1638;
$L__BB0_1643:
mov.f32 %f4574, 0f00000000;
mul.rn.f32 %f5820, %f5335, %f4574;
mov.u32 %r8657, 0;
bra.uni $L__BB0_1644;
$L__BB0_1638:
mov.b32 %r2151, %f5335;
shr.u32 %r7081, %r2151, 23;
and.b32 %r7082, %r7081, 255;
add.s32 %r2152, %r7082, -128;
shl.b32 %r7083, %r2151, 8;
or.b32 %r2153, %r7083, -2147483648;
shr.u32 %r2154, %r2152, 5;
mov.u64 %rd2735, 0;
mov.u32 %r8654, 0;
mov.u64 %rd2733, __cudart_i2opi_f;
mov.u64 %rd2734, %rd1;
$L__BB0_1639:
.pragma "nounroll";
ld.global.nc.u32 %r7084, [%rd2733];
mad.wide.u32 %rd2239, %r7084, %r2153, %rd2735;
shr.u64 %rd2735, %rd2239, 32;
st.local.u32 [%rd2734], %rd2239;
add.s64 %rd2734, %rd2734, 4;
add.s64 %rd2733, %rd2733, 4;
add.s32 %r8654, %r8654, 1;
setp.ne.s32 %p1386, %r8654, 6;
@%p1386 bra $L__BB0_1639;
st.local.u32 [%rd5], %rd2735;
mov.u32 %r7085, 4;
sub.s32 %r2157, %r7085, %r2154;
mov.u32 %r7086, 6;
sub.s32 %r7087, %r7086, %r2154;
mul.wide.s32 %rd2240, %r7087, 4;
add.s64 %rd2241, %rd1, %rd2240;
ld.local.u32 %r8655, [%rd2241];
ld.local.u32 %r8656, [%rd2241+-4];
and.b32 %r2160, %r2152, 31;
setp.eq.s32 %p1387, %r2160, 0;
@%p1387 bra $L__BB0_1642;
mov.u32 %r7088, 32;
sub.s32 %r7089, %r7088, %r2160;
shr.u32 %r7090, %r8656, %r7089;
shl.b32 %r7091, %r8655, %r2160;
add.s32 %r8655, %r7090, %r7091;
mul.wide.s32 %rd2242, %r2157, 4;
add.s64 %rd2243, %rd1, %rd2242;
ld.local.u32 %r7092, [%rd2243];
shr.u32 %r7093, %r7092, %r7089;
shl.b32 %r7094, %r8656, %r2160;
add.s32 %r8656, %r7093, %r7094;
$L__BB0_1642:
and.b32 %r7095, %r2151, -2147483648;
shr.u32 %r7096, %r8656, 30;
shl.b32 %r7097, %r8655, 2;
or.b32 %r7098, %r7096, %r7097;
shr.u32 %r7099, %r7098, 31;
shr.u32 %r7100, %r8655, 30;
add.s32 %r7101, %r7099, %r7100;
neg.s32 %r7102, %r7101;
setp.eq.s32 %p1388, %r7095, 0;
selp.b32 %r8657, %r7101, %r7102, %p1388;
setp.ne.s32 %p1389, %r7099, 0;
xor.b32 %r7103, %r7095, -2147483648;
selp.b32 %r7104, %r7103, %r7095, %p1389;
selp.b32 %r7105, -1, 0, %p1389;
xor.b32 %r7106, %r7098, %r7105;
shl.b32 %r7107, %r8656, 2;
xor.b32 %r7108, %r7107, %r7105;
cvt.u64.u32 %rd2244, %r7106;
cvt.u64.u32 %rd2245, %r7108;
bfi.b64 %rd2246, %rd2244, %rd2245, 32, 32;
cvt.rn.f64.s64 %fd215, %rd2246;
mul.f64 %fd216, %fd215, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4572, %fd216;
setp.eq.s32 %p1390, %r7104, 0;
neg.f32 %f4573, %f4572;
selp.f32 %f5820, %f4572, %f4573, %p1390;
$L__BB0_1644:
add.s32 %r2167, %r8657, 1;
and.b32 %r2168, %r2167, 1;
setp.eq.s32 %p1391, %r2168, 0;
selp.f32 %f1846, %f5820, 0f3F800000, %p1391;
mul.rn.f32 %f1847, %f5820, %f5820;
mov.f32 %f5821, 0fB94D4153;
@%p1391 bra $L__BB0_1646;
mov.f32 %f4576, 0fBAB607ED;
mov.f32 %f4577, 0f37CBAC00;
fma.rn.f32 %f5821, %f4577, %f1847, %f4576;
$L__BB0_1646:
selp.f32 %f4578, 0f3C0885E4, 0f3D2AAABB, %p1391;
fma.rn.f32 %f4579, %f5821, %f1847, %f4578;
selp.f32 %f4580, 0fBE2AAAA8, 0fBEFFFFFF, %p1391;
fma.rn.f32 %f4581, %f4579, %f1847, %f4580;
mov.f32 %f4582, 0f00000000;
fma.rn.f32 %f4583, %f1847, %f1846, %f4582;
fma.rn.f32 %f5215, %f4581, %f4583, %f1846;
and.b32 %r7110, %r2167, 2;
setp.eq.s32 %p1393, %r7110, 0;
@%p1393 bra $L__BB0_1648;
mov.f32 %f4585, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f4585, %f4582;
$L__BB0_1648:
selp.f32 %f1854, %f5215, %f5216, %p29;
selp.f32 %f1855, %f5213, %f5214, %p29;
@%p1372 bra $L__BB0_1650;
add.f32 %f5900, %f1855, %f1854;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1650:
@%p1226 bra $L__BB0_1679;
shl.b32 %r7112, %r12, 5;
neg.s32 %r2169, %r7112;
setp.ge.s32 %p1397, %r14, %r2169;
@%p1397 bra $L__BB0_1664;
mul.f32 %f4588, %f5342, 0f3F22F983;
cvt.rni.s32.f32 %r8661, %f4588;
cvt.rn.f32.s32 %f4589, %r8661;
mov.f32 %f4590, 0fBFC90FDA;
fma.rn.f32 %f4591, %f4589, %f4590, %f5342;
mov.f32 %f4592, 0fB3A22168;
fma.rn.f32 %f4593, %f4589, %f4592, %f4591;
mov.f32 %f4594, 0fA7C234C5;
fma.rn.f32 %f5829, %f4589, %f4594, %f4593;
abs.f32 %f1863, %f5342;
setp.ltu.f32 %p1398, %f1863, 0f47CE4780;
@%p1398 bra $L__BB0_1660;
setp.eq.f32 %p1399, %f1863, 0f7F800000;
@%p1399 bra $L__BB0_1659;
bra.uni $L__BB0_1654;
$L__BB0_1659:
mov.f32 %f4597, 0f00000000;
mul.rn.f32 %f5829, %f5342, %f4597;
mov.u32 %r8661, 0;
bra.uni $L__BB0_1660;
$L__BB0_1654:
mov.b32 %r2171, %f5342;
shr.u32 %r7114, %r2171, 23;
and.b32 %r7115, %r7114, 255;
add.s32 %r2172, %r7115, -128;
shl.b32 %r7116, %r2171, 8;
or.b32 %r2173, %r7116, -2147483648;
shr.u32 %r2174, %r2172, 5;
mov.u64 %rd2738, 0;
mov.u32 %r8658, 0;
mov.u64 %rd2736, __cudart_i2opi_f;
mov.u64 %rd2737, %rd1;
$L__BB0_1655:
.pragma "nounroll";
ld.global.nc.u32 %r7117, [%rd2736];
mad.wide.u32 %rd2249, %r7117, %r2173, %rd2738;
shr.u64 %rd2738, %rd2249, 32;
st.local.u32 [%rd2737], %rd2249;
add.s64 %rd2737, %rd2737, 4;
add.s64 %rd2736, %rd2736, 4;
add.s32 %r8658, %r8658, 1;
setp.ne.s32 %p1400, %r8658, 6;
@%p1400 bra $L__BB0_1655;
st.local.u32 [%rd5], %rd2738;
mov.u32 %r7118, 4;
sub.s32 %r2177, %r7118, %r2174;
mov.u32 %r7119, 6;
sub.s32 %r7120, %r7119, %r2174;
mul.wide.s32 %rd2250, %r7120, 4;
add.s64 %rd2251, %rd1, %rd2250;
ld.local.u32 %r8659, [%rd2251];
ld.local.u32 %r8660, [%rd2251+-4];
and.b32 %r2180, %r2172, 31;
setp.eq.s32 %p1401, %r2180, 0;
@%p1401 bra $L__BB0_1658;
mov.u32 %r7121, 32;
sub.s32 %r7122, %r7121, %r2180;
shr.u32 %r7123, %r8660, %r7122;
shl.b32 %r7124, %r8659, %r2180;
add.s32 %r8659, %r7123, %r7124;
mul.wide.s32 %rd2252, %r2177, 4;
add.s64 %rd2253, %rd1, %rd2252;
ld.local.u32 %r7125, [%rd2253];
shr.u32 %r7126, %r7125, %r7122;
shl.b32 %r7127, %r8660, %r2180;
add.s32 %r8660, %r7126, %r7127;
$L__BB0_1658:
and.b32 %r7128, %r2171, -2147483648;
shr.u32 %r7129, %r8660, 30;
shl.b32 %r7130, %r8659, 2;
or.b32 %r7131, %r7129, %r7130;
shr.u32 %r7132, %r7131, 31;
shr.u32 %r7133, %r8659, 30;
add.s32 %r7134, %r7132, %r7133;
neg.s32 %r7135, %r7134;
setp.eq.s32 %p1402, %r7128, 0;
selp.b32 %r8661, %r7134, %r7135, %p1402;
setp.ne.s32 %p1403, %r7132, 0;
xor.b32 %r7136, %r7128, -2147483648;
selp.b32 %r7137, %r7136, %r7128, %p1403;
selp.b32 %r7138, -1, 0, %p1403;
xor.b32 %r7139, %r7131, %r7138;
shl.b32 %r7140, %r8660, 2;
xor.b32 %r7141, %r7140, %r7138;
cvt.u64.u32 %rd2254, %r7139;
cvt.u64.u32 %rd2255, %r7141;
bfi.b64 %rd2256, %rd2254, %rd2255, 32, 32;
cvt.rn.f64.s64 %fd217, %rd2256;
mul.f64 %fd218, %fd217, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4595, %fd218;
setp.eq.s32 %p1404, %r7137, 0;
neg.f32 %f4596, %f4595;
selp.f32 %f5829, %f4595, %f4596, %p1404;
$L__BB0_1660:
and.b32 %r2187, %r8661, 1;
setp.eq.s32 %p1405, %r2187, 0;
selp.f32 %f1867, %f5829, 0f3F800000, %p1405;
mul.rn.f32 %f1868, %f5829, %f5829;
mov.f32 %f5830, 0fB94D4153;
@%p1405 bra $L__BB0_1662;
mov.f32 %f4599, 0fBAB607ED;
mov.f32 %f4600, 0f37CBAC00;
fma.rn.f32 %f5830, %f4600, %f1868, %f4599;
$L__BB0_1662:
selp.f32 %f4601, 0f3C0885E4, 0f3D2AAABB, %p1405;
fma.rn.f32 %f4602, %f5830, %f1868, %f4601;
selp.f32 %f4603, 0fBE2AAAA8, 0fBEFFFFFF, %p1405;
fma.rn.f32 %f4604, %f4602, %f1868, %f4603;
mov.f32 %f4605, 0f00000000;
fma.rn.f32 %f4606, %f1868, %f1867, %f4605;
fma.rn.f32 %f5213, %f4604, %f4606, %f1867;
and.b32 %r7143, %r8661, 2;
setp.eq.s32 %p1407, %r7143, 0;
@%p1407 bra $L__BB0_1664;
mov.f32 %f4608, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f4608, %f4605;
$L__BB0_1664:
setp.lt.s32 %p30, %r14, %r2169;
@%p1397 bra $L__BB0_1677;
mul.f32 %f4609, %f5334, 0f3F22F983;
cvt.rni.s32.f32 %r8665, %f4609;
cvt.rn.f32.s32 %f4610, %r8665;
mov.f32 %f4611, 0fBFC90FDA;
fma.rn.f32 %f4612, %f4610, %f4611, %f5334;
mov.f32 %f4613, 0fB3A22168;
fma.rn.f32 %f4614, %f4610, %f4613, %f4612;
mov.f32 %f4615, 0fA7C234C5;
fma.rn.f32 %f5833, %f4610, %f4615, %f4614;
abs.f32 %f1876, %f5334;
setp.ltu.f32 %p1409, %f1876, 0f47CE4780;
@%p1409 bra $L__BB0_1673;
setp.eq.f32 %p1410, %f1876, 0f7F800000;
@%p1410 bra $L__BB0_1672;
bra.uni $L__BB0_1667;
$L__BB0_1672:
mov.f32 %f4618, 0f00000000;
mul.rn.f32 %f5833, %f5334, %f4618;
mov.u32 %r8665, 0;
bra.uni $L__BB0_1673;
$L__BB0_1667:
mov.b32 %r2189, %f5334;
shr.u32 %r7145, %r2189, 23;
and.b32 %r7146, %r7145, 255;
add.s32 %r2190, %r7146, -128;
shl.b32 %r7147, %r2189, 8;
or.b32 %r2191, %r7147, -2147483648;
shr.u32 %r2192, %r2190, 5;
mov.u64 %rd2741, 0;
mov.u32 %r8662, 0;
mov.u64 %rd2739, __cudart_i2opi_f;
mov.u64 %rd2740, %rd1;
$L__BB0_1668:
.pragma "nounroll";
ld.global.nc.u32 %r7148, [%rd2739];
mad.wide.u32 %rd2259, %r7148, %r2191, %rd2741;
shr.u64 %rd2741, %rd2259, 32;
st.local.u32 [%rd2740], %rd2259;
add.s64 %rd2740, %rd2740, 4;
add.s64 %rd2739, %rd2739, 4;
add.s32 %r8662, %r8662, 1;
setp.ne.s32 %p1411, %r8662, 6;
@%p1411 bra $L__BB0_1668;
st.local.u32 [%rd5], %rd2741;
mov.u32 %r7149, 4;
sub.s32 %r2195, %r7149, %r2192;
mov.u32 %r7150, 6;
sub.s32 %r7151, %r7150, %r2192;
mul.wide.s32 %rd2260, %r7151, 4;
add.s64 %rd2261, %rd1, %rd2260;
ld.local.u32 %r8663, [%rd2261];
ld.local.u32 %r8664, [%rd2261+-4];
and.b32 %r2198, %r2190, 31;
setp.eq.s32 %p1412, %r2198, 0;
@%p1412 bra $L__BB0_1671;
mov.u32 %r7152, 32;
sub.s32 %r7153, %r7152, %r2198;
shr.u32 %r7154, %r8664, %r7153;
shl.b32 %r7155, %r8663, %r2198;
add.s32 %r8663, %r7154, %r7155;
mul.wide.s32 %rd2262, %r2195, 4;
add.s64 %rd2263, %rd1, %rd2262;
ld.local.u32 %r7156, [%rd2263];
shr.u32 %r7157, %r7156, %r7153;
shl.b32 %r7158, %r8664, %r2198;
add.s32 %r8664, %r7157, %r7158;
$L__BB0_1671:
and.b32 %r7159, %r2189, -2147483648;
shr.u32 %r7160, %r8664, 30;
shl.b32 %r7161, %r8663, 2;
or.b32 %r7162, %r7160, %r7161;
shr.u32 %r7163, %r7162, 31;
shr.u32 %r7164, %r8663, 30;
add.s32 %r7165, %r7163, %r7164;
neg.s32 %r7166, %r7165;
setp.eq.s32 %p1413, %r7159, 0;
selp.b32 %r8665, %r7165, %r7166, %p1413;
setp.ne.s32 %p1414, %r7163, 0;
xor.b32 %r7167, %r7159, -2147483648;
selp.b32 %r7168, %r7167, %r7159, %p1414;
selp.b32 %r7169, -1, 0, %p1414;
xor.b32 %r7170, %r7162, %r7169;
shl.b32 %r7171, %r8664, 2;
xor.b32 %r7172, %r7171, %r7169;
cvt.u64.u32 %rd2264, %r7170;
cvt.u64.u32 %rd2265, %r7172;
bfi.b64 %rd2266, %rd2264, %rd2265, 32, 32;
cvt.rn.f64.s64 %fd219, %rd2266;
mul.f64 %fd220, %fd219, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4616, %fd220;
setp.eq.s32 %p1415, %r7168, 0;
neg.f32 %f4617, %f4616;
selp.f32 %f5833, %f4616, %f4617, %p1415;
$L__BB0_1673:
add.s32 %r2205, %r8665, 1;
and.b32 %r2206, %r2205, 1;
setp.eq.s32 %p1416, %r2206, 0;
selp.f32 %f1880, %f5833, 0f3F800000, %p1416;
mul.rn.f32 %f1881, %f5833, %f5833;
mov.f32 %f5834, 0fB94D4153;
@%p1416 bra $L__BB0_1675;
mov.f32 %f4620, 0fBAB607ED;
mov.f32 %f4621, 0f37CBAC00;
fma.rn.f32 %f5834, %f4621, %f1881, %f4620;
$L__BB0_1675:
selp.f32 %f4622, 0f3C0885E4, 0f3D2AAABB, %p1416;
fma.rn.f32 %f4623, %f5834, %f1881, %f4622;
selp.f32 %f4624, 0fBE2AAAA8, 0fBEFFFFFF, %p1416;
fma.rn.f32 %f4625, %f4623, %f1881, %f4624;
mov.f32 %f4626, 0f00000000;
fma.rn.f32 %f4627, %f1881, %f1880, %f4626;
fma.rn.f32 %f5215, %f4625, %f4627, %f1880;
and.b32 %r7174, %r2205, 2;
setp.eq.s32 %p1418, %r7174, 0;
@%p1418 bra $L__BB0_1677;
mov.f32 %f4629, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f4629, %f4626;
$L__BB0_1677:
selp.f32 %f1888, %f5215, %f5216, %p30;
selp.f32 %f1889, %f5213, %f5214, %p30;
@%p1397 bra $L__BB0_1679;
add.f32 %f5899, %f1889, %f1888;
mov.f32 %f5214, %f5213;
mov.f32 %f5216, %f5215;
$L__BB0_1679:
@%p1226 bra $L__BB0_1901;
shl.b32 %r7176, %r12, 5;
mov.u32 %r7177, -32;
sub.s32 %r2207, %r7177, %r7176;
setp.ge.s32 %p1422, %r14, %r2207;
@%p1422 bra $L__BB0_1693;
mul.f32 %f4632, %f5341, 0f3F22F983;
cvt.rni.s32.f32 %r8669, %f4632;
cvt.rn.f32.s32 %f4633, %r8669;
mov.f32 %f4634, 0fBFC90FDA;
fma.rn.f32 %f4635, %f4633, %f4634, %f5341;
mov.f32 %f4636, 0fB3A22168;
fma.rn.f32 %f4637, %f4633, %f4636, %f4635;
mov.f32 %f4638, 0fA7C234C5;
fma.rn.f32 %f5842, %f4633, %f4638, %f4637;
abs.f32 %f1897, %f5341;
setp.ltu.f32 %p1423, %f1897, 0f47CE4780;
@%p1423 bra $L__BB0_1689;
setp.eq.f32 %p1424, %f1897, 0f7F800000;
@%p1424 bra $L__BB0_1688;
bra.uni $L__BB0_1683;
$L__BB0_1688:
mov.f32 %f4641, 0f00000000;
mul.rn.f32 %f5842, %f5341, %f4641;
mov.u32 %r8669, 0;
bra.uni $L__BB0_1689;
$L__BB0_1683:
mov.b32 %r2209, %f5341;
shr.u32 %r7179, %r2209, 23;
and.b32 %r7180, %r7179, 255;
add.s32 %r2210, %r7180, -128;
shl.b32 %r7181, %r2209, 8;
or.b32 %r2211, %r7181, -2147483648;
shr.u32 %r2212, %r2210, 5;
mov.u64 %rd2744, 0;
mov.u32 %r8666, 0;
mov.u64 %rd2742, __cudart_i2opi_f;
mov.u64 %rd2743, %rd1;
$L__BB0_1684:
.pragma "nounroll";
ld.global.nc.u32 %r7182, [%rd2742];
mad.wide.u32 %rd2269, %r7182, %r2211, %rd2744;
shr.u64 %rd2744, %rd2269, 32;
st.local.u32 [%rd2743], %rd2269;
add.s64 %rd2743, %rd2743, 4;
add.s64 %rd2742, %rd2742, 4;
add.s32 %r8666, %r8666, 1;
setp.ne.s32 %p1425, %r8666, 6;
@%p1425 bra $L__BB0_1684;
st.local.u32 [%rd5], %rd2744;
mov.u32 %r7183, 4;
sub.s32 %r2215, %r7183, %r2212;
mov.u32 %r7184, 6;
sub.s32 %r7185, %r7184, %r2212;
mul.wide.s32 %rd2270, %r7185, 4;
add.s64 %rd2271, %rd1, %rd2270;
ld.local.u32 %r8667, [%rd2271];
ld.local.u32 %r8668, [%rd2271+-4];
and.b32 %r2218, %r2210, 31;
setp.eq.s32 %p1426, %r2218, 0;
@%p1426 bra $L__BB0_1687;
mov.u32 %r7186, 32;
sub.s32 %r7187, %r7186, %r2218;
shr.u32 %r7188, %r8668, %r7187;
shl.b32 %r7189, %r8667, %r2218;
add.s32 %r8667, %r7188, %r7189;
mul.wide.s32 %rd2272, %r2215, 4;
add.s64 %rd2273, %rd1, %rd2272;
ld.local.u32 %r7190, [%rd2273];
shr.u32 %r7191, %r7190, %r7187;
shl.b32 %r7192, %r8668, %r2218;
add.s32 %r8668, %r7191, %r7192;
$L__BB0_1687:
and.b32 %r7193, %r2209, -2147483648;
shr.u32 %r7194, %r8668, 30;
shl.b32 %r7195, %r8667, 2;
or.b32 %r7196, %r7194, %r7195;
shr.u32 %r7197, %r7196, 31;
shr.u32 %r7198, %r8667, 30;
add.s32 %r7199, %r7197, %r7198;
neg.s32 %r7200, %r7199;
setp.eq.s32 %p1427, %r7193, 0;
selp.b32 %r8669, %r7199, %r7200, %p1427;
setp.ne.s32 %p1428, %r7197, 0;
xor.b32 %r7201, %r7193, -2147483648;
selp.b32 %r7202, %r7201, %r7193, %p1428;
selp.b32 %r7203, -1, 0, %p1428;
xor.b32 %r7204, %r7196, %r7203;
shl.b32 %r7205, %r8668, 2;
xor.b32 %r7206, %r7205, %r7203;
cvt.u64.u32 %rd2274, %r7204;
cvt.u64.u32 %rd2275, %r7206;
bfi.b64 %rd2276, %rd2274, %rd2275, 32, 32;
cvt.rn.f64.s64 %fd221, %rd2276;
mul.f64 %fd222, %fd221, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4639, %fd222;
setp.eq.s32 %p1429, %r7202, 0;
neg.f32 %f4640, %f4639;
selp.f32 %f5842, %f4639, %f4640, %p1429;
$L__BB0_1689:
and.b32 %r2225, %r8669, 1;
setp.eq.s32 %p1430, %r2225, 0;
selp.f32 %f1901, %f5842, 0f3F800000, %p1430;
mul.rn.f32 %f1902, %f5842, %f5842;
mov.f32 %f5843, 0fB94D4153;
@%p1430 bra $L__BB0_1691;
mov.f32 %f4643, 0fBAB607ED;
mov.f32 %f4644, 0f37CBAC00;
fma.rn.f32 %f5843, %f4644, %f1902, %f4643;
$L__BB0_1691:
selp.f32 %f4645, 0f3C0885E4, 0f3D2AAABB, %p1430;
fma.rn.f32 %f4646, %f5843, %f1902, %f4645;
selp.f32 %f4647, 0fBE2AAAA8, 0fBEFFFFFF, %p1430;
fma.rn.f32 %f4648, %f4646, %f1902, %f4647;
mov.f32 %f4649, 0f00000000;
fma.rn.f32 %f4650, %f1902, %f1901, %f4649;
fma.rn.f32 %f5213, %f4648, %f4650, %f1901;
and.b32 %r7208, %r8669, 2;
setp.eq.s32 %p1432, %r7208, 0;
@%p1432 bra $L__BB0_1693;
mov.f32 %f4652, 0fBF800000;
fma.rn.f32 %f5213, %f5213, %f4652, %f4649;
$L__BB0_1693:
setp.lt.s32 %p1434, %r14, %r2207;
selp.f32 %f1909, %f5213, %f5214, %p1434;
@%p1422 bra $L__BB0_1706;
mul.f32 %f4653, %f5333, 0f3F22F983;
cvt.rni.s32.f32 %r8673, %f4653;
cvt.rn.f32.s32 %f4654, %r8673;
mov.f32 %f4655, 0fBFC90FDA;
fma.rn.f32 %f4656, %f4654, %f4655, %f5333;
mov.f32 %f4657, 0fB3A22168;
fma.rn.f32 %f4658, %f4654, %f4657, %f4656;
mov.f32 %f4659, 0fA7C234C5;
fma.rn.f32 %f5846, %f4654, %f4659, %f4658;
abs.f32 %f1911, %f5333;
setp.ltu.f32 %p1435, %f1911, 0f47CE4780;
@%p1435 bra $L__BB0_1702;
setp.eq.f32 %p1436, %f1911, 0f7F800000;
@%p1436 bra $L__BB0_1701;
bra.uni $L__BB0_1696;
$L__BB0_1701:
mov.f32 %f4662, 0f00000000;
mul.rn.f32 %f5846, %f5333, %f4662;
mov.u32 %r8673, 0;
bra.uni $L__BB0_1702;
$L__BB0_1696:
mov.b32 %r2227, %f5333;
shr.u32 %r7210, %r2227, 23;
and.b32 %r7211, %r7210, 255;
add.s32 %r2228, %r7211, -128;
shl.b32 %r7212, %r2227, 8;
or.b32 %r2229, %r7212, -2147483648;
shr.u32 %r2230, %r2228, 5;
mov.u64 %rd2747, 0;
mov.u32 %r8670, 0;
mov.u64 %rd2745, __cudart_i2opi_f;
mov.u64 %rd2746, %rd1;
$L__BB0_1697:
.pragma "nounroll";
ld.global.nc.u32 %r7213, [%rd2745];
mad.wide.u32 %rd2279, %r7213, %r2229, %rd2747;
shr.u64 %rd2747, %rd2279, 32;
st.local.u32 [%rd2746], %rd2279;
add.s64 %rd2746, %rd2746, 4;
add.s64 %rd2745, %rd2745, 4;
add.s32 %r8670, %r8670, 1;
setp.ne.s32 %p1437, %r8670, 6;
@%p1437 bra $L__BB0_1697;
st.local.u32 [%rd5], %rd2747;
mov.u32 %r7214, 4;
sub.s32 %r2233, %r7214, %r2230;
mov.u32 %r7215, 6;
sub.s32 %r7216, %r7215, %r2230;
mul.wide.s32 %rd2280, %r7216, 4;
add.s64 %rd2281, %rd1, %rd2280;
ld.local.u32 %r8671, [%rd2281];
ld.local.u32 %r8672, [%rd2281+-4];
and.b32 %r2236, %r2228, 31;
setp.eq.s32 %p1438, %r2236, 0;
@%p1438 bra $L__BB0_1700;
mov.u32 %r7217, 32;
sub.s32 %r7218, %r7217, %r2236;
shr.u32 %r7219, %r8672, %r7218;
shl.b32 %r7220, %r8671, %r2236;
add.s32 %r8671, %r7219, %r7220;
mul.wide.s32 %rd2282, %r2233, 4;
add.s64 %rd2283, %rd1, %rd2282;
ld.local.u32 %r7221, [%rd2283];
shr.u32 %r7222, %r7221, %r7218;
shl.b32 %r7223, %r8672, %r2236;
add.s32 %r8672, %r7222, %r7223;
$L__BB0_1700:
and.b32 %r7224, %r2227, -2147483648;
shr.u32 %r7225, %r8672, 30;
shl.b32 %r7226, %r8671, 2;
or.b32 %r7227, %r7225, %r7226;
shr.u32 %r7228, %r7227, 31;
shr.u32 %r7229, %r8671, 30;
add.s32 %r7230, %r7228, %r7229;
neg.s32 %r7231, %r7230;
setp.eq.s32 %p1439, %r7224, 0;
selp.b32 %r8673, %r7230, %r7231, %p1439;
setp.ne.s32 %p1440, %r7228, 0;
xor.b32 %r7232, %r7224, -2147483648;
selp.b32 %r7233, %r7232, %r7224, %p1440;
selp.b32 %r7234, -1, 0, %p1440;
xor.b32 %r7235, %r7227, %r7234;
shl.b32 %r7236, %r8672, 2;
xor.b32 %r7237, %r7236, %r7234;
cvt.u64.u32 %rd2284, %r7235;
cvt.u64.u32 %rd2285, %r7237;
bfi.b64 %rd2286, %rd2284, %rd2285, 32, 32;
cvt.rn.f64.s64 %fd223, %rd2286;
mul.f64 %fd224, %fd223, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4660, %fd224;
setp.eq.s32 %p1441, %r7233, 0;
neg.f32 %f4661, %f4660;
selp.f32 %f5846, %f4660, %f4661, %p1441;
$L__BB0_1702:
add.s32 %r2243, %r8673, 1;
and.b32 %r2244, %r2243, 1;
setp.eq.s32 %p1442, %r2244, 0;
selp.f32 %f1915, %f5846, 0f3F800000, %p1442;
mul.rn.f32 %f1916, %f5846, %f5846;
mov.f32 %f5847, 0fB94D4153;
@%p1442 bra $L__BB0_1704;
mov.f32 %f4664, 0fBAB607ED;
mov.f32 %f4665, 0f37CBAC00;
fma.rn.f32 %f5847, %f4665, %f1916, %f4664;
$L__BB0_1704:
selp.f32 %f4666, 0f3C0885E4, 0f3D2AAABB, %p1442;
fma.rn.f32 %f4667, %f5847, %f1916, %f4666;
selp.f32 %f4668, 0fBE2AAAA8, 0fBEFFFFFF, %p1442;
fma.rn.f32 %f4669, %f4667, %f1916, %f4668;
mov.f32 %f4670, 0f00000000;
fma.rn.f32 %f4671, %f1916, %f1915, %f4670;
fma.rn.f32 %f5215, %f4669, %f4671, %f1915;
and.b32 %r7239, %r2243, 2;
setp.eq.s32 %p1444, %r7239, 0;
@%p1444 bra $L__BB0_1706;
mov.f32 %f4673, 0fBF800000;
fma.rn.f32 %f5215, %f5215, %f4673, %f4670;
$L__BB0_1706:
selp.f32 %f1923, %f5215, %f5216, %p1434;
@%p1422 bra $L__BB0_1901;
add.f32 %f5898, %f1909, %f1923;
$L__BB0_1901:
shl.b32 %r2541, %r12, 1;
@%p32 bra $L__BB0_1903;
shl.b32 %r7804, %r12, 3;
neg.s32 %r7805, %r7804;
add.s32 %r7806, %r13, -12;
setp.lt.s32 %p1608, %r7806, %r7805;
@%p1608 bra $L__BB0_1905;
bra.uni $L__BB0_1903;
$L__BB0_1905:
add.f32 %f5025, %f5332, 0f00000000;
add.f32 %f5026, %f5025, %f5331;
add.f32 %f5027, %f5026, %f5330;
add.f32 %f5028, %f5027, %f5329;
add.f32 %f5029, %f5028, %f5328;
add.f32 %f5030, %f5029, %f5327;
add.f32 %f5031, %f5030, %f5326;
add.f32 %f5906, %f5031, %f5325;
bra.uni $L__BB0_1906;
$L__BB0_1903:
neg.s32 %r7807, %r2541;
add.s32 %r7808, %r13, -15;
setp.lt.s32 %p1609, %r7808, %r7807;
shl.b32 %r7809, %r12, 6;
neg.s32 %r7810, %r7809;
setp.lt.s32 %p1610, %r14, %r7810;
and.pred %p1611, %p1609, %p1610;
add.f32 %f5011, %f5332, 0f00000000;
selp.f32 %f5012, %f5011, 0f00000000, %p1611;
mov.u32 %r7811, -32;
sub.s32 %r2542, %r7811, %r7809;
setp.lt.s32 %p1612, %r14, %r2542;
add.f32 %f5013, %f5012, %f5331;
and.pred %p1613, %p1609, %p1612;
selp.f32 %f5014, %f5013, %f5012, %p1613;
not.b32 %r7812, %r2541;
setp.lt.s32 %p1614, %r7808, %r7812;
add.f32 %f5015, %f5014, %f5330;
and.pred %p1615, %p1614, %p1610;
selp.f32 %f5016, %f5015, %f5014, %p1615;
add.f32 %f5017, %f5016, %f5329;
and.pred %p1616, %p1614, %p1612;
selp.f32 %f5018, %f5017, %f5016, %p1616;
mov.u32 %r7813, -2;
sub.s32 %r7814, %r7813, %r2541;
setp.lt.s32 %p1617, %r7808, %r7814;
add.f32 %f5019, %f5018, %f5328;
and.pred %p1618, %p1617, %p1610;
selp.f32 %f5020, %f5019, %f5018, %p1618;
add.f32 %f5021, %f5020, %f5327;
and.pred %p1619, %p1617, %p1612;
selp.f32 %f5022, %f5021, %f5020, %p1619;
mov.u32 %r7815, -3;
sub.s32 %r7816, %r7815, %r2541;
setp.ge.s32 %p1620, %r7808, %r7816;
setp.lt.s32 %p1621, %r7808, %r7816;
add.f32 %f5023, %f5022, %f5326;
and.pred %p1622, %p1621, %p1610;
selp.f32 %f5906, %f5023, %f5022, %p1622;
@%p1620 bra $L__BB0_1906;
add.f32 %f5024, %f5906, %f5325;
selp.f32 %f5906, %f5024, %f5906, %p1612;
$L__BB0_1906:
@%p32 bra $L__BB0_1908;
shl.b32 %r7817, %r12, 3;
mov.u32 %r7818, -4;
sub.s32 %r7819, %r7818, %r7817;
add.s32 %r7820, %r13, -12;
setp.lt.s32 %p1625, %r7820, %r7819;
@%p1625 bra $L__BB0_1910;
bra.uni $L__BB0_1908;
$L__BB0_1910:
add.f32 %f5046, %f5906, %f5523;
add.f32 %f5047, %f5046, %f5522;
add.f32 %f5048, %f5047, %f5521;
add.f32 %f5049, %f5048, %f5520;
add.f32 %f5050, %f5049, %f5519;
add.f32 %f5051, %f5050, %f5518;
add.f32 %f5052, %f5051, %f5517;
add.f32 %f5907, %f5052, %f5516;
bra.uni $L__BB0_1911;
$L__BB0_1908:
mov.u32 %r7821, -4;
sub.s32 %r7822, %r7821, %r2541;
add.s32 %r7823, %r13, -15;
setp.lt.s32 %p1626, %r7823, %r7822;
shl.b32 %r7824, %r12, 6;
neg.s32 %r7825, %r7824;
setp.lt.s32 %p1627, %r14, %r7825;
and.pred %p1628, %p1626, %p1627;
add.f32 %f5032, %f5906, %f5523;
selp.f32 %f5033, %f5032, %f5906, %p1628;
mov.u32 %r7826, -32;
sub.s32 %r2543, %r7826, %r7824;
setp.lt.s32 %p1629, %r14, %r2543;
add.f32 %f5034, %f5033, %f5522;
and.pred %p1630, %p1626, %p1629;
selp.f32 %f5035, %f5034, %f5033, %p1630;
mov.u32 %r7827, -5;
sub.s32 %r7828, %r7827, %r2541;
setp.lt.s32 %p1631, %r7823, %r7828;
add.f32 %f5036, %f5035, %f5521;
and.pred %p1632, %p1631, %p1627;
selp.f32 %f5037, %f5036, %f5035, %p1632;
add.f32 %f5038, %f5037, %f5520;
and.pred %p1633, %p1631, %p1629;
selp.f32 %f5039, %f5038, %f5037, %p1633;
mov.u32 %r7829, -6;
sub.s32 %r7830, %r7829, %r2541;
setp.lt.s32 %p1634, %r7823, %r7830;
add.f32 %f5040, %f5039, %f5519;
and.pred %p1635, %p1634, %p1627;
selp.f32 %f5041, %f5040, %f5039, %p1635;
add.f32 %f5042, %f5041, %f5518;
and.pred %p1636, %p1634, %p1629;
selp.f32 %f5043, %f5042, %f5041, %p1636;
mov.u32 %r7831, -7;
sub.s32 %r7832, %r7831, %r2541;
setp.ge.s32 %p1637, %r7823, %r7832;
setp.lt.s32 %p1638, %r7823, %r7832;
add.f32 %f5044, %f5043, %f5517;
and.pred %p1639, %p1638, %p1627;
selp.f32 %f5907, %f5044, %f5043, %p1639;
@%p1637 bra $L__BB0_1911;
add.f32 %f5045, %f5907, %f5516;
selp.f32 %f5907, %f5045, %f5907, %p1629;
$L__BB0_1911:
@%p32 bra $L__BB0_1913;
shl.b32 %r7833, %r12, 3;
mov.u32 %r7834, -8;
sub.s32 %r7835, %r7834, %r7833;
add.s32 %r7836, %r13, -12;
setp.lt.s32 %p1642, %r7836, %r7835;
@%p1642 bra $L__BB0_1915;
bra.uni $L__BB0_1913;
$L__BB0_1915:
add.f32 %f5067, %f5907, %f5714;
add.f32 %f5068, %f5067, %f5713;
add.f32 %f5069, %f5068, %f5712;
add.f32 %f5070, %f5069, %f5711;
add.f32 %f5071, %f5070, %f5710;
add.f32 %f5072, %f5071, %f5709;
add.f32 %f5073, %f5072, %f5708;
add.f32 %f5908, %f5073, %f5707;
bra.uni $L__BB0_1916;
$L__BB0_1913:
mov.u32 %r7837, -8;
sub.s32 %r7838, %r7837, %r2541;
add.s32 %r7839, %r13, -15;
setp.lt.s32 %p1643, %r7839, %r7838;
shl.b32 %r7840, %r12, 6;
neg.s32 %r7841, %r7840;
setp.lt.s32 %p1644, %r14, %r7841;
and.pred %p1645, %p1643, %p1644;
add.f32 %f5053, %f5907, %f5714;
selp.f32 %f5054, %f5053, %f5907, %p1645;
mov.u32 %r7842, -32;
sub.s32 %r2544, %r7842, %r7840;
setp.lt.s32 %p1646, %r14, %r2544;
add.f32 %f5055, %f5054, %f5713;
and.pred %p1647, %p1643, %p1646;
selp.f32 %f5056, %f5055, %f5054, %p1647;
mov.u32 %r7843, -9;
sub.s32 %r7844, %r7843, %r2541;
setp.lt.s32 %p1648, %r7839, %r7844;
add.f32 %f5057, %f5056, %f5712;
and.pred %p1649, %p1648, %p1644;
selp.f32 %f5058, %f5057, %f5056, %p1649;
add.f32 %f5059, %f5058, %f5711;
and.pred %p1650, %p1648, %p1646;
selp.f32 %f5060, %f5059, %f5058, %p1650;
mov.u32 %r7845, -10;
sub.s32 %r7846, %r7845, %r2541;
setp.lt.s32 %p1651, %r7839, %r7846;
add.f32 %f5061, %f5060, %f5710;
and.pred %p1652, %p1651, %p1644;
selp.f32 %f5062, %f5061, %f5060, %p1652;
add.f32 %f5063, %f5062, %f5709;
and.pred %p1653, %p1651, %p1646;
selp.f32 %f5064, %f5063, %f5062, %p1653;
mov.u32 %r7847, -11;
sub.s32 %r7848, %r7847, %r2541;
setp.ge.s32 %p1654, %r7839, %r7848;
setp.lt.s32 %p1655, %r7839, %r7848;
add.f32 %f5065, %f5064, %f5708;
and.pred %p1656, %p1655, %p1644;
selp.f32 %f5908, %f5065, %f5064, %p1656;
@%p1654 bra $L__BB0_1916;
add.f32 %f5066, %f5908, %f5707;
selp.f32 %f5908, %f5066, %f5908, %p1646;
$L__BB0_1916:
@%p32 bra $L__BB0_1918;
shl.b32 %r7849, %r12, 3;
mov.u32 %r7850, -12;
sub.s32 %r7851, %r7850, %r7849;
add.s32 %r7852, %r13, -12;
setp.lt.s32 %p1659, %r7852, %r7851;
@%p1659 bra $L__BB0_1920;
bra.uni $L__BB0_1918;
$L__BB0_1920:
add.f32 %f5088, %f5908, %f5905;
add.f32 %f5089, %f5088, %f5904;
add.f32 %f5090, %f5089, %f5903;
add.f32 %f5091, %f5090, %f5902;
add.f32 %f5092, %f5091, %f5901;
add.f32 %f5093, %f5092, %f5900;
add.f32 %f5094, %f5093, %f5899;
add.f32 %f5909, %f5094, %f5898;
bra.uni $L__BB0_1921;
$L__BB0_1918:
mov.u32 %r7853, -12;
sub.s32 %r7854, %r7853, %r2541;
add.s32 %r7855, %r13, -15;
mov.u32 %r7856, -15;
setp.lt.s32 %p1660, %r7855, %r7854;
shl.b32 %r7857, %r12, 6;
neg.s32 %r7858, %r7857;
setp.lt.s32 %p1661, %r14, %r7858;
and.pred %p1662, %p1660, %p1661;
add.f32 %f5074, %f5908, %f5905;
selp.f32 %f5075, %f5074, %f5908, %p1662;
mov.u32 %r7859, -32;
sub.s32 %r2545, %r7859, %r7857;
setp.lt.s32 %p1663, %r14, %r2545;
add.f32 %f5076, %f5075, %f5904;
and.pred %p1664, %p1660, %p1663;
selp.f32 %f5077, %f5076, %f5075, %p1664;
mov.u32 %r7860, -13;
sub.s32 %r7861, %r7860, %r2541;
setp.lt.s32 %p1665, %r7855, %r7861;
add.f32 %f5078, %f5077, %f5903;
and.pred %p1666, %p1665, %p1661;
selp.f32 %f5079, %f5078, %f5077, %p1666;
add.f32 %f5080, %f5079, %f5902;
and.pred %p1667, %p1665, %p1663;
selp.f32 %f5081, %f5080, %f5079, %p1667;
mov.u32 %r7862, -14;
sub.s32 %r7863, %r7862, %r2541;
setp.lt.s32 %p1668, %r7855, %r7863;
add.f32 %f5082, %f5081, %f5901;
and.pred %p1669, %p1668, %p1661;
selp.f32 %f5083, %f5082, %f5081, %p1669;
add.f32 %f5084, %f5083, %f5900;
and.pred %p1670, %p1668, %p1663;
selp.f32 %f5085, %f5084, %f5083, %p1670;
sub.s32 %r7864, %r7856, %r2541;
setp.ge.s32 %p1671, %r7855, %r7864;
setp.lt.s32 %p1672, %r7855, %r7864;
add.f32 %f5086, %f5085, %f5899;
and.pred %p1673, %p1672, %p1661;
selp.f32 %f5909, %f5086, %f5085, %p1673;
@%p1671 bra $L__BB0_1921;
add.f32 %f5087, %f5909, %f5898;
selp.f32 %f5909, %f5087, %f5909, %p1663;
$L__BB0_1921:
shl.b32 %r2546, %r12, 2;
or.b32 %r2547, %r1, %r13;
mov.u32 %r7865, %ntid.z;
mov.u32 %r2548, %ntid.x;
mul.lo.s32 %r2549, %r2548, %r7865;
mul.lo.s32 %r7866, %r13, %r2548;
add.s32 %r2550, %r7866, %r1;
mov.u32 %r2551, %tid.y;
mul.lo.s32 %r2552, %r2551, %r2548;
add.s32 %r7867, %r2552, %r1;
mov.u32 %r2553, %ntid.y;
mad.lo.s32 %r2554, %r7866, %r2553, %r7867;
mul.wide.u32 %rd2473, %r2554, 4;
mov.u64 %rd2474, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_e06bf064_191105arrayE;
add.s64 %rd602, %rd2474, %rd2473;
st.shared.f32 [%rd602], %f5909;
bar.sync 0;
clz.b32 %r7868, %r2549;
mov.u32 %r7869, 31;
sub.s32 %r7870, %r7869, %r7868;
mov.u32 %r7871, 1;
shl.b32 %r8739, %r7871, %r7870;
setp.ge.u32 %p1675, %r2550, %r8739;
add.s32 %r2556, %r8739, %r2550;
setp.ge.u32 %p1676, %r2556, %r2549;
or.pred %p1677, %p1675, %p1676;
@%p1677 bra $L__BB0_1925;
add.s32 %r8738, %r8739, %r2554;
setp.lt.u32 %p1678, %r2553, 2;
@%p1678 bra $L__BB0_1924;
rem.u32 %r7872, %r2556, %r2548;
add.s32 %r7873, %r7872, %r2552;
sub.s32 %r7874, %r2556, %r7872;
mad.lo.s32 %r8738, %r7874, %r2553, %r7873;
$L__BB0_1924:
mul.wide.s32 %rd2475, %r8738, 4;
add.s64 %rd2477, %rd2474, %rd2475;
ld.shared.f32 %f5095, [%rd602];
ld.shared.f32 %f5096, [%rd2477];
add.f32 %f5097, %f5096, %f5095;
st.shared.f32 [%rd602], %f5097;
$L__BB0_1925:
bar.sync 0;
setp.lt.s32 %p1679, %r8739, 4;
@%p1679 bra $L__BB0_1931;
$L__BB0_1926:
shr.u32 %r2561, %r8739, 1;
setp.ge.u32 %p1680, %r2550, %r2561;
@%p1680 bra $L__BB0_1930;
setp.lt.u32 %p1681, %r2553, 2;
add.s32 %r8740, %r2561, %r2554;
@%p1681 bra $L__BB0_1929;
add.s32 %r7875, %r2561, %r2550;
rem.u32 %r7876, %r7875, %r2548;
add.s32 %r7877, %r7876, %r2552;
sub.s32 %r7878, %r7875, %r7876;
mad.lo.s32 %r8740, %r7878, %r2553, %r7877;
$L__BB0_1929:
mul.wide.s32 %rd2478, %r8740, 4;
add.s64 %rd2480, %rd2474, %rd2478;
ld.shared.f32 %f5098, [%rd602];
ld.shared.f32 %f5099, [%rd2480];
add.f32 %f5100, %f5099, %f5098;
st.shared.f32 [%rd602], %f5100;
$L__BB0_1930:
bar.sync 0;
setp.gt.u32 %p1682, %r8739, 7;
mov.u32 %r8739, %r2561;
@%p1682 bra $L__BB0_1926;
$L__BB0_1931:
setp.ne.s32 %p1683, %r2547, 0;
mov.f32 %f5910, 0f00000000;
@%p1683 bra $L__BB0_1934;
ld.shared.f32 %f5102, [%rd602];
add.f32 %f5910, %f5102, 0f00000000;
setp.lt.u32 %p1684, %r2549, 2;
@%p1684 bra $L__BB0_1934;
add.s32 %r7879, %r2554, 1;
mul.wide.u32 %rd2481, %r7879, 4;
add.s64 %rd2483, %rd2474, %rd2481;
ld.shared.f32 %f5103, [%rd2483];
add.f32 %f5910, %f5910, %f5103;
$L__BB0_1934:
bar.sync 0;
mul.wide.s32 %rd2484, %r2551, 4;
add.s64 %rd603, %rd2474, %rd2484;
setp.eq.s32 %p1685, %r2547, 0;
@%p1685 bra $L__BB0_1935;
bra.uni $L__BB0_1936;
$L__BB0_1935:
st.shared.f32 [%rd603], %f5910;
$L__BB0_1936:
bar.sync 0;
ld.shared.f32 %f2168, [%rd603];
bar.sync 0;
@%p32 bra $L__BB0_1938;
shl.b32 %r7880, %r12, 4;
neg.s32 %r7881, %r7880;
add.s32 %r7882, %r13, -12;
setp.lt.s32 %p1687, %r7882, %r7881;
@%p1687 bra $L__BB0_1962;
bra.uni $L__BB0_1938;
$L__BB0_1962:
add.f32 %f5105, %f2168, %f5332;
mov.u32 %r7920, %ctaid.x;
mad.lo.s32 %r7921, %r7920, 63, %r1;
mad.lo.s32 %r7922, %r13, 2646, %r7921;
shl.b32 %r7923, %r12, 7;
add.s32 %r7924, %r7922, %r7923;
mul.wide.s32 %rd2487, %r7924, 4;
add.s64 %rd2488, %rd4, %rd2487;
st.global.f32 [%rd2488], %f5105;
add.f32 %f5106, %f2168, %f5331;
st.global.f32 [%rd2488+128], %f5106;
add.f32 %f5107, %f2168, %f5330;
st.global.f32 [%rd2488+10584], %f5107;
add.f32 %f5108, %f2168, %f5329;
st.global.f32 [%rd2488+10712], %f5108;
add.f32 %f5109, %f2168, %f5328;
st.global.f32 [%rd2488+21168], %f5109;
add.f32 %f5110, %f2168, %f5327;
st.global.f32 [%rd2488+21296], %f5110;
add.f32 %f5111, %f2168, %f5326;
st.global.f32 [%rd2488+31752], %f5111;
add.f32 %f5112, %f2168, %f5325;
st.global.f32 [%rd2488+31880], %f5112;
bra.uni $L__BB0_1963;
$L__BB0_1938:
neg.s32 %r2565, %r2546;
add.s32 %r7883, %r13, -15;
setp.ge.s32 %p1688, %r7883, %r2565;
add.f32 %f5926, %f2168, %f5332;
add.f32 %f5925, %f2168, %f5331;
add.f32 %f5924, %f2168, %f5330;
add.f32 %f5923, %f2168, %f5329;
add.f32 %f5922, %f2168, %f5328;
add.f32 %f5921, %f2168, %f5327;
add.f32 %f5920, %f2168, %f5326;
add.f32 %f5919, %f2168, %f5325;
mov.u32 %r7884, %ctaid.x;
mad.lo.s32 %r7885, %r7884, 63, %r1;
mad.lo.s32 %r7886, %r13, 2646, %r7885;
shl.b32 %r2566, %r12, 7;
add.s32 %r7887, %r7886, %r2566;
mul.wide.s32 %rd2486, %r7887, 4;
add.s64 %rd604, %rd4, %rd2486;
@%p1688 bra $L__BB0_1941;
neg.s32 %r7888, %r2566;
setp.ge.s32 %p1689, %r14, %r7888;
@%p1689 bra $L__BB0_1941;
st.global.f32 [%rd604], %f5926;
$L__BB0_1941:
@%p1688 bra $L__BB0_1944;
mov.u32 %r7891, -32;
sub.s32 %r7892, %r7891, %r2566;
setp.ge.s32 %p1691, %r14, %r7892;
@%p1691 bra $L__BB0_1944;
st.global.f32 [%rd604+128], %f5925;
$L__BB0_1944:
add.s32 %r7894, %r2565, -1;
setp.ge.s32 %p1692, %r7883, %r7894;
@%p1692 bra $L__BB0_1947;
neg.s32 %r7896, %r2566;
setp.ge.s32 %p1693, %r14, %r7896;
@%p1693 bra $L__BB0_1947;
st.global.f32 [%rd604+10584], %f5924;
$L__BB0_1947:
@%p1692 bra $L__BB0_1950;
mov.u32 %r7900, -32;
sub.s32 %r7901, %r7900, %r2566;
setp.ge.s32 %p1695, %r14, %r7901;
@%p1695 bra $L__BB0_1950;
st.global.f32 [%rd604+10712], %f5923;
$L__BB0_1950:
add.s32 %r7903, %r2565, -2;
setp.ge.s32 %p1696, %r7883, %r7903;
@%p1696 bra $L__BB0_1953;
neg.s32 %r7905, %r2566;
setp.ge.s32 %p1697, %r14, %r7905;
@%p1697 bra $L__BB0_1953;
st.global.f32 [%rd604+21168], %f5922;
$L__BB0_1953:
@%p1696 bra $L__BB0_1956;
mov.u32 %r7909, -32;
sub.s32 %r7910, %r7909, %r2566;
setp.ge.s32 %p1699, %r14, %r7910;
@%p1699 bra $L__BB0_1956;
st.global.f32 [%rd604+21296], %f5921;
$L__BB0_1956:
add.s32 %r7912, %r2565, -3;
setp.ge.s32 %p1700, %r7883, %r7912;
@%p1700 bra $L__BB0_1959;
neg.s32 %r7914, %r2566;
setp.ge.s32 %p1701, %r14, %r7914;
@%p1701 bra $L__BB0_1959;
st.global.f32 [%rd604+31752], %f5920;
$L__BB0_1959:
@%p1700 bra $L__BB0_1963;
mov.u32 %r7918, -32;
sub.s32 %r7919, %r7918, %r2566;
setp.ge.s32 %p1703, %r14, %r7919;
@%p1703 bra $L__BB0_1963;
st.global.f32 [%rd604+31880], %f5919;
$L__BB0_1963:
@%p32 bra $L__BB0_1965;
shl.b32 %r7925, %r12, 4;
mov.u32 %r7926, -4;
sub.s32 %r7927, %r7926, %r7925;
add.s32 %r7928, %r13, -12;
setp.lt.s32 %p1705, %r7928, %r7927;
@%p1705 bra $L__BB0_1989;
bra.uni $L__BB0_1965;
$L__BB0_1989:
add.f32 %f5121, %f2168, %f5523;
mov.u32 %r7962, %ctaid.x;
mad.lo.s32 %r7963, %r7962, 63, %r1;
mad.lo.s32 %r7964, %r13, 2646, %r7963;
shl.b32 %r7965, %r12, 7;
add.s32 %r7966, %r7964, %r7965;
add.s32 %r7967, %r7966, 10584;
mul.wide.s32 %rd2490, %r7967, 4;
add.s64 %rd2491, %rd4, %rd2490;
st.global.f32 [%rd2491], %f5121;
add.f32 %f5122, %f2168, %f5522;
st.global.f32 [%rd2491+128], %f5122;
add.f32 %f5123, %f2168, %f5521;
st.global.f32 [%rd2491+10584], %f5123;
add.f32 %f5124, %f2168, %f5520;
st.global.f32 [%rd2491+10712], %f5124;
add.f32 %f5125, %f2168, %f5519;
st.global.f32 [%rd2491+21168], %f5125;
add.f32 %f5126, %f2168, %f5518;
st.global.f32 [%rd2491+21296], %f5126;
add.f32 %f5127, %f2168, %f5517;
st.global.f32 [%rd2491+31752], %f5127;
add.f32 %f5128, %f2168, %f5516;
st.global.f32 [%rd2491+31880], %f5128;
bra.uni $L__BB0_1990;
$L__BB0_1965:
mov.u32 %r7929, -4;
sub.s32 %r2567, %r7929, %r2546;
add.s32 %r7930, %r13, -15;
setp.ge.s32 %p1706, %r7930, %r2567;
setp.lt.s32 %p1707, %r7930, %r2567;
shl.b32 %r7931, %r12, 7;
neg.s32 %r2568, %r7931;
setp.lt.s32 %p1708, %r14, %r2568;
and.pred %p1709, %p1707, %p1708;
add.f32 %f5113, %f2168, %f5523;
selp.f32 %f5926, %f5113, %f5926, %p1709;
mov.u32 %r7932, -32;
sub.s32 %r7933, %r7932, %r7931;
setp.lt.s32 %p1710, %r14, %r7933;
and.pred %p1711, %p1707, %p1710;
add.f32 %f5114, %f2168, %f5522;
selp.f32 %f5925, %f5114, %f5925, %p1711;
mov.u32 %r7934, -5;
sub.s32 %r7935, %r7934, %r2546;
setp.lt.s32 %p1712, %r7930, %r7935;
and.pred %p1713, %p1712, %p1708;
add.f32 %f5115, %f2168, %f5521;
selp.f32 %f5924, %f5115, %f5924, %p1713;
and.pred %p1714, %p1712, %p1710;
add.f32 %f5116, %f2168, %f5520;
selp.f32 %f5923, %f5116, %f5923, %p1714;
mov.u32 %r7936, -6;
sub.s32 %r7937, %r7936, %r2546;
setp.lt.s32 %p1715, %r7930, %r7937;
and.pred %p1716, %p1715, %p1708;
add.f32 %f5117, %f2168, %f5519;
selp.f32 %f5922, %f5117, %f5922, %p1716;
and.pred %p1717, %p1715, %p1710;
add.f32 %f5118, %f2168, %f5518;
selp.f32 %f5921, %f5118, %f5921, %p1717;
mov.u32 %r7938, -7;
sub.s32 %r7939, %r7938, %r2546;
setp.lt.s32 %p1718, %r7930, %r7939;
and.pred %p1719, %p1718, %p1708;
add.f32 %f5119, %f2168, %f5517;
selp.f32 %f5920, %f5119, %f5920, %p1719;
and.pred %p1720, %p1718, %p1710;
add.f32 %f5120, %f2168, %f5516;
selp.f32 %f5919, %f5120, %f5919, %p1720;
mov.u32 %r7940, %ctaid.x;
mad.lo.s32 %r7941, %r7940, 63, %r1;
mad.lo.s32 %r7942, %r13, 2646, %r7941;
add.s32 %r7943, %r7942, %r7931;
add.s32 %r7944, %r7943, 10584;
mul.wide.s32 %rd2489, %r7944, 4;
add.s64 %rd605, %rd4, %rd2489;
@%p1706 bra $L__BB0_1968;
setp.ge.s32 %p1721, %r14, %r2568;
@%p1721 bra $L__BB0_1968;
st.global.f32 [%rd605], %f5926;
$L__BB0_1968:
@%p1706 bra $L__BB0_1971;
add.s32 %r7946, %r2568, -32;
setp.ge.s32 %p1723, %r14, %r7946;
@%p1723 bra $L__BB0_1971;
st.global.f32 [%rd605+128], %f5925;
$L__BB0_1971:
add.s32 %r7948, %r2567, -1;
setp.ge.s32 %p1724, %r7930, %r7948;
@%p1724 bra $L__BB0_1974;
setp.ge.s32 %p1725, %r14, %r2568;
@%p1725 bra $L__BB0_1974;
st.global.f32 [%rd605+10584], %f5924;
$L__BB0_1974:
@%p1724 bra $L__BB0_1977;
add.s32 %r7951, %r2568, -32;
setp.ge.s32 %p1727, %r14, %r7951;
@%p1727 bra $L__BB0_1977;
st.global.f32 [%rd605+10712], %f5923;
$L__BB0_1977:
add.s32 %r7953, %r2567, -2;
setp.ge.s32 %p1728, %r7930, %r7953;
@%p1728 bra $L__BB0_1980;
setp.ge.s32 %p1729, %r14, %r2568;
@%p1729 bra $L__BB0_1980;
st.global.f32 [%rd605+21168], %f5922;
$L__BB0_1980:
@%p1728 bra $L__BB0_1983;
add.s32 %r7956, %r2568, -32;
setp.ge.s32 %p1731, %r14, %r7956;
@%p1731 bra $L__BB0_1983;
st.global.f32 [%rd605+21296], %f5921;
$L__BB0_1983:
add.s32 %r7958, %r2567, -3;
setp.ge.s32 %p1732, %r7930, %r7958;
@%p1732 bra $L__BB0_1986;
setp.ge.s32 %p1733, %r14, %r2568;
@%p1733 bra $L__BB0_1986;
st.global.f32 [%rd605+31752], %f5920;
$L__BB0_1986:
@%p1732 bra $L__BB0_1990;
add.s32 %r7961, %r2568, -32;
setp.ge.s32 %p1735, %r14, %r7961;
@%p1735 bra $L__BB0_1990;
st.global.f32 [%rd605+31880], %f5919;
$L__BB0_1990:
@%p32 bra $L__BB0_1992;
shl.b32 %r7968, %r12, 4;
mov.u32 %r7969, -8;
sub.s32 %r7970, %r7969, %r7968;
add.s32 %r7971, %r13, -12;
setp.lt.s32 %p1737, %r7971, %r7970;
@%p1737 bra $L__BB0_2016;
bra.uni $L__BB0_1992;
$L__BB0_2016:
add.f32 %f5137, %f2168, %f5714;
mov.u32 %r8005, %ctaid.x;
mad.lo.s32 %r8006, %r8005, 63, %r1;
mad.lo.s32 %r8007, %r13, 2646, %r8006;
shl.b32 %r8008, %r12, 7;
add.s32 %r8009, %r8007, %r8008;
add.s32 %r8010, %r8009, 21168;
mul.wide.s32 %rd2493, %r8010, 4;
add.s64 %rd2494, %rd4, %rd2493;
st.global.f32 [%rd2494], %f5137;
add.f32 %f5138, %f2168, %f5713;
st.global.f32 [%rd2494+128], %f5138;
add.f32 %f5139, %f2168, %f5712;
st.global.f32 [%rd2494+10584], %f5139;
add.f32 %f5140, %f2168, %f5711;
st.global.f32 [%rd2494+10712], %f5140;
add.f32 %f5141, %f2168, %f5710;
st.global.f32 [%rd2494+21168], %f5141;
add.f32 %f5142, %f2168, %f5709;
st.global.f32 [%rd2494+21296], %f5142;
add.f32 %f5143, %f2168, %f5708;
st.global.f32 [%rd2494+31752], %f5143;
add.f32 %f5144, %f2168, %f5707;
st.global.f32 [%rd2494+31880], %f5144;
bra.uni $L__BB0_2017;
$L__BB0_1992:
mov.u32 %r7972, -8;
sub.s32 %r2569, %r7972, %r2546;
add.s32 %r7973, %r13, -15;
setp.ge.s32 %p1738, %r7973, %r2569;
setp.lt.s32 %p1739, %r7973, %r2569;
shl.b32 %r7974, %r12, 7;
neg.s32 %r2570, %r7974;
setp.lt.s32 %p1740, %r14, %r2570;
and.pred %p1741, %p1739, %p1740;
add.f32 %f5129, %f2168, %f5714;
selp.f32 %f5926, %f5129, %f5926, %p1741;
mov.u32 %r7975, -32;
sub.s32 %r7976, %r7975, %r7974;
setp.lt.s32 %p1742, %r14, %r7976;
and.pred %p1743, %p1739, %p1742;
add.f32 %f5130, %f2168, %f5713;
selp.f32 %f5925, %f5130, %f5925, %p1743;
mov.u32 %r7977, -9;
sub.s32 %r7978, %r7977, %r2546;
setp.lt.s32 %p1744, %r7973, %r7978;
and.pred %p1745, %p1744, %p1740;
add.f32 %f5131, %f2168, %f5712;
selp.f32 %f5924, %f5131, %f5924, %p1745;
and.pred %p1746, %p1744, %p1742;
add.f32 %f5132, %f2168, %f5711;
selp.f32 %f5923, %f5132, %f5923, %p1746;
mov.u32 %r7979, -10;
sub.s32 %r7980, %r7979, %r2546;
setp.lt.s32 %p1747, %r7973, %r7980;
and.pred %p1748, %p1747, %p1740;
add.f32 %f5133, %f2168, %f5710;
selp.f32 %f5922, %f5133, %f5922, %p1748;
and.pred %p1749, %p1747, %p1742;
add.f32 %f5134, %f2168, %f5709;
selp.f32 %f5921, %f5134, %f5921, %p1749;
mov.u32 %r7981, -11;
sub.s32 %r7982, %r7981, %r2546;
setp.lt.s32 %p1750, %r7973, %r7982;
and.pred %p1751, %p1750, %p1740;
add.f32 %f5135, %f2168, %f5708;
selp.f32 %f5920, %f5135, %f5920, %p1751;
and.pred %p1752, %p1750, %p1742;
add.f32 %f5136, %f2168, %f5707;
selp.f32 %f5919, %f5136, %f5919, %p1752;
mov.u32 %r7983, %ctaid.x;
mad.lo.s32 %r7984, %r7983, 63, %r1;
mad.lo.s32 %r7985, %r13, 2646, %r7984;
add.s32 %r7986, %r7985, %r7974;
add.s32 %r7987, %r7986, 21168;
mul.wide.s32 %rd2492, %r7987, 4;
add.s64 %rd606, %rd4, %rd2492;
@%p1738 bra $L__BB0_1995;
setp.ge.s32 %p1753, %r14, %r2570;
@%p1753 bra $L__BB0_1995;
st.global.f32 [%rd606], %f5926;
$L__BB0_1995:
@%p1738 bra $L__BB0_1998;
add.s32 %r7989, %r2570, -32;
setp.ge.s32 %p1755, %r14, %r7989;
@%p1755 bra $L__BB0_1998;
st.global.f32 [%rd606+128], %f5925;
$L__BB0_1998:
add.s32 %r7991, %r2569, -1;
setp.ge.s32 %p1756, %r7973, %r7991;
@%p1756 bra $L__BB0_2001;
setp.ge.s32 %p1757, %r14, %r2570;
@%p1757 bra $L__BB0_2001;
st.global.f32 [%rd606+10584], %f5924;
$L__BB0_2001:
@%p1756 bra $L__BB0_2004;
add.s32 %r7994, %r2570, -32;
setp.ge.s32 %p1759, %r14, %r7994;
@%p1759 bra $L__BB0_2004;
st.global.f32 [%rd606+10712], %f5923;
$L__BB0_2004:
add.s32 %r7996, %r2569, -2;
setp.ge.s32 %p1760, %r7973, %r7996;
@%p1760 bra $L__BB0_2007;
setp.ge.s32 %p1761, %r14, %r2570;
@%p1761 bra $L__BB0_2007;
st.global.f32 [%rd606+21168], %f5922;
$L__BB0_2007:
@%p1760 bra $L__BB0_2010;
add.s32 %r7999, %r2570, -32;
setp.ge.s32 %p1763, %r14, %r7999;
@%p1763 bra $L__BB0_2010;
st.global.f32 [%rd606+21296], %f5921;
$L__BB0_2010:
add.s32 %r8001, %r2569, -3;
setp.ge.s32 %p1764, %r7973, %r8001;
@%p1764 bra $L__BB0_2013;
setp.ge.s32 %p1765, %r14, %r2570;
@%p1765 bra $L__BB0_2013;
st.global.f32 [%rd606+31752], %f5920;
$L__BB0_2013:
@%p1764 bra $L__BB0_2017;
add.s32 %r8004, %r2570, -32;
setp.ge.s32 %p1767, %r14, %r8004;
@%p1767 bra $L__BB0_2017;
st.global.f32 [%rd606+31880], %f5919;
$L__BB0_2017:
@%p32 bra $L__BB0_2019;
shl.b32 %r8011, %r12, 4;
mov.u32 %r8012, -12;
sub.s32 %r8013, %r8012, %r8011;
add.s32 %r8014, %r13, -12;
setp.lt.s32 %p1769, %r8014, %r8013;
@%p1769 bra $L__BB0_2043;
bra.uni $L__BB0_2019;
$L__BB0_2043:
add.f32 %f5153, %f2168, %f5905;
mov.u32 %r8034, %ctaid.x;
mad.lo.s32 %r8035, %r8034, 63, %r1;
mad.lo.s32 %r8036, %r13, 2646, %r8035;
shl.b32 %r8037, %r12, 7;
add.s32 %r8038, %r8036, %r8037;
add.s32 %r8039, %r8038, 31752;
mul.wide.s32 %rd2496, %r8039, 4;
add.s64 %rd2497, %rd4, %rd2496;
st.global.f32 [%rd2497], %f5153;
add.f32 %f5154, %f2168, %f5904;
st.global.f32 [%rd2497+128], %f5154;
add.f32 %f5155, %f2168, %f5903;
st.global.f32 [%rd2497+10584], %f5155;
add.f32 %f5156, %f2168, %f5902;
st.global.f32 [%rd2497+10712], %f5156;
add.f32 %f5157, %f2168, %f5901;
st.global.f32 [%rd2497+21168], %f5157;
add.f32 %f5158, %f2168, %f5900;
st.global.f32 [%rd2497+21296], %f5158;
add.f32 %f5159, %f2168, %f5899;
st.global.f32 [%rd2497+31752], %f5159;
add.f32 %f5160, %f2168, %f5898;
st.global.f32 [%rd2497+31880], %f5160;
bra.uni $L__BB0_2044;
$L__BB0_2019:
mov.u32 %r8015, -12;
sub.s32 %r2571, %r8015, %r2546;
add.s32 %r8016, %r13, -15;
mov.u32 %r8017, -15;
setp.ge.s32 %p1770, %r8016, %r2571;
setp.lt.s32 %p1771, %r8016, %r2571;
shl.b32 %r8018, %r12, 7;
neg.s32 %r2572, %r8018;
setp.lt.s32 %p1772, %r14, %r2572;
and.pred %p1773, %p1771, %p1772;
add.f32 %f5145, %f2168, %f5905;
selp.f32 %f2217, %f5145, %f5926, %p1773;
mov.u32 %r8019, -32;
sub.s32 %r2573, %r8019, %r8018;
setp.lt.s32 %p1774, %r14, %r2573;
and.pred %p1775, %p1771, %p1774;
add.f32 %f5146, %f2168, %f5904;
selp.f32 %f2218, %f5146, %f5925, %p1775;
mov.u32 %r8020, -13;
sub.s32 %r2574, %r8020, %r2546;
setp.lt.s32 %p1776, %r8016, %r2574;
and.pred %p1777, %p1776, %p1772;
add.f32 %f5147, %f2168, %f5903;
selp.f32 %f2219, %f5147, %f5924, %p1777;
and.pred %p1778, %p1776, %p1774;
add.f32 %f5148, %f2168, %f5902;
selp.f32 %f2220, %f5148, %f5923, %p1778;
mov.u32 %r8021, -14;
sub.s32 %r2575, %r8021, %r2546;
setp.lt.s32 %p1779, %r8016, %r2575;
and.pred %p1780, %p1779, %p1772;
add.f32 %f5149, %f2168, %f5901;
selp.f32 %f2221, %f5149, %f5922, %p1780;
and.pred %p1781, %p1779, %p1774;
add.f32 %f5150, %f2168, %f5900;
selp.f32 %f2222, %f5150, %f5921, %p1781;
sub.s32 %r2576, %r8017, %r2546;
setp.lt.s32 %p1782, %r8016, %r2576;
and.pred %p1783, %p1782, %p1772;
add.f32 %f5151, %f2168, %f5899;
selp.f32 %f2223, %f5151, %f5920, %p1783;
and.pred %p1784, %p1782, %p1774;
add.f32 %f5152, %f2168, %f5898;
selp.f32 %f2224, %f5152, %f5919, %p1784;
mov.u32 %r8022, %ctaid.x;
mad.lo.s32 %r8023, %r8022, 63, %r1;
mad.lo.s32 %r8024, %r13, 2646, %r8023;
add.s32 %r8025, %r8024, %r8018;
add.s32 %r8026, %r8025, 31752;
mul.wide.s32 %rd2495, %r8026, 4;
add.s64 %rd607, %rd4, %rd2495;
@%p1770 bra $L__BB0_2022;
setp.ge.s32 %p1785, %r14, %r2572;
@%p1785 bra $L__BB0_2022;
st.global.f32 [%rd607], %f2217;
$L__BB0_2022:
@%p1770 bra $L__BB0_2025;
setp.ge.s32 %p1787, %r14, %r2573;
@%p1787 bra $L__BB0_2025;
st.global.f32 [%rd607+128], %f2218;
$L__BB0_2025:
setp.ge.s32 %p1788, %r8016, %r2574;
@%p1788 bra $L__BB0_2028;
setp.ge.s32 %p1789, %r14, %r2572;
@%p1789 bra $L__BB0_2028;
st.global.f32 [%rd607+10584], %f2219;
$L__BB0_2028:
@%p1788 bra $L__BB0_2031;
setp.ge.s32 %p1791, %r14, %r2573;
@%p1791 bra $L__BB0_2031;
st.global.f32 [%rd607+10712], %f2220;
$L__BB0_2031:
setp.ge.s32 %p1792, %r8016, %r2575;
@%p1792 bra $L__BB0_2034;
setp.ge.s32 %p1793, %r14, %r2572;
@%p1793 bra $L__BB0_2034;
st.global.f32 [%rd607+21168], %f2221;
$L__BB0_2034:
@%p1792 bra $L__BB0_2037;
setp.ge.s32 %p1795, %r14, %r2573;
@%p1795 bra $L__BB0_2037;
st.global.f32 [%rd607+21296], %f2222;
$L__BB0_2037:
setp.ge.s32 %p1796, %r8016, %r2576;
@%p1796 bra $L__BB0_2040;
setp.ge.s32 %p1797, %r14, %r2572;
@%p1797 bra $L__BB0_2040;
st.global.f32 [%rd607+31752], %f2223;
$L__BB0_2040:
@%p1796 bra $L__BB0_2044;
setp.ge.s32 %p1799, %r14, %r2573;
@%p1799 bra $L__BB0_2044;
st.global.f32 [%rd607+31880], %f2224;
$L__BB0_2044:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_160115arrayE[];
.global .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_2[32]
)
{
.local .align 4 .b8 __local_depot0[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<1789>;
.reg .f32 %f<6059>;
.reg .b32 %r<8482>;
.reg .f64 %fd<257>;
.reg .b64 %rd<2793>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s;
mov.u64 %SPL, __local_depot0;
ld.param.v2.u32 {%r2575, %r2576}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+24];
ld.param.v2.u32 {%r2577, %r2578}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+32];
ld.param.v2.u32 {%r2579, %r2580}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+40];
ld.param.v2.u32 {%r2585, %r2586}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+24];
ld.param.v2.u32 {%r2587, %r2588}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+32];
ld.param.v2.u32 {%r2589, %r2590}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+40];
ld.param.u64 %rd585, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd584, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0];
add.u64 %rd1, %SPL, 0;
cvta.to.global.u64 %rd2, %rd584;
cvta.to.global.u64 %rd3, %rd585;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p31, %r1, 0;
@%p31 bra $L__BB0_2;
mov.u32 %r2597, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r2597;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd588, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r2598, [%rd588], %r1;
add.s32 %r11, %r1, -63;
ld.shared.u32 %r12, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s];
setp.lt.s32 %p32, %r12, 3;
setp.lt.s32 %p33, %r1, 31;
and.pred %p34, %p33, %p32;
mov.u32 %r13, %ctaid.x;
mul.lo.s32 %r14, %r2588, %r13;
add.s64 %rd4, %rd1, 24;
@%p34 bra $L__BB0_274;
bra.uni $L__BB0_3;
$L__BB0_274:
shl.b32 %r3303, %r12, 5;
add.s32 %r3304, %r3303, %r1;
mul.hi.s32 %r3305, %r3304, -1840700269;
add.s32 %r3306, %r3305, %r3304;
shr.u32 %r3307, %r3306, 31;
shr.s32 %r3308, %r3306, 2;
add.s32 %r3309, %r3308, %r3307;
mad.lo.s32 %r3310, %r3309, %r2589, %r14;
mul.lo.s32 %r3311, %r3309, 7;
sub.s32 %r3312, %r3304, %r3311;
mad.lo.s32 %r3313, %r3312, %r2590, %r3310;
mul.wide.s32 %rd845, %r3313, 4;
add.s64 %rd846, %rd3, %rd845;
ld.global.f32 %f304, [%rd846];
add.s32 %r3314, %r3304, 32;
mul.hi.s32 %r3315, %r3314, -1840700269;
add.s32 %r3316, %r3315, %r3314;
shr.u32 %r3317, %r3316, 31;
shr.s32 %r3318, %r3316, 2;
add.s32 %r3319, %r3318, %r3317;
mad.lo.s32 %r3320, %r3319, %r2589, %r14;
mul.lo.s32 %r3321, %r3319, 7;
sub.s32 %r3322, %r3314, %r3321;
mad.lo.s32 %r3323, %r3322, %r2590, %r3320;
mul.wide.s32 %rd847, %r3323, 4;
add.s64 %rd848, %rd3, %rd847;
ld.global.f32 %f305, [%rd848];
mul.wide.s32 %rd849, %r2587, 4;
add.s64 %rd850, %rd846, %rd849;
ld.global.f32 %f306, [%rd850];
add.s64 %rd851, %rd848, %rd849;
ld.global.f32 %f307, [%rd851];
add.s64 %rd852, %rd850, %rd849;
ld.global.f32 %f308, [%rd852];
add.s64 %rd853, %rd851, %rd849;
ld.global.f32 %f309, [%rd853];
add.s64 %rd854, %rd852, %rd849;
ld.global.f32 %f310, [%rd854];
add.s64 %rd855, %rd853, %rd849;
ld.global.f32 %f311, [%rd855];
mul.hi.s32 %r3324, %r3304, 954437177;
shr.u32 %r3325, %r3324, 31;
shr.s32 %r3326, %r3324, 1;
add.s32 %r3327, %r3326, %r3325;
mul.lo.s32 %r3328, %r3327, %r2579;
mul.lo.s32 %r3329, %r2578, %r13;
add.s32 %r3330, %r3329, %r3328;
mul.lo.s32 %r3331, %r3327, 9;
sub.s32 %r3332, %r3304, %r3331;
mul.lo.s32 %r3333, %r3332, %r2580;
add.s32 %r3334, %r3330, %r3333;
mul.wide.s32 %rd856, %r3334, 4;
add.s64 %rd857, %rd2, %rd856;
ld.global.f32 %f312, [%rd857];
mul.hi.s32 %r3335, %r3314, 954437177;
shr.u32 %r3336, %r3335, 31;
shr.s32 %r3337, %r3335, 1;
add.s32 %r3338, %r3337, %r3336;
mul.lo.s32 %r3339, %r3338, %r2579;
add.s32 %r3340, %r3329, %r3339;
mul.lo.s32 %r3341, %r3338, 9;
sub.s32 %r3342, %r3314, %r3341;
mul.lo.s32 %r3343, %r3342, %r2580;
add.s32 %r3344, %r3340, %r3343;
mul.wide.s32 %rd858, %r3344, 4;
add.s64 %rd859, %rd2, %rd858;
ld.global.f32 %f313, [%rd859];
mul.wide.s32 %rd860, %r2577, 4;
add.s64 %rd861, %rd857, %rd860;
ld.global.f32 %f314, [%rd861];
add.s64 %rd862, %rd859, %rd860;
ld.global.f32 %f315, [%rd862];
add.s64 %rd863, %rd861, %rd860;
ld.global.f32 %f316, [%rd863];
add.s64 %rd864, %rd862, %rd860;
ld.global.f32 %f317, [%rd864];
add.s32 %r3345, %r2576, %r3329;
add.s32 %r3346, %r3345, %r3328;
add.s32 %r3347, %r3346, %r3333;
mul.wide.s32 %rd865, %r3347, 4;
add.s64 %rd866, %rd2, %rd865;
ld.global.f32 %f318, [%rd866];
add.s32 %r3348, %r3345, %r3339;
add.s32 %r3349, %r3348, %r3343;
mul.wide.s32 %rd867, %r3349, 4;
add.s64 %rd868, %rd2, %rd867;
ld.global.f32 %f319, [%rd868];
mul.f32 %f2666, %f312, 0f3F22F983;
cvt.rni.s32.f32 %r8034, %f2666;
cvt.rn.f32.s32 %f2667, %r8034;
mov.f32 %f2668, 0fBFC90FDA;
fma.rn.f32 %f2669, %f2667, %f2668, %f312;
mov.f32 %f2670, 0fB3A22168;
fma.rn.f32 %f2671, %f2667, %f2670, %f2669;
mov.f32 %f2672, 0fA7C234C5;
fma.rn.f32 %f5345, %f2667, %f2672, %f2671;
abs.f32 %f321, %f312;
setp.ltu.f32 %p257, %f321, 0f47CE4780;
@%p257 bra $L__BB0_282;
setp.eq.f32 %p258, %f321, 0f7F800000;
@%p258 bra $L__BB0_281;
bra.uni $L__BB0_276;
$L__BB0_281:
mov.f32 %f2675, 0f00000000;
mul.rn.f32 %f5345, %f312, %f2675;
mov.u32 %r8034, 0;
bra.uni $L__BB0_282;
$L__BB0_3:
setp.gt.s32 %p35, %r12, 14;
@%p35 bra $L__BB0_8;
shl.b32 %r15, %r12, 5;
neg.s32 %r2599, %r15;
setp.ge.s32 %p36, %r11, %r2599;
@%p36 bra $L__BB0_6;
add.s32 %r2600, %r15, %r1;
mul.hi.s32 %r2601, %r2600, -1840700269;
add.s32 %r2602, %r2601, %r2600;
shr.u32 %r2603, %r2602, 31;
shr.s32 %r2604, %r2602, 2;
add.s32 %r2605, %r2604, %r2603;
mad.lo.s32 %r2606, %r2605, %r2589, %r14;
mul.lo.s32 %r2607, %r2605, 7;
sub.s32 %r2608, %r2600, %r2607;
mad.lo.s32 %r2609, %r2608, %r2590, %r2606;
mul.wide.s32 %rd589, %r2609, 4;
add.s64 %rd590, %rd3, %rd589;
ld.global.f32 %f5607, [%rd590];
$L__BB0_6:
shl.b32 %r7788, %r12, 5;
mov.u32 %r2610, -32;
sub.s32 %r2611, %r2610, %r7788;
setp.ge.s32 %p37, %r11, %r2611;
@%p37 bra $L__BB0_8;
shl.b32 %r7790, %r12, 5;
add.s32 %r2612, %r7790, %r1;
add.s32 %r2613, %r2612, 32;
mul.hi.s32 %r2614, %r2613, -1840700269;
add.s32 %r2615, %r2614, %r2613;
shr.u32 %r2616, %r2615, 31;
shr.s32 %r2617, %r2615, 2;
add.s32 %r2618, %r2617, %r2616;
mad.lo.s32 %r2619, %r2618, %r2589, %r14;
mul.lo.s32 %r2620, %r2618, 7;
sub.s32 %r2621, %r2613, %r2620;
mad.lo.s32 %r2622, %r2621, %r2590, %r2619;
mul.wide.s32 %rd591, %r2622, 4;
add.s64 %rd592, %rd3, %rd591;
ld.global.f32 %f5606, [%rd592];
$L__BB0_8:
add.s32 %r16, %r12, 1;
setp.gt.s32 %p38, %r16, 14;
add.s32 %r17, %r14, %r2587;
@%p38 bra $L__BB0_13;
shl.b32 %r18, %r12, 5;
neg.s32 %r2623, %r18;
setp.ge.s32 %p39, %r11, %r2623;
@%p39 bra $L__BB0_11;
add.s32 %r2624, %r18, %r1;
mul.hi.s32 %r2625, %r2624, -1840700269;
add.s32 %r2626, %r2625, %r2624;
shr.u32 %r2627, %r2626, 31;
shr.s32 %r2628, %r2626, 2;
add.s32 %r2629, %r2628, %r2627;
mad.lo.s32 %r2630, %r2629, %r2589, %r17;
mul.lo.s32 %r2631, %r2629, 7;
sub.s32 %r2632, %r2624, %r2631;
mad.lo.s32 %r2633, %r2632, %r2590, %r2630;
mul.wide.s32 %rd593, %r2633, 4;
add.s64 %rd594, %rd3, %rd593;
ld.global.f32 %f5406, [%rd594];
$L__BB0_11:
shl.b32 %r7791, %r12, 5;
mov.u32 %r2634, -32;
sub.s32 %r2635, %r2634, %r7791;
setp.ge.s32 %p40, %r11, %r2635;
@%p40 bra $L__BB0_13;
shl.b32 %r7792, %r12, 5;
add.s32 %r2636, %r7792, %r1;
add.s32 %r2637, %r2636, 32;
mul.hi.s32 %r2638, %r2637, -1840700269;
add.s32 %r2639, %r2638, %r2637;
shr.u32 %r2640, %r2639, 31;
shr.s32 %r2641, %r2639, 2;
add.s32 %r2642, %r2641, %r2640;
mad.lo.s32 %r2643, %r2642, %r2589, %r17;
mul.lo.s32 %r2644, %r2642, 7;
sub.s32 %r2645, %r2637, %r2644;
mad.lo.s32 %r2646, %r2645, %r2590, %r2643;
mul.wide.s32 %rd595, %r2646, 4;
add.s64 %rd596, %rd3, %rd595;
ld.global.f32 %f5405, [%rd596];
$L__BB0_13:
add.s32 %r19, %r12, 2;
setp.gt.s32 %p41, %r19, 14;
add.s32 %r20, %r17, %r2587;
@%p41 bra $L__BB0_18;
shl.b32 %r21, %r12, 5;
neg.s32 %r2647, %r21;
setp.ge.s32 %p42, %r11, %r2647;
@%p42 bra $L__BB0_16;
add.s32 %r2648, %r21, %r1;
mul.hi.s32 %r2649, %r2648, -1840700269;
add.s32 %r2650, %r2649, %r2648;
shr.u32 %r2651, %r2650, 31;
shr.s32 %r2652, %r2650, 2;
add.s32 %r2653, %r2652, %r2651;
mad.lo.s32 %r2654, %r2653, %r2589, %r20;
mul.lo.s32 %r2655, %r2653, 7;
sub.s32 %r2656, %r2648, %r2655;
mad.lo.s32 %r2657, %r2656, %r2590, %r2654;
mul.wide.s32 %rd597, %r2657, 4;
add.s64 %rd598, %rd3, %rd597;
ld.global.f32 %f5404, [%rd598];
$L__BB0_16:
shl.b32 %r7793, %r12, 5;
mov.u32 %r2658, -32;
sub.s32 %r2659, %r2658, %r7793;
setp.ge.s32 %p43, %r11, %r2659;
@%p43 bra $L__BB0_18;
shl.b32 %r7795, %r12, 5;
add.s32 %r2660, %r7795, %r1;
add.s32 %r2661, %r2660, 32;
mul.hi.s32 %r2662, %r2661, -1840700269;
add.s32 %r2663, %r2662, %r2661;
shr.u32 %r2664, %r2663, 31;
shr.s32 %r2665, %r2663, 2;
add.s32 %r2666, %r2665, %r2664;
mad.lo.s32 %r2667, %r2666, %r2589, %r20;
mul.lo.s32 %r2668, %r2666, 7;
sub.s32 %r2669, %r2661, %r2668;
mad.lo.s32 %r2670, %r2669, %r2590, %r2667;
mul.wide.s32 %rd599, %r2670, 4;
add.s64 %rd600, %rd3, %rd599;
ld.global.f32 %f5403, [%rd600];
$L__BB0_18:
add.s32 %r22, %r12, 3;
setp.gt.s32 %p44, %r22, 14;
add.s32 %r23, %r20, %r2587;
@%p44 bra $L__BB0_23;
shl.b32 %r24, %r12, 5;
neg.s32 %r2671, %r24;
setp.ge.s32 %p45, %r11, %r2671;
@%p45 bra $L__BB0_21;
add.s32 %r2672, %r24, %r1;
mul.hi.s32 %r2673, %r2672, -1840700269;
add.s32 %r2674, %r2673, %r2672;
shr.u32 %r2675, %r2674, 31;
shr.s32 %r2676, %r2674, 2;
add.s32 %r2677, %r2676, %r2675;
mad.lo.s32 %r2678, %r2677, %r2589, %r23;
mul.lo.s32 %r2679, %r2677, 7;
sub.s32 %r2680, %r2672, %r2679;
mad.lo.s32 %r2681, %r2680, %r2590, %r2678;
mul.wide.s32 %rd601, %r2681, 4;
add.s64 %rd602, %rd3, %rd601;
ld.global.f32 %f5402, [%rd602];
$L__BB0_21:
shl.b32 %r7796, %r12, 5;
mov.u32 %r2682, -32;
sub.s32 %r2683, %r2682, %r7796;
setp.ge.s32 %p46, %r11, %r2683;
@%p46 bra $L__BB0_23;
shl.b32 %r7798, %r12, 5;
add.s32 %r2684, %r7798, %r1;
add.s32 %r2685, %r2684, 32;
mul.hi.s32 %r2686, %r2685, -1840700269;
add.s32 %r2687, %r2686, %r2685;
shr.u32 %r2688, %r2687, 31;
shr.s32 %r2689, %r2687, 2;
add.s32 %r2690, %r2689, %r2688;
mad.lo.s32 %r2691, %r2690, %r2589, %r23;
mul.lo.s32 %r2692, %r2690, 7;
sub.s32 %r2693, %r2685, %r2692;
mad.lo.s32 %r2694, %r2693, %r2590, %r2691;
mul.wide.s32 %rd603, %r2694, 4;
add.s64 %rd604, %rd3, %rd603;
ld.global.f32 %f5401, [%rd604];
$L__BB0_23:
setp.gt.s32 %p1767, %r12, 14;
@%p1767 bra $L__BB0_28;
shl.b32 %r26, %r12, 5;
neg.s32 %r2695, %r26;
setp.ge.s32 %p48, %r11, %r2695;
@%p48 bra $L__BB0_26;
mov.u32 %r7966, %ctaid.x;
mul.lo.s32 %r7965, %r2578, %r7966;
add.s32 %r2696, %r26, %r1;
mul.hi.s32 %r2697, %r2696, 954437177;
shr.u32 %r2698, %r2697, 31;
shr.s32 %r2699, %r2697, 1;
add.s32 %r2700, %r2699, %r2698;
mad.lo.s32 %r2701, %r2700, %r2579, %r7965;
mul.lo.s32 %r2702, %r2700, 9;
sub.s32 %r2703, %r2696, %r2702;
mad.lo.s32 %r2704, %r2703, %r2580, %r2701;
mul.wide.s32 %rd605, %r2704, 4;
add.s64 %rd606, %rd2, %rd605;
ld.global.f32 %f5416, [%rd606];
$L__BB0_26:
shl.b32 %r7799, %r12, 5;
mov.u32 %r2705, -32;
sub.s32 %r2706, %r2705, %r7799;
setp.ge.s32 %p49, %r11, %r2706;
@%p49 bra $L__BB0_28;
mov.u32 %r7942, %ctaid.x;
mul.lo.s32 %r7941, %r2578, %r7942;
shl.b32 %r7800, %r12, 5;
add.s32 %r2707, %r7800, %r1;
add.s32 %r2708, %r2707, 32;
mul.hi.s32 %r2709, %r2708, 954437177;
shr.u32 %r2710, %r2709, 31;
shr.s32 %r2711, %r2709, 1;
add.s32 %r2712, %r2711, %r2710;
mad.lo.s32 %r2713, %r2712, %r2579, %r7941;
mul.lo.s32 %r2714, %r2712, 9;
sub.s32 %r2715, %r2708, %r2714;
mad.lo.s32 %r2716, %r2715, %r2580, %r2713;
mul.wide.s32 %rd607, %r2716, 4;
add.s64 %rd608, %rd2, %rd607;
ld.global.f32 %f5415, [%rd608];
$L__BB0_28:
mov.u32 %r7940, %ctaid.x;
mul.lo.s32 %r7939, %r2578, %r7940;
add.s32 %r7938, %r12, 1;
setp.gt.s32 %p1785, %r7938, 14;
add.s32 %r27, %r7939, %r2577;
@%p1785 bra $L__BB0_33;
shl.b32 %r28, %r12, 5;
neg.s32 %r2717, %r28;
setp.ge.s32 %p51, %r11, %r2717;
@%p51 bra $L__BB0_31;
shl.b32 %r7807, %r12, 5;
add.s32 %r2718, %r7807, %r1;
mul.hi.s32 %r2719, %r2718, 954437177;
shr.u32 %r2720, %r2719, 31;
shr.s32 %r2721, %r2719, 1;
add.s32 %r2722, %r2721, %r2720;
mad.lo.s32 %r2723, %r2722, %r2579, %r27;
mul.lo.s32 %r2724, %r2722, 9;
sub.s32 %r2725, %r2718, %r2724;
mad.lo.s32 %r2726, %r2725, %r2580, %r2723;
mul.wide.s32 %rd609, %r2726, 4;
add.s64 %rd610, %rd2, %rd609;
ld.global.f32 %f5414, [%rd610];
$L__BB0_31:
shl.b32 %r7801, %r12, 5;
mov.u32 %r2727, -32;
sub.s32 %r2728, %r2727, %r7801;
setp.ge.s32 %p52, %r11, %r2728;
@%p52 bra $L__BB0_33;
shl.b32 %r7806, %r12, 5;
add.s32 %r2729, %r7806, %r1;
add.s32 %r2730, %r2729, 32;
mul.hi.s32 %r2731, %r2730, 954437177;
shr.u32 %r2732, %r2731, 31;
shr.s32 %r2733, %r2731, 1;
add.s32 %r2734, %r2733, %r2732;
mad.lo.s32 %r2735, %r2734, %r2579, %r27;
mul.lo.s32 %r2736, %r2734, 9;
sub.s32 %r2737, %r2730, %r2736;
mad.lo.s32 %r2738, %r2737, %r2580, %r2735;
mul.wide.s32 %rd611, %r2738, 4;
add.s64 %rd612, %rd2, %rd611;
ld.global.f32 %f5413, [%rd612];
$L__BB0_33:
add.s32 %r7794, %r12, 2;
setp.gt.s32 %p1769, %r7794, 14;
@%p1769 bra $L__BB0_38;
shl.b32 %r30, %r12, 5;
neg.s32 %r2739, %r30;
setp.ge.s32 %p54, %r11, %r2739;
@%p54 bra $L__BB0_36;
mov.u32 %r7946, %ctaid.x;
mul.lo.s32 %r7945, %r2578, %r7946;
add.s32 %r7944, %r7945, %r2577;
add.s32 %r7943, %r7944, %r2577;
shl.b32 %r7810, %r12, 5;
add.s32 %r2740, %r7810, %r1;
mul.hi.s32 %r2741, %r2740, 954437177;
shr.u32 %r2742, %r2741, 31;
shr.s32 %r2743, %r2741, 1;
add.s32 %r2744, %r2743, %r2742;
mad.lo.s32 %r2745, %r2744, %r2579, %r7943;
mul.lo.s32 %r2746, %r2744, 9;
sub.s32 %r2747, %r2740, %r2746;
mad.lo.s32 %r2748, %r2747, %r2580, %r2745;
mul.wide.s32 %rd613, %r2748, 4;
add.s64 %rd614, %rd2, %rd613;
ld.global.f32 %f5412, [%rd614];
$L__BB0_36:
shl.b32 %r7808, %r12, 5;
mov.u32 %r2749, -32;
sub.s32 %r2750, %r2749, %r7808;
setp.ge.s32 %p55, %r11, %r2750;
@%p55 bra $L__BB0_38;
mov.u32 %r7814, %ctaid.x;
mul.lo.s32 %r7813, %r2578, %r7814;
add.s32 %r7812, %r7813, %r2577;
add.s32 %r7811, %r7812, %r2577;
shl.b32 %r7809, %r12, 5;
add.s32 %r2751, %r7809, %r1;
add.s32 %r2752, %r2751, 32;
mul.hi.s32 %r2753, %r2752, 954437177;
shr.u32 %r2754, %r2753, 31;
shr.s32 %r2755, %r2753, 1;
add.s32 %r2756, %r2755, %r2754;
mad.lo.s32 %r2757, %r2756, %r2579, %r7811;
mul.lo.s32 %r2758, %r2756, 9;
sub.s32 %r2759, %r2752, %r2758;
mad.lo.s32 %r2760, %r2759, %r2580, %r2757;
mul.wide.s32 %rd615, %r2760, 4;
add.s64 %rd616, %rd2, %rd615;
ld.global.f32 %f5411, [%rd616];
$L__BB0_38:
add.s32 %r7797, %r12, 3;
setp.gt.s32 %p1770, %r7797, 14;
@%p1770 bra $L__BB0_43;
shl.b32 %r32, %r12, 5;
neg.s32 %r2761, %r32;
setp.ge.s32 %p57, %r11, %r2761;
@%p57 bra $L__BB0_41;
mov.u32 %r7949, %ctaid.x;
mul.lo.s32 %r7948, %r2578, %r7949;
add.s32 %r7947, %r2576, %r7948;
shl.b32 %r7817, %r12, 5;
add.s32 %r2762, %r7817, %r1;
mul.hi.s32 %r2763, %r2762, 954437177;
shr.u32 %r2764, %r2763, 31;
shr.s32 %r2765, %r2763, 1;
add.s32 %r2766, %r2765, %r2764;
mad.lo.s32 %r2767, %r2766, %r2579, %r7947;
mul.lo.s32 %r2768, %r2766, 9;
sub.s32 %r2769, %r2762, %r2768;
mad.lo.s32 %r2770, %r2769, %r2580, %r2767;
mul.wide.s32 %rd617, %r2770, 4;
add.s64 %rd618, %rd2, %rd617;
ld.global.f32 %f5410, [%rd618];
$L__BB0_41:
shl.b32 %r7815, %r12, 5;
mov.u32 %r2771, -32;
sub.s32 %r2772, %r2771, %r7815;
setp.ge.s32 %p58, %r11, %r2772;
@%p58 bra $L__BB0_43;
mov.u32 %r7820, %ctaid.x;
mul.lo.s32 %r7819, %r2578, %r7820;
add.s32 %r7818, %r2576, %r7819;
shl.b32 %r7816, %r12, 5;
add.s32 %r2773, %r7816, %r1;
add.s32 %r2774, %r2773, 32;
mul.hi.s32 %r2775, %r2774, 954437177;
shr.u32 %r2776, %r2775, 31;
shr.s32 %r2777, %r2775, 1;
add.s32 %r2778, %r2777, %r2776;
mad.lo.s32 %r2779, %r2778, %r2579, %r7818;
mul.lo.s32 %r2780, %r2778, 9;
sub.s32 %r2781, %r2774, %r2780;
mad.lo.s32 %r2782, %r2781, %r2580, %r2779;
mul.wide.s32 %rd619, %r2782, 4;
add.s64 %rd620, %rd2, %rd619;
ld.global.f32 %f5409, [%rd620];
$L__BB0_43:
setp.gt.s32 %p1768, %r12, 14;
@%p1768 bra $L__BB0_71;
shl.b32 %r2783, %r12, 5;
neg.s32 %r2784, %r2783;
setp.ge.s32 %p60, %r11, %r2784;
@%p60 bra $L__BB0_57;
mul.f32 %f2315, %f5416, 0f3F22F983;
cvt.rni.s32.f32 %r7970, %f2315;
cvt.rn.f32.s32 %f2316, %r7970;
mov.f32 %f2317, 0fBFC90FDA;
fma.rn.f32 %f2318, %f2316, %f2317, %f5416;
mov.f32 %f2319, 0fB3A22168;
fma.rn.f32 %f2320, %f2316, %f2319, %f2318;
mov.f32 %f2321, 0fA7C234C5;
fma.rn.f32 %f5248, %f2316, %f2321, %f2320;
abs.f32 %f42, %f5416;
setp.ltu.f32 %p61, %f42, 0f47CE4780;
@%p61 bra $L__BB0_53;
setp.eq.f32 %p62, %f42, 0f7F800000;
@%p62 bra $L__BB0_52;
bra.uni $L__BB0_47;
$L__BB0_52:
mov.f32 %f2324, 0f00000000;
mul.rn.f32 %f5248, %f5416, %f2324;
mov.u32 %r7970, 0;
bra.uni $L__BB0_53;
$L__BB0_276:
mov.b32 %r335, %f312;
shr.u32 %r3351, %r335, 23;
and.b32 %r3352, %r3351, 255;
add.s32 %r336, %r3352, -128;
shl.b32 %r3353, %r335, 8;
or.b32 %r337, %r3353, -2147483648;
shr.u32 %r338, %r336, 5;
mov.u64 %rd2537, 0;
mov.u32 %r8031, 0;
mov.u64 %rd872, __cudart_i2opi_f;
mov.u64 %rd2538, %rd2537;
$L__BB0_277:
.pragma "nounroll";
shl.b64 %rd871, %rd2537, 2;
add.s64 %rd873, %rd872, %rd871;
ld.global.nc.u32 %r3354, [%rd873];
mad.wide.u32 %rd874, %r3354, %r337, %rd2538;
shr.u64 %rd2538, %rd874, 32;
add.s64 %rd875, %rd1, %rd871;
st.local.u32 [%rd875], %rd874;
add.s32 %r8031, %r8031, 1;
cvt.s64.s32 %rd2537, %r8031;
setp.ne.s32 %p259, %r8031, 6;
@%p259 bra $L__BB0_277;
st.local.u32 [%rd4], %rd2538;
mov.u32 %r3355, 4;
sub.s32 %r341, %r3355, %r338;
mov.u32 %r3356, 6;
sub.s32 %r3357, %r3356, %r338;
mul.wide.s32 %rd876, %r3357, 4;
add.s64 %rd877, %rd1, %rd876;
ld.local.u32 %r8032, [%rd877];
ld.local.u32 %r8033, [%rd877+-4];
and.b32 %r344, %r336, 31;
setp.eq.s32 %p260, %r344, 0;
@%p260 bra $L__BB0_280;
mov.u32 %r3358, 32;
sub.s32 %r3359, %r3358, %r344;
shr.u32 %r3360, %r8033, %r3359;
shl.b32 %r3361, %r8032, %r344;
add.s32 %r8032, %r3360, %r3361;
mul.wide.s32 %rd878, %r341, 4;
add.s64 %rd879, %rd1, %rd878;
ld.local.u32 %r3362, [%rd879];
shr.u32 %r3363, %r3362, %r3359;
shl.b32 %r3364, %r8033, %r344;
add.s32 %r8033, %r3363, %r3364;
$L__BB0_280:
and.b32 %r3365, %r335, -2147483648;
shr.u32 %r3366, %r8033, 30;
shl.b32 %r3367, %r8032, 2;
or.b32 %r3368, %r3366, %r3367;
shr.u32 %r3369, %r3368, 31;
shr.u32 %r3370, %r8032, 30;
add.s32 %r3371, %r3369, %r3370;
neg.s32 %r3372, %r3371;
setp.eq.s32 %p261, %r3365, 0;
selp.b32 %r8034, %r3371, %r3372, %p261;
setp.ne.s32 %p262, %r3369, 0;
xor.b32 %r3373, %r3365, -2147483648;
selp.b32 %r3374, %r3373, %r3365, %p262;
selp.b32 %r3375, -1, 0, %p262;
xor.b32 %r3376, %r3368, %r3375;
shl.b32 %r3377, %r8033, 2;
xor.b32 %r3378, %r3377, %r3375;
cvt.u64.u32 %rd880, %r3376;
cvt.u64.u32 %rd881, %r3378;
bfi.b64 %rd882, %rd880, %rd881, 32, 32;
cvt.rn.f64.s64 %fd33, %rd882;
mul.f64 %fd34, %fd33, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2673, %fd34;
setp.eq.s32 %p263, %r3374, 0;
neg.f32 %f2674, %f2673;
selp.f32 %f5345, %f2673, %f2674, %p263;
$L__BB0_282:
and.b32 %r351, %r8034, 1;
setp.eq.s32 %p264, %r351, 0;
selp.f32 %f325, %f5345, 0f3F800000, %p264;
mul.rn.f32 %f326, %f5345, %f5345;
mov.f32 %f5346, 0fB94D4153;
@%p264 bra $L__BB0_284;
mov.f32 %f2677, 0fBAB607ED;
mov.f32 %f2678, 0f37CBAC00;
fma.rn.f32 %f5346, %f2678, %f326, %f2677;
$L__BB0_284:
selp.f32 %f2679, 0f3C0885E4, 0f3D2AAABB, %p264;
fma.rn.f32 %f2680, %f5346, %f326, %f2679;
selp.f32 %f2681, 0fBE2AAAA8, 0fBEFFFFFF, %p264;
fma.rn.f32 %f2682, %f2680, %f326, %f2681;
mov.f32 %f2683, 0f00000000;
fma.rn.f32 %f2684, %f326, %f325, %f2683;
fma.rn.f32 %f5347, %f2682, %f2684, %f325;
and.b32 %r3380, %r8034, 2;
setp.eq.s32 %p266, %r3380, 0;
@%p266 bra $L__BB0_286;
mov.f32 %f2686, 0fBF800000;
fma.rn.f32 %f5347, %f5347, %f2686, %f2683;
$L__BB0_286:
mul.f32 %f2687, %f304, 0f3F22F983;
cvt.rni.s32.f32 %r8038, %f2687;
cvt.rn.f32.s32 %f2688, %r8038;
mov.f32 %f2689, 0fBFC90FDA;
fma.rn.f32 %f2690, %f2688, %f2689, %f304;
mov.f32 %f2691, 0fB3A22168;
fma.rn.f32 %f2692, %f2688, %f2691, %f2690;
mov.f32 %f2693, 0fA7C234C5;
fma.rn.f32 %f5348, %f2688, %f2693, %f2692;
abs.f32 %f333, %f304;
setp.ltu.f32 %p267, %f333, 0f47CE4780;
@%p267 bra $L__BB0_294;
setp.eq.f32 %p268, %f333, 0f7F800000;
@%p268 bra $L__BB0_293;
bra.uni $L__BB0_288;
$L__BB0_293:
mov.f32 %f2696, 0f00000000;
mul.rn.f32 %f5348, %f304, %f2696;
mov.u32 %r8038, 0;
bra.uni $L__BB0_294;
$L__BB0_288:
mov.b32 %r353, %f304;
shr.u32 %r3382, %r353, 23;
and.b32 %r3383, %r3382, 255;
add.s32 %r354, %r3383, -128;
shl.b32 %r3384, %r353, 8;
or.b32 %r355, %r3384, -2147483648;
shr.u32 %r356, %r354, 5;
mov.u64 %rd2539, 0;
mov.u32 %r8035, 0;
mov.u64 %rd886, __cudart_i2opi_f;
mov.u64 %rd2540, %rd2539;
$L__BB0_289:
.pragma "nounroll";
shl.b64 %rd885, %rd2539, 2;
add.s64 %rd887, %rd886, %rd885;
ld.global.nc.u32 %r3385, [%rd887];
mad.wide.u32 %rd888, %r3385, %r355, %rd2540;
shr.u64 %rd2540, %rd888, 32;
add.s64 %rd889, %rd1, %rd885;
st.local.u32 [%rd889], %rd888;
add.s32 %r8035, %r8035, 1;
cvt.s64.s32 %rd2539, %r8035;
setp.ne.s32 %p269, %r8035, 6;
@%p269 bra $L__BB0_289;
st.local.u32 [%rd4], %rd2540;
mov.u32 %r3386, 4;
sub.s32 %r359, %r3386, %r356;
mov.u32 %r3387, 6;
sub.s32 %r3388, %r3387, %r356;
mul.wide.s32 %rd890, %r3388, 4;
add.s64 %rd891, %rd1, %rd890;
ld.local.u32 %r8036, [%rd891];
ld.local.u32 %r8037, [%rd891+-4];
and.b32 %r362, %r354, 31;
setp.eq.s32 %p270, %r362, 0;
@%p270 bra $L__BB0_292;
mov.u32 %r3389, 32;
sub.s32 %r3390, %r3389, %r362;
shr.u32 %r3391, %r8037, %r3390;
shl.b32 %r3392, %r8036, %r362;
add.s32 %r8036, %r3391, %r3392;
mul.wide.s32 %rd892, %r359, 4;
add.s64 %rd893, %rd1, %rd892;
ld.local.u32 %r3393, [%rd893];
shr.u32 %r3394, %r3393, %r3390;
shl.b32 %r3395, %r8037, %r362;
add.s32 %r8037, %r3394, %r3395;
$L__BB0_292:
and.b32 %r3396, %r353, -2147483648;
shr.u32 %r3397, %r8037, 30;
shl.b32 %r3398, %r8036, 2;
or.b32 %r3399, %r3397, %r3398;
shr.u32 %r3400, %r3399, 31;
shr.u32 %r3401, %r8036, 30;
add.s32 %r3402, %r3400, %r3401;
neg.s32 %r3403, %r3402;
setp.eq.s32 %p271, %r3396, 0;
selp.b32 %r8038, %r3402, %r3403, %p271;
setp.ne.s32 %p272, %r3400, 0;
xor.b32 %r3404, %r3396, -2147483648;
selp.b32 %r3405, %r3404, %r3396, %p272;
selp.b32 %r3406, -1, 0, %p272;
xor.b32 %r3407, %r3399, %r3406;
shl.b32 %r3408, %r8037, 2;
xor.b32 %r3409, %r3408, %r3406;
cvt.u64.u32 %rd894, %r3407;
cvt.u64.u32 %rd895, %r3409;
bfi.b64 %rd896, %rd894, %rd895, 32, 32;
cvt.rn.f64.s64 %fd35, %rd896;
mul.f64 %fd36, %fd35, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2694, %fd36;
setp.eq.s32 %p273, %r3405, 0;
neg.f32 %f2695, %f2694;
selp.f32 %f5348, %f2694, %f2695, %p273;
$L__BB0_294:
add.s32 %r369, %r8038, 1;
and.b32 %r370, %r369, 1;
setp.eq.s32 %p274, %r370, 0;
selp.f32 %f337, %f5348, 0f3F800000, %p274;
mul.rn.f32 %f338, %f5348, %f5348;
mov.f32 %f5349, 0fB94D4153;
@%p274 bra $L__BB0_296;
mov.f32 %f2698, 0fBAB607ED;
mov.f32 %f2699, 0f37CBAC00;
fma.rn.f32 %f5349, %f2699, %f338, %f2698;
$L__BB0_296:
selp.f32 %f2700, 0f3C0885E4, 0f3D2AAABB, %p274;
fma.rn.f32 %f2701, %f5349, %f338, %f2700;
selp.f32 %f2702, 0fBE2AAAA8, 0fBEFFFFFF, %p274;
fma.rn.f32 %f2703, %f2701, %f338, %f2702;
mov.f32 %f2704, 0f00000000;
fma.rn.f32 %f2705, %f338, %f337, %f2704;
fma.rn.f32 %f5350, %f2703, %f2705, %f337;
and.b32 %r3411, %r369, 2;
setp.eq.s32 %p276, %r3411, 0;
@%p276 bra $L__BB0_298;
mov.f32 %f2707, 0fBF800000;
fma.rn.f32 %f5350, %f5350, %f2707, %f2704;
$L__BB0_298:
add.f32 %f5400, %f5347, %f5350;
mul.f32 %f2708, %f313, 0f3F22F983;
cvt.rni.s32.f32 %r8042, %f2708;
cvt.rn.f32.s32 %f2709, %r8042;
mov.f32 %f2710, 0fBFC90FDA;
fma.rn.f32 %f2711, %f2709, %f2710, %f313;
mov.f32 %f2712, 0fB3A22168;
fma.rn.f32 %f2713, %f2709, %f2712, %f2711;
mov.f32 %f2714, 0fA7C234C5;
fma.rn.f32 %f5351, %f2709, %f2714, %f2713;
abs.f32 %f346, %f313;
setp.ltu.f32 %p277, %f346, 0f47CE4780;
@%p277 bra $L__BB0_306;
setp.eq.f32 %p278, %f346, 0f7F800000;
@%p278 bra $L__BB0_305;
bra.uni $L__BB0_300;
$L__BB0_305:
mov.f32 %f2717, 0f00000000;
mul.rn.f32 %f5351, %f313, %f2717;
mov.u32 %r8042, 0;
bra.uni $L__BB0_306;
$L__BB0_300:
mov.b32 %r372, %f313;
shr.u32 %r3413, %r372, 23;
and.b32 %r3414, %r3413, 255;
add.s32 %r373, %r3414, -128;
shl.b32 %r3415, %r372, 8;
or.b32 %r374, %r3415, -2147483648;
shr.u32 %r375, %r373, 5;
mov.u64 %rd2541, 0;
mov.u32 %r8039, 0;
mov.u64 %rd900, __cudart_i2opi_f;
mov.u64 %rd2542, %rd2541;
$L__BB0_301:
.pragma "nounroll";
shl.b64 %rd899, %rd2541, 2;
add.s64 %rd901, %rd900, %rd899;
ld.global.nc.u32 %r3416, [%rd901];
mad.wide.u32 %rd902, %r3416, %r374, %rd2542;
shr.u64 %rd2542, %rd902, 32;
add.s64 %rd903, %rd1, %rd899;
st.local.u32 [%rd903], %rd902;
add.s32 %r8039, %r8039, 1;
cvt.s64.s32 %rd2541, %r8039;
setp.ne.s32 %p279, %r8039, 6;
@%p279 bra $L__BB0_301;
st.local.u32 [%rd4], %rd2542;
mov.u32 %r3417, 4;
sub.s32 %r378, %r3417, %r375;
mov.u32 %r3418, 6;
sub.s32 %r3419, %r3418, %r375;
mul.wide.s32 %rd904, %r3419, 4;
add.s64 %rd905, %rd1, %rd904;
ld.local.u32 %r8040, [%rd905];
ld.local.u32 %r8041, [%rd905+-4];
and.b32 %r381, %r373, 31;
setp.eq.s32 %p280, %r381, 0;
@%p280 bra $L__BB0_304;
mov.u32 %r3420, 32;
sub.s32 %r3421, %r3420, %r381;
shr.u32 %r3422, %r8041, %r3421;
shl.b32 %r3423, %r8040, %r381;
add.s32 %r8040, %r3422, %r3423;
mul.wide.s32 %rd906, %r378, 4;
add.s64 %rd907, %rd1, %rd906;
ld.local.u32 %r3424, [%rd907];
shr.u32 %r3425, %r3424, %r3421;
shl.b32 %r3426, %r8041, %r381;
add.s32 %r8041, %r3425, %r3426;
$L__BB0_304:
and.b32 %r3427, %r372, -2147483648;
shr.u32 %r3428, %r8041, 30;
shl.b32 %r3429, %r8040, 2;
or.b32 %r3430, %r3428, %r3429;
shr.u32 %r3431, %r3430, 31;
shr.u32 %r3432, %r8040, 30;
add.s32 %r3433, %r3431, %r3432;
neg.s32 %r3434, %r3433;
setp.eq.s32 %p281, %r3427, 0;
selp.b32 %r8042, %r3433, %r3434, %p281;
setp.ne.s32 %p282, %r3431, 0;
xor.b32 %r3435, %r3427, -2147483648;
selp.b32 %r3436, %r3435, %r3427, %p282;
selp.b32 %r3437, -1, 0, %p282;
xor.b32 %r3438, %r3430, %r3437;
shl.b32 %r3439, %r8041, 2;
xor.b32 %r3440, %r3439, %r3437;
cvt.u64.u32 %rd908, %r3438;
cvt.u64.u32 %rd909, %r3440;
bfi.b64 %rd910, %rd908, %rd909, 32, 32;
cvt.rn.f64.s64 %fd37, %rd910;
mul.f64 %fd38, %fd37, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2715, %fd38;
setp.eq.s32 %p283, %r3436, 0;
neg.f32 %f2716, %f2715;
selp.f32 %f5351, %f2715, %f2716, %p283;
$L__BB0_306:
and.b32 %r388, %r8042, 1;
setp.eq.s32 %p284, %r388, 0;
selp.f32 %f350, %f5351, 0f3F800000, %p284;
mul.rn.f32 %f351, %f5351, %f5351;
mov.f32 %f5352, 0fB94D4153;
@%p284 bra $L__BB0_308;
mov.f32 %f2719, 0fBAB607ED;
mov.f32 %f2720, 0f37CBAC00;
fma.rn.f32 %f5352, %f2720, %f351, %f2719;
$L__BB0_308:
selp.f32 %f2721, 0f3C0885E4, 0f3D2AAABB, %p284;
fma.rn.f32 %f2722, %f5352, %f351, %f2721;
selp.f32 %f2723, 0fBE2AAAA8, 0fBEFFFFFF, %p284;
fma.rn.f32 %f2724, %f2722, %f351, %f2723;
mov.f32 %f2725, 0f00000000;
fma.rn.f32 %f2726, %f351, %f350, %f2725;
fma.rn.f32 %f5353, %f2724, %f2726, %f350;
and.b32 %r3442, %r8042, 2;
setp.eq.s32 %p286, %r3442, 0;
@%p286 bra $L__BB0_310;
mov.f32 %f2728, 0fBF800000;
fma.rn.f32 %f5353, %f5353, %f2728, %f2725;
$L__BB0_310:
mul.f32 %f2729, %f305, 0f3F22F983;
cvt.rni.s32.f32 %r8046, %f2729;
cvt.rn.f32.s32 %f2730, %r8046;
mov.f32 %f2731, 0fBFC90FDA;
fma.rn.f32 %f2732, %f2730, %f2731, %f305;
mov.f32 %f2733, 0fB3A22168;
fma.rn.f32 %f2734, %f2730, %f2733, %f2732;
mov.f32 %f2735, 0fA7C234C5;
fma.rn.f32 %f5354, %f2730, %f2735, %f2734;
abs.f32 %f358, %f305;
setp.ltu.f32 %p287, %f358, 0f47CE4780;
@%p287 bra $L__BB0_318;
setp.eq.f32 %p288, %f358, 0f7F800000;
@%p288 bra $L__BB0_317;
bra.uni $L__BB0_312;
$L__BB0_317:
mov.f32 %f2738, 0f00000000;
mul.rn.f32 %f5354, %f305, %f2738;
mov.u32 %r8046, 0;
bra.uni $L__BB0_318;
$L__BB0_312:
mov.b32 %r390, %f305;
shr.u32 %r3444, %r390, 23;
and.b32 %r3445, %r3444, 255;
add.s32 %r391, %r3445, -128;
shl.b32 %r3446, %r390, 8;
or.b32 %r392, %r3446, -2147483648;
shr.u32 %r393, %r391, 5;
mov.u64 %rd2543, 0;
mov.u32 %r8043, 0;
mov.u64 %rd914, __cudart_i2opi_f;
mov.u64 %rd2544, %rd2543;
$L__BB0_313:
.pragma "nounroll";
shl.b64 %rd913, %rd2543, 2;
add.s64 %rd915, %rd914, %rd913;
ld.global.nc.u32 %r3447, [%rd915];
mad.wide.u32 %rd916, %r3447, %r392, %rd2544;
shr.u64 %rd2544, %rd916, 32;
add.s64 %rd917, %rd1, %rd913;
st.local.u32 [%rd917], %rd916;
add.s32 %r8043, %r8043, 1;
cvt.s64.s32 %rd2543, %r8043;
setp.ne.s32 %p289, %r8043, 6;
@%p289 bra $L__BB0_313;
st.local.u32 [%rd4], %rd2544;
mov.u32 %r3448, 4;
sub.s32 %r396, %r3448, %r393;
mov.u32 %r3449, 6;
sub.s32 %r3450, %r3449, %r393;
mul.wide.s32 %rd918, %r3450, 4;
add.s64 %rd919, %rd1, %rd918;
ld.local.u32 %r8044, [%rd919];
ld.local.u32 %r8045, [%rd919+-4];
and.b32 %r399, %r391, 31;
setp.eq.s32 %p290, %r399, 0;
@%p290 bra $L__BB0_316;
mov.u32 %r3451, 32;
sub.s32 %r3452, %r3451, %r399;
shr.u32 %r3453, %r8045, %r3452;
shl.b32 %r3454, %r8044, %r399;
add.s32 %r8044, %r3453, %r3454;
mul.wide.s32 %rd920, %r396, 4;
add.s64 %rd921, %rd1, %rd920;
ld.local.u32 %r3455, [%rd921];
shr.u32 %r3456, %r3455, %r3452;
shl.b32 %r3457, %r8045, %r399;
add.s32 %r8045, %r3456, %r3457;
$L__BB0_316:
and.b32 %r3458, %r390, -2147483648;
shr.u32 %r3459, %r8045, 30;
shl.b32 %r3460, %r8044, 2;
or.b32 %r3461, %r3459, %r3460;
shr.u32 %r3462, %r3461, 31;
shr.u32 %r3463, %r8044, 30;
add.s32 %r3464, %r3462, %r3463;
neg.s32 %r3465, %r3464;
setp.eq.s32 %p291, %r3458, 0;
selp.b32 %r8046, %r3464, %r3465, %p291;
setp.ne.s32 %p292, %r3462, 0;
xor.b32 %r3466, %r3458, -2147483648;
selp.b32 %r3467, %r3466, %r3458, %p292;
selp.b32 %r3468, -1, 0, %p292;
xor.b32 %r3469, %r3461, %r3468;
shl.b32 %r3470, %r8045, 2;
xor.b32 %r3471, %r3470, %r3468;
cvt.u64.u32 %rd922, %r3469;
cvt.u64.u32 %rd923, %r3471;
bfi.b64 %rd924, %rd922, %rd923, 32, 32;
cvt.rn.f64.s64 %fd39, %rd924;
mul.f64 %fd40, %fd39, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2736, %fd40;
setp.eq.s32 %p293, %r3467, 0;
neg.f32 %f2737, %f2736;
selp.f32 %f5354, %f2736, %f2737, %p293;
$L__BB0_318:
add.s32 %r406, %r8046, 1;
and.b32 %r407, %r406, 1;
setp.eq.s32 %p294, %r407, 0;
selp.f32 %f362, %f5354, 0f3F800000, %p294;
mul.rn.f32 %f363, %f5354, %f5354;
mov.f32 %f5355, 0fB94D4153;
@%p294 bra $L__BB0_320;
mov.f32 %f2740, 0fBAB607ED;
mov.f32 %f2741, 0f37CBAC00;
fma.rn.f32 %f5355, %f2741, %f363, %f2740;
$L__BB0_320:
selp.f32 %f2742, 0f3C0885E4, 0f3D2AAABB, %p294;
fma.rn.f32 %f2743, %f5355, %f363, %f2742;
selp.f32 %f2744, 0fBE2AAAA8, 0fBEFFFFFF, %p294;
fma.rn.f32 %f2745, %f2743, %f363, %f2744;
mov.f32 %f2746, 0f00000000;
fma.rn.f32 %f2747, %f363, %f362, %f2746;
fma.rn.f32 %f5356, %f2745, %f2747, %f362;
and.b32 %r3473, %r406, 2;
setp.eq.s32 %p296, %r3473, 0;
@%p296 bra $L__BB0_322;
mov.f32 %f2749, 0fBF800000;
fma.rn.f32 %f5356, %f5356, %f2749, %f2746;
$L__BB0_322:
add.f32 %f5399, %f5353, %f5356;
mul.f32 %f2750, %f314, 0f3F22F983;
cvt.rni.s32.f32 %r8050, %f2750;
cvt.rn.f32.s32 %f2751, %r8050;
mov.f32 %f2752, 0fBFC90FDA;
fma.rn.f32 %f2753, %f2751, %f2752, %f314;
mov.f32 %f2754, 0fB3A22168;
fma.rn.f32 %f2755, %f2751, %f2754, %f2753;
mov.f32 %f2756, 0fA7C234C5;
fma.rn.f32 %f5357, %f2751, %f2756, %f2755;
abs.f32 %f371, %f314;
setp.ltu.f32 %p297, %f371, 0f47CE4780;
@%p297 bra $L__BB0_330;
setp.eq.f32 %p298, %f371, 0f7F800000;
@%p298 bra $L__BB0_329;
bra.uni $L__BB0_324;
$L__BB0_329:
mov.f32 %f2759, 0f00000000;
mul.rn.f32 %f5357, %f314, %f2759;
mov.u32 %r8050, 0;
bra.uni $L__BB0_330;
$L__BB0_324:
mov.b32 %r409, %f314;
shr.u32 %r3475, %r409, 23;
and.b32 %r3476, %r3475, 255;
add.s32 %r410, %r3476, -128;
shl.b32 %r3477, %r409, 8;
or.b32 %r411, %r3477, -2147483648;
shr.u32 %r412, %r410, 5;
mov.u64 %rd2545, 0;
mov.u32 %r8047, 0;
mov.u64 %rd928, __cudart_i2opi_f;
mov.u64 %rd2546, %rd2545;
$L__BB0_325:
.pragma "nounroll";
shl.b64 %rd927, %rd2545, 2;
add.s64 %rd929, %rd928, %rd927;
ld.global.nc.u32 %r3478, [%rd929];
mad.wide.u32 %rd930, %r3478, %r411, %rd2546;
shr.u64 %rd2546, %rd930, 32;
add.s64 %rd931, %rd1, %rd927;
st.local.u32 [%rd931], %rd930;
add.s32 %r8047, %r8047, 1;
cvt.s64.s32 %rd2545, %r8047;
setp.ne.s32 %p299, %r8047, 6;
@%p299 bra $L__BB0_325;
st.local.u32 [%rd4], %rd2546;
mov.u32 %r3479, 4;
sub.s32 %r415, %r3479, %r412;
mov.u32 %r3480, 6;
sub.s32 %r3481, %r3480, %r412;
mul.wide.s32 %rd932, %r3481, 4;
add.s64 %rd933, %rd1, %rd932;
ld.local.u32 %r8048, [%rd933];
ld.local.u32 %r8049, [%rd933+-4];
and.b32 %r418, %r410, 31;
setp.eq.s32 %p300, %r418, 0;
@%p300 bra $L__BB0_328;
mov.u32 %r3482, 32;
sub.s32 %r3483, %r3482, %r418;
shr.u32 %r3484, %r8049, %r3483;
shl.b32 %r3485, %r8048, %r418;
add.s32 %r8048, %r3484, %r3485;
mul.wide.s32 %rd934, %r415, 4;
add.s64 %rd935, %rd1, %rd934;
ld.local.u32 %r3486, [%rd935];
shr.u32 %r3487, %r3486, %r3483;
shl.b32 %r3488, %r8049, %r418;
add.s32 %r8049, %r3487, %r3488;
$L__BB0_328:
and.b32 %r3489, %r409, -2147483648;
shr.u32 %r3490, %r8049, 30;
shl.b32 %r3491, %r8048, 2;
or.b32 %r3492, %r3490, %r3491;
shr.u32 %r3493, %r3492, 31;
shr.u32 %r3494, %r8048, 30;
add.s32 %r3495, %r3493, %r3494;
neg.s32 %r3496, %r3495;
setp.eq.s32 %p301, %r3489, 0;
selp.b32 %r8050, %r3495, %r3496, %p301;
setp.ne.s32 %p302, %r3493, 0;
xor.b32 %r3497, %r3489, -2147483648;
selp.b32 %r3498, %r3497, %r3489, %p302;
selp.b32 %r3499, -1, 0, %p302;
xor.b32 %r3500, %r3492, %r3499;
shl.b32 %r3501, %r8049, 2;
xor.b32 %r3502, %r3501, %r3499;
cvt.u64.u32 %rd936, %r3500;
cvt.u64.u32 %rd937, %r3502;
bfi.b64 %rd938, %rd936, %rd937, 32, 32;
cvt.rn.f64.s64 %fd41, %rd938;
mul.f64 %fd42, %fd41, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2757, %fd42;
setp.eq.s32 %p303, %r3498, 0;
neg.f32 %f2758, %f2757;
selp.f32 %f5357, %f2757, %f2758, %p303;
$L__BB0_330:
and.b32 %r425, %r8050, 1;
setp.eq.s32 %p304, %r425, 0;
selp.f32 %f375, %f5357, 0f3F800000, %p304;
mul.rn.f32 %f376, %f5357, %f5357;
mov.f32 %f5358, 0fB94D4153;
@%p304 bra $L__BB0_332;
mov.f32 %f2761, 0fBAB607ED;
mov.f32 %f2762, 0f37CBAC00;
fma.rn.f32 %f5358, %f2762, %f376, %f2761;
$L__BB0_332:
selp.f32 %f2763, 0f3C0885E4, 0f3D2AAABB, %p304;
fma.rn.f32 %f2764, %f5358, %f376, %f2763;
selp.f32 %f2765, 0fBE2AAAA8, 0fBEFFFFFF, %p304;
fma.rn.f32 %f2766, %f2764, %f376, %f2765;
mov.f32 %f2767, 0f00000000;
fma.rn.f32 %f2768, %f376, %f375, %f2767;
fma.rn.f32 %f5359, %f2766, %f2768, %f375;
and.b32 %r3504, %r8050, 2;
setp.eq.s32 %p306, %r3504, 0;
@%p306 bra $L__BB0_334;
mov.f32 %f2770, 0fBF800000;
fma.rn.f32 %f5359, %f5359, %f2770, %f2767;
$L__BB0_334:
mul.f32 %f2771, %f306, 0f3F22F983;
cvt.rni.s32.f32 %r8054, %f2771;
cvt.rn.f32.s32 %f2772, %r8054;
mov.f32 %f2773, 0fBFC90FDA;
fma.rn.f32 %f2774, %f2772, %f2773, %f306;
mov.f32 %f2775, 0fB3A22168;
fma.rn.f32 %f2776, %f2772, %f2775, %f2774;
mov.f32 %f2777, 0fA7C234C5;
fma.rn.f32 %f5360, %f2772, %f2777, %f2776;
abs.f32 %f383, %f306;
setp.ltu.f32 %p307, %f383, 0f47CE4780;
@%p307 bra $L__BB0_342;
setp.eq.f32 %p308, %f383, 0f7F800000;
@%p308 bra $L__BB0_341;
bra.uni $L__BB0_336;
$L__BB0_341:
mov.f32 %f2780, 0f00000000;
mul.rn.f32 %f5360, %f306, %f2780;
mov.u32 %r8054, 0;
bra.uni $L__BB0_342;
$L__BB0_336:
mov.b32 %r427, %f306;
shr.u32 %r3506, %r427, 23;
and.b32 %r3507, %r3506, 255;
add.s32 %r428, %r3507, -128;
shl.b32 %r3508, %r427, 8;
or.b32 %r429, %r3508, -2147483648;
shr.u32 %r430, %r428, 5;
mov.u64 %rd2547, 0;
mov.u32 %r8051, 0;
mov.u64 %rd942, __cudart_i2opi_f;
mov.u64 %rd2548, %rd2547;
$L__BB0_337:
.pragma "nounroll";
shl.b64 %rd941, %rd2547, 2;
add.s64 %rd943, %rd942, %rd941;
ld.global.nc.u32 %r3509, [%rd943];
mad.wide.u32 %rd944, %r3509, %r429, %rd2548;
shr.u64 %rd2548, %rd944, 32;
add.s64 %rd945, %rd1, %rd941;
st.local.u32 [%rd945], %rd944;
add.s32 %r8051, %r8051, 1;
cvt.s64.s32 %rd2547, %r8051;
setp.ne.s32 %p309, %r8051, 6;
@%p309 bra $L__BB0_337;
st.local.u32 [%rd4], %rd2548;
mov.u32 %r3510, 4;
sub.s32 %r433, %r3510, %r430;
mov.u32 %r3511, 6;
sub.s32 %r3512, %r3511, %r430;
mul.wide.s32 %rd946, %r3512, 4;
add.s64 %rd947, %rd1, %rd946;
ld.local.u32 %r8052, [%rd947];
ld.local.u32 %r8053, [%rd947+-4];
and.b32 %r436, %r428, 31;
setp.eq.s32 %p310, %r436, 0;
@%p310 bra $L__BB0_340;
mov.u32 %r3513, 32;
sub.s32 %r3514, %r3513, %r436;
shr.u32 %r3515, %r8053, %r3514;
shl.b32 %r3516, %r8052, %r436;
add.s32 %r8052, %r3515, %r3516;
mul.wide.s32 %rd948, %r433, 4;
add.s64 %rd949, %rd1, %rd948;
ld.local.u32 %r3517, [%rd949];
shr.u32 %r3518, %r3517, %r3514;
shl.b32 %r3519, %r8053, %r436;
add.s32 %r8053, %r3518, %r3519;
$L__BB0_340:
and.b32 %r3520, %r427, -2147483648;
shr.u32 %r3521, %r8053, 30;
shl.b32 %r3522, %r8052, 2;
or.b32 %r3523, %r3521, %r3522;
shr.u32 %r3524, %r3523, 31;
shr.u32 %r3525, %r8052, 30;
add.s32 %r3526, %r3524, %r3525;
neg.s32 %r3527, %r3526;
setp.eq.s32 %p311, %r3520, 0;
selp.b32 %r8054, %r3526, %r3527, %p311;
setp.ne.s32 %p312, %r3524, 0;
xor.b32 %r3528, %r3520, -2147483648;
selp.b32 %r3529, %r3528, %r3520, %p312;
selp.b32 %r3530, -1, 0, %p312;
xor.b32 %r3531, %r3523, %r3530;
shl.b32 %r3532, %r8053, 2;
xor.b32 %r3533, %r3532, %r3530;
cvt.u64.u32 %rd950, %r3531;
cvt.u64.u32 %rd951, %r3533;
bfi.b64 %rd952, %rd950, %rd951, 32, 32;
cvt.rn.f64.s64 %fd43, %rd952;
mul.f64 %fd44, %fd43, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2778, %fd44;
setp.eq.s32 %p313, %r3529, 0;
neg.f32 %f2779, %f2778;
selp.f32 %f5360, %f2778, %f2779, %p313;
$L__BB0_342:
add.s32 %r443, %r8054, 1;
and.b32 %r444, %r443, 1;
setp.eq.s32 %p314, %r444, 0;
selp.f32 %f387, %f5360, 0f3F800000, %p314;
mul.rn.f32 %f388, %f5360, %f5360;
mov.f32 %f5361, 0fB94D4153;
@%p314 bra $L__BB0_344;
mov.f32 %f2782, 0fBAB607ED;
mov.f32 %f2783, 0f37CBAC00;
fma.rn.f32 %f5361, %f2783, %f388, %f2782;
$L__BB0_344:
selp.f32 %f2784, 0f3C0885E4, 0f3D2AAABB, %p314;
fma.rn.f32 %f2785, %f5361, %f388, %f2784;
selp.f32 %f2786, 0fBE2AAAA8, 0fBEFFFFFF, %p314;
fma.rn.f32 %f2787, %f2785, %f388, %f2786;
mov.f32 %f2788, 0f00000000;
fma.rn.f32 %f2789, %f388, %f387, %f2788;
fma.rn.f32 %f5362, %f2787, %f2789, %f387;
and.b32 %r3535, %r443, 2;
setp.eq.s32 %p316, %r3535, 0;
@%p316 bra $L__BB0_346;
mov.f32 %f2791, 0fBF800000;
fma.rn.f32 %f5362, %f5362, %f2791, %f2788;
$L__BB0_346:
add.f32 %f5398, %f5359, %f5362;
mul.f32 %f2792, %f315, 0f3F22F983;
cvt.rni.s32.f32 %r8058, %f2792;
cvt.rn.f32.s32 %f2793, %r8058;
mov.f32 %f2794, 0fBFC90FDA;
fma.rn.f32 %f2795, %f2793, %f2794, %f315;
mov.f32 %f2796, 0fB3A22168;
fma.rn.f32 %f2797, %f2793, %f2796, %f2795;
mov.f32 %f2798, 0fA7C234C5;
fma.rn.f32 %f5363, %f2793, %f2798, %f2797;
abs.f32 %f396, %f315;
setp.ltu.f32 %p317, %f396, 0f47CE4780;
@%p317 bra $L__BB0_354;
setp.eq.f32 %p318, %f396, 0f7F800000;
@%p318 bra $L__BB0_353;
bra.uni $L__BB0_348;
$L__BB0_353:
mov.f32 %f2801, 0f00000000;
mul.rn.f32 %f5363, %f315, %f2801;
mov.u32 %r8058, 0;
bra.uni $L__BB0_354;
$L__BB0_348:
mov.b32 %r446, %f315;
shr.u32 %r3537, %r446, 23;
and.b32 %r3538, %r3537, 255;
add.s32 %r447, %r3538, -128;
shl.b32 %r3539, %r446, 8;
or.b32 %r448, %r3539, -2147483648;
shr.u32 %r449, %r447, 5;
mov.u64 %rd2549, 0;
mov.u32 %r8055, 0;
mov.u64 %rd956, __cudart_i2opi_f;
mov.u64 %rd2550, %rd2549;
$L__BB0_349:
.pragma "nounroll";
shl.b64 %rd955, %rd2549, 2;
add.s64 %rd957, %rd956, %rd955;
ld.global.nc.u32 %r3540, [%rd957];
mad.wide.u32 %rd958, %r3540, %r448, %rd2550;
shr.u64 %rd2550, %rd958, 32;
add.s64 %rd959, %rd1, %rd955;
st.local.u32 [%rd959], %rd958;
add.s32 %r8055, %r8055, 1;
cvt.s64.s32 %rd2549, %r8055;
setp.ne.s32 %p319, %r8055, 6;
@%p319 bra $L__BB0_349;
st.local.u32 [%rd4], %rd2550;
mov.u32 %r3541, 4;
sub.s32 %r452, %r3541, %r449;
mov.u32 %r3542, 6;
sub.s32 %r3543, %r3542, %r449;
mul.wide.s32 %rd960, %r3543, 4;
add.s64 %rd961, %rd1, %rd960;
ld.local.u32 %r8056, [%rd961];
ld.local.u32 %r8057, [%rd961+-4];
and.b32 %r455, %r447, 31;
setp.eq.s32 %p320, %r455, 0;
@%p320 bra $L__BB0_352;
mov.u32 %r3544, 32;
sub.s32 %r3545, %r3544, %r455;
shr.u32 %r3546, %r8057, %r3545;
shl.b32 %r3547, %r8056, %r455;
add.s32 %r8056, %r3546, %r3547;
mul.wide.s32 %rd962, %r452, 4;
add.s64 %rd963, %rd1, %rd962;
ld.local.u32 %r3548, [%rd963];
shr.u32 %r3549, %r3548, %r3545;
shl.b32 %r3550, %r8057, %r455;
add.s32 %r8057, %r3549, %r3550;
$L__BB0_352:
and.b32 %r3551, %r446, -2147483648;
shr.u32 %r3552, %r8057, 30;
shl.b32 %r3553, %r8056, 2;
or.b32 %r3554, %r3552, %r3553;
shr.u32 %r3555, %r3554, 31;
shr.u32 %r3556, %r8056, 30;
add.s32 %r3557, %r3555, %r3556;
neg.s32 %r3558, %r3557;
setp.eq.s32 %p321, %r3551, 0;
selp.b32 %r8058, %r3557, %r3558, %p321;
setp.ne.s32 %p322, %r3555, 0;
xor.b32 %r3559, %r3551, -2147483648;
selp.b32 %r3560, %r3559, %r3551, %p322;
selp.b32 %r3561, -1, 0, %p322;
xor.b32 %r3562, %r3554, %r3561;
shl.b32 %r3563, %r8057, 2;
xor.b32 %r3564, %r3563, %r3561;
cvt.u64.u32 %rd964, %r3562;
cvt.u64.u32 %rd965, %r3564;
bfi.b64 %rd966, %rd964, %rd965, 32, 32;
cvt.rn.f64.s64 %fd45, %rd966;
mul.f64 %fd46, %fd45, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2799, %fd46;
setp.eq.s32 %p323, %r3560, 0;
neg.f32 %f2800, %f2799;
selp.f32 %f5363, %f2799, %f2800, %p323;
$L__BB0_354:
and.b32 %r462, %r8058, 1;
setp.eq.s32 %p324, %r462, 0;
selp.f32 %f400, %f5363, 0f3F800000, %p324;
mul.rn.f32 %f401, %f5363, %f5363;
mov.f32 %f5364, 0fB94D4153;
@%p324 bra $L__BB0_356;
mov.f32 %f2803, 0fBAB607ED;
mov.f32 %f2804, 0f37CBAC00;
fma.rn.f32 %f5364, %f2804, %f401, %f2803;
$L__BB0_356:
selp.f32 %f2805, 0f3C0885E4, 0f3D2AAABB, %p324;
fma.rn.f32 %f2806, %f5364, %f401, %f2805;
selp.f32 %f2807, 0fBE2AAAA8, 0fBEFFFFFF, %p324;
fma.rn.f32 %f2808, %f2806, %f401, %f2807;
mov.f32 %f2809, 0f00000000;
fma.rn.f32 %f2810, %f401, %f400, %f2809;
fma.rn.f32 %f5365, %f2808, %f2810, %f400;
and.b32 %r3566, %r8058, 2;
setp.eq.s32 %p326, %r3566, 0;
@%p326 bra $L__BB0_358;
mov.f32 %f2812, 0fBF800000;
fma.rn.f32 %f5365, %f5365, %f2812, %f2809;
$L__BB0_358:
mul.f32 %f2813, %f307, 0f3F22F983;
cvt.rni.s32.f32 %r8062, %f2813;
cvt.rn.f32.s32 %f2814, %r8062;
mov.f32 %f2815, 0fBFC90FDA;
fma.rn.f32 %f2816, %f2814, %f2815, %f307;
mov.f32 %f2817, 0fB3A22168;
fma.rn.f32 %f2818, %f2814, %f2817, %f2816;
mov.f32 %f2819, 0fA7C234C5;
fma.rn.f32 %f5366, %f2814, %f2819, %f2818;
abs.f32 %f408, %f307;
setp.ltu.f32 %p327, %f408, 0f47CE4780;
@%p327 bra $L__BB0_366;
setp.eq.f32 %p328, %f408, 0f7F800000;
@%p328 bra $L__BB0_365;
bra.uni $L__BB0_360;
$L__BB0_365:
mov.f32 %f2822, 0f00000000;
mul.rn.f32 %f5366, %f307, %f2822;
mov.u32 %r8062, 0;
bra.uni $L__BB0_366;
$L__BB0_360:
mov.b32 %r464, %f307;
shr.u32 %r3568, %r464, 23;
and.b32 %r3569, %r3568, 255;
add.s32 %r465, %r3569, -128;
shl.b32 %r3570, %r464, 8;
or.b32 %r466, %r3570, -2147483648;
shr.u32 %r467, %r465, 5;
mov.u64 %rd2551, 0;
mov.u32 %r8059, 0;
mov.u64 %rd970, __cudart_i2opi_f;
mov.u64 %rd2552, %rd2551;
$L__BB0_361:
.pragma "nounroll";
shl.b64 %rd969, %rd2551, 2;
add.s64 %rd971, %rd970, %rd969;
ld.global.nc.u32 %r3571, [%rd971];
mad.wide.u32 %rd972, %r3571, %r466, %rd2552;
shr.u64 %rd2552, %rd972, 32;
add.s64 %rd973, %rd1, %rd969;
st.local.u32 [%rd973], %rd972;
add.s32 %r8059, %r8059, 1;
cvt.s64.s32 %rd2551, %r8059;
setp.ne.s32 %p329, %r8059, 6;
@%p329 bra $L__BB0_361;
st.local.u32 [%rd4], %rd2552;
mov.u32 %r3572, 4;
sub.s32 %r470, %r3572, %r467;
mov.u32 %r3573, 6;
sub.s32 %r3574, %r3573, %r467;
mul.wide.s32 %rd974, %r3574, 4;
add.s64 %rd975, %rd1, %rd974;
ld.local.u32 %r8060, [%rd975];
ld.local.u32 %r8061, [%rd975+-4];
and.b32 %r473, %r465, 31;
setp.eq.s32 %p330, %r473, 0;
@%p330 bra $L__BB0_364;
mov.u32 %r3575, 32;
sub.s32 %r3576, %r3575, %r473;
shr.u32 %r3577, %r8061, %r3576;
shl.b32 %r3578, %r8060, %r473;
add.s32 %r8060, %r3577, %r3578;
mul.wide.s32 %rd976, %r470, 4;
add.s64 %rd977, %rd1, %rd976;
ld.local.u32 %r3579, [%rd977];
shr.u32 %r3580, %r3579, %r3576;
shl.b32 %r3581, %r8061, %r473;
add.s32 %r8061, %r3580, %r3581;
$L__BB0_364:
and.b32 %r3582, %r464, -2147483648;
shr.u32 %r3583, %r8061, 30;
shl.b32 %r3584, %r8060, 2;
or.b32 %r3585, %r3583, %r3584;
shr.u32 %r3586, %r3585, 31;
shr.u32 %r3587, %r8060, 30;
add.s32 %r3588, %r3586, %r3587;
neg.s32 %r3589, %r3588;
setp.eq.s32 %p331, %r3582, 0;
selp.b32 %r8062, %r3588, %r3589, %p331;
setp.ne.s32 %p332, %r3586, 0;
xor.b32 %r3590, %r3582, -2147483648;
selp.b32 %r3591, %r3590, %r3582, %p332;
selp.b32 %r3592, -1, 0, %p332;
xor.b32 %r3593, %r3585, %r3592;
shl.b32 %r3594, %r8061, 2;
xor.b32 %r3595, %r3594, %r3592;
cvt.u64.u32 %rd978, %r3593;
cvt.u64.u32 %rd979, %r3595;
bfi.b64 %rd980, %rd978, %rd979, 32, 32;
cvt.rn.f64.s64 %fd47, %rd980;
mul.f64 %fd48, %fd47, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2820, %fd48;
setp.eq.s32 %p333, %r3591, 0;
neg.f32 %f2821, %f2820;
selp.f32 %f5366, %f2820, %f2821, %p333;
$L__BB0_366:
add.s32 %r480, %r8062, 1;
and.b32 %r481, %r480, 1;
setp.eq.s32 %p334, %r481, 0;
selp.f32 %f412, %f5366, 0f3F800000, %p334;
mul.rn.f32 %f413, %f5366, %f5366;
mov.f32 %f5367, 0fB94D4153;
@%p334 bra $L__BB0_368;
mov.f32 %f2824, 0fBAB607ED;
mov.f32 %f2825, 0f37CBAC00;
fma.rn.f32 %f5367, %f2825, %f413, %f2824;
$L__BB0_368:
selp.f32 %f2826, 0f3C0885E4, 0f3D2AAABB, %p334;
fma.rn.f32 %f2827, %f5367, %f413, %f2826;
selp.f32 %f2828, 0fBE2AAAA8, 0fBEFFFFFF, %p334;
fma.rn.f32 %f2829, %f2827, %f413, %f2828;
mov.f32 %f2830, 0f00000000;
fma.rn.f32 %f2831, %f413, %f412, %f2830;
fma.rn.f32 %f5368, %f2829, %f2831, %f412;
and.b32 %r3597, %r480, 2;
setp.eq.s32 %p336, %r3597, 0;
@%p336 bra $L__BB0_370;
mov.f32 %f2833, 0fBF800000;
fma.rn.f32 %f5368, %f5368, %f2833, %f2830;
$L__BB0_370:
add.f32 %f5397, %f5365, %f5368;
mul.f32 %f2834, %f316, 0f3F22F983;
cvt.rni.s32.f32 %r8066, %f2834;
cvt.rn.f32.s32 %f2835, %r8066;
mov.f32 %f2836, 0fBFC90FDA;
fma.rn.f32 %f2837, %f2835, %f2836, %f316;
mov.f32 %f2838, 0fB3A22168;
fma.rn.f32 %f2839, %f2835, %f2838, %f2837;
mov.f32 %f2840, 0fA7C234C5;
fma.rn.f32 %f5369, %f2835, %f2840, %f2839;
abs.f32 %f421, %f316;
setp.ltu.f32 %p337, %f421, 0f47CE4780;
@%p337 bra $L__BB0_378;
setp.eq.f32 %p338, %f421, 0f7F800000;
@%p338 bra $L__BB0_377;
bra.uni $L__BB0_372;
$L__BB0_377:
mov.f32 %f2843, 0f00000000;
mul.rn.f32 %f5369, %f316, %f2843;
mov.u32 %r8066, 0;
bra.uni $L__BB0_378;
$L__BB0_372:
mov.b32 %r483, %f316;
shr.u32 %r3599, %r483, 23;
and.b32 %r3600, %r3599, 255;
add.s32 %r484, %r3600, -128;
shl.b32 %r3601, %r483, 8;
or.b32 %r485, %r3601, -2147483648;
shr.u32 %r486, %r484, 5;
mov.u64 %rd2553, 0;
mov.u32 %r8063, 0;
mov.u64 %rd984, __cudart_i2opi_f;
mov.u64 %rd2554, %rd2553;
$L__BB0_373:
.pragma "nounroll";
shl.b64 %rd983, %rd2553, 2;
add.s64 %rd985, %rd984, %rd983;
ld.global.nc.u32 %r3602, [%rd985];
mad.wide.u32 %rd986, %r3602, %r485, %rd2554;
shr.u64 %rd2554, %rd986, 32;
add.s64 %rd987, %rd1, %rd983;
st.local.u32 [%rd987], %rd986;
add.s32 %r8063, %r8063, 1;
cvt.s64.s32 %rd2553, %r8063;
setp.ne.s32 %p339, %r8063, 6;
@%p339 bra $L__BB0_373;
st.local.u32 [%rd4], %rd2554;
mov.u32 %r3603, 4;
sub.s32 %r489, %r3603, %r486;
mov.u32 %r3604, 6;
sub.s32 %r3605, %r3604, %r486;
mul.wide.s32 %rd988, %r3605, 4;
add.s64 %rd989, %rd1, %rd988;
ld.local.u32 %r8064, [%rd989];
ld.local.u32 %r8065, [%rd989+-4];
and.b32 %r492, %r484, 31;
setp.eq.s32 %p340, %r492, 0;
@%p340 bra $L__BB0_376;
mov.u32 %r3606, 32;
sub.s32 %r3607, %r3606, %r492;
shr.u32 %r3608, %r8065, %r3607;
shl.b32 %r3609, %r8064, %r492;
add.s32 %r8064, %r3608, %r3609;
mul.wide.s32 %rd990, %r489, 4;
add.s64 %rd991, %rd1, %rd990;
ld.local.u32 %r3610, [%rd991];
shr.u32 %r3611, %r3610, %r3607;
shl.b32 %r3612, %r8065, %r492;
add.s32 %r8065, %r3611, %r3612;
$L__BB0_376:
and.b32 %r3613, %r483, -2147483648;
shr.u32 %r3614, %r8065, 30;
shl.b32 %r3615, %r8064, 2;
or.b32 %r3616, %r3614, %r3615;
shr.u32 %r3617, %r3616, 31;
shr.u32 %r3618, %r8064, 30;
add.s32 %r3619, %r3617, %r3618;
neg.s32 %r3620, %r3619;
setp.eq.s32 %p341, %r3613, 0;
selp.b32 %r8066, %r3619, %r3620, %p341;
setp.ne.s32 %p342, %r3617, 0;
xor.b32 %r3621, %r3613, -2147483648;
selp.b32 %r3622, %r3621, %r3613, %p342;
selp.b32 %r3623, -1, 0, %p342;
xor.b32 %r3624, %r3616, %r3623;
shl.b32 %r3625, %r8065, 2;
xor.b32 %r3626, %r3625, %r3623;
cvt.u64.u32 %rd992, %r3624;
cvt.u64.u32 %rd993, %r3626;
bfi.b64 %rd994, %rd992, %rd993, 32, 32;
cvt.rn.f64.s64 %fd49, %rd994;
mul.f64 %fd50, %fd49, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2841, %fd50;
setp.eq.s32 %p343, %r3622, 0;
neg.f32 %f2842, %f2841;
selp.f32 %f5369, %f2841, %f2842, %p343;
$L__BB0_378:
and.b32 %r499, %r8066, 1;
setp.eq.s32 %p344, %r499, 0;
selp.f32 %f425, %f5369, 0f3F800000, %p344;
mul.rn.f32 %f426, %f5369, %f5369;
mov.f32 %f5370, 0fB94D4153;
@%p344 bra $L__BB0_380;
mov.f32 %f2845, 0fBAB607ED;
mov.f32 %f2846, 0f37CBAC00;
fma.rn.f32 %f5370, %f2846, %f426, %f2845;
$L__BB0_380:
selp.f32 %f2847, 0f3C0885E4, 0f3D2AAABB, %p344;
fma.rn.f32 %f2848, %f5370, %f426, %f2847;
selp.f32 %f2849, 0fBE2AAAA8, 0fBEFFFFFF, %p344;
fma.rn.f32 %f2850, %f2848, %f426, %f2849;
mov.f32 %f2851, 0f00000000;
fma.rn.f32 %f2852, %f426, %f425, %f2851;
fma.rn.f32 %f5371, %f2850, %f2852, %f425;
and.b32 %r3628, %r8066, 2;
setp.eq.s32 %p346, %r3628, 0;
@%p346 bra $L__BB0_382;
mov.f32 %f2854, 0fBF800000;
fma.rn.f32 %f5371, %f5371, %f2854, %f2851;
$L__BB0_382:
mul.f32 %f2855, %f308, 0f3F22F983;
cvt.rni.s32.f32 %r8070, %f2855;
cvt.rn.f32.s32 %f2856, %r8070;
mov.f32 %f2857, 0fBFC90FDA;
fma.rn.f32 %f2858, %f2856, %f2857, %f308;
mov.f32 %f2859, 0fB3A22168;
fma.rn.f32 %f2860, %f2856, %f2859, %f2858;
mov.f32 %f2861, 0fA7C234C5;
fma.rn.f32 %f5372, %f2856, %f2861, %f2860;
abs.f32 %f433, %f308;
setp.ltu.f32 %p347, %f433, 0f47CE4780;
@%p347 bra $L__BB0_390;
setp.eq.f32 %p348, %f433, 0f7F800000;
@%p348 bra $L__BB0_389;
bra.uni $L__BB0_384;
$L__BB0_389:
mov.f32 %f2864, 0f00000000;
mul.rn.f32 %f5372, %f308, %f2864;
mov.u32 %r8070, 0;
bra.uni $L__BB0_390;
$L__BB0_384:
mov.b32 %r501, %f308;
shr.u32 %r3630, %r501, 23;
and.b32 %r3631, %r3630, 255;
add.s32 %r502, %r3631, -128;
shl.b32 %r3632, %r501, 8;
or.b32 %r503, %r3632, -2147483648;
shr.u32 %r504, %r502, 5;
mov.u64 %rd2555, 0;
mov.u32 %r8067, 0;
mov.u64 %rd998, __cudart_i2opi_f;
mov.u64 %rd2556, %rd2555;
$L__BB0_385:
.pragma "nounroll";
shl.b64 %rd997, %rd2555, 2;
add.s64 %rd999, %rd998, %rd997;
ld.global.nc.u32 %r3633, [%rd999];
mad.wide.u32 %rd1000, %r3633, %r503, %rd2556;
shr.u64 %rd2556, %rd1000, 32;
add.s64 %rd1001, %rd1, %rd997;
st.local.u32 [%rd1001], %rd1000;
add.s32 %r8067, %r8067, 1;
cvt.s64.s32 %rd2555, %r8067;
setp.ne.s32 %p349, %r8067, 6;
@%p349 bra $L__BB0_385;
st.local.u32 [%rd4], %rd2556;
mov.u32 %r3634, 4;
sub.s32 %r507, %r3634, %r504;
mov.u32 %r3635, 6;
sub.s32 %r3636, %r3635, %r504;
mul.wide.s32 %rd1002, %r3636, 4;
add.s64 %rd1003, %rd1, %rd1002;
ld.local.u32 %r8068, [%rd1003];
ld.local.u32 %r8069, [%rd1003+-4];
and.b32 %r510, %r502, 31;
setp.eq.s32 %p350, %r510, 0;
@%p350 bra $L__BB0_388;
mov.u32 %r3637, 32;
sub.s32 %r3638, %r3637, %r510;
shr.u32 %r3639, %r8069, %r3638;
shl.b32 %r3640, %r8068, %r510;
add.s32 %r8068, %r3639, %r3640;
mul.wide.s32 %rd1004, %r507, 4;
add.s64 %rd1005, %rd1, %rd1004;
ld.local.u32 %r3641, [%rd1005];
shr.u32 %r3642, %r3641, %r3638;
shl.b32 %r3643, %r8069, %r510;
add.s32 %r8069, %r3642, %r3643;
$L__BB0_388:
and.b32 %r3644, %r501, -2147483648;
shr.u32 %r3645, %r8069, 30;
shl.b32 %r3646, %r8068, 2;
or.b32 %r3647, %r3645, %r3646;
shr.u32 %r3648, %r3647, 31;
shr.u32 %r3649, %r8068, 30;
add.s32 %r3650, %r3648, %r3649;
neg.s32 %r3651, %r3650;
setp.eq.s32 %p351, %r3644, 0;
selp.b32 %r8070, %r3650, %r3651, %p351;
setp.ne.s32 %p352, %r3648, 0;
xor.b32 %r3652, %r3644, -2147483648;
selp.b32 %r3653, %r3652, %r3644, %p352;
selp.b32 %r3654, -1, 0, %p352;
xor.b32 %r3655, %r3647, %r3654;
shl.b32 %r3656, %r8069, 2;
xor.b32 %r3657, %r3656, %r3654;
cvt.u64.u32 %rd1006, %r3655;
cvt.u64.u32 %rd1007, %r3657;
bfi.b64 %rd1008, %rd1006, %rd1007, 32, 32;
cvt.rn.f64.s64 %fd51, %rd1008;
mul.f64 %fd52, %fd51, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2862, %fd52;
setp.eq.s32 %p353, %r3653, 0;
neg.f32 %f2863, %f2862;
selp.f32 %f5372, %f2862, %f2863, %p353;
$L__BB0_390:
add.s32 %r517, %r8070, 1;
and.b32 %r518, %r517, 1;
setp.eq.s32 %p354, %r518, 0;
selp.f32 %f437, %f5372, 0f3F800000, %p354;
mul.rn.f32 %f438, %f5372, %f5372;
mov.f32 %f5373, 0fB94D4153;
@%p354 bra $L__BB0_392;
mov.f32 %f2866, 0fBAB607ED;
mov.f32 %f2867, 0f37CBAC00;
fma.rn.f32 %f5373, %f2867, %f438, %f2866;
$L__BB0_392:
selp.f32 %f2868, 0f3C0885E4, 0f3D2AAABB, %p354;
fma.rn.f32 %f2869, %f5373, %f438, %f2868;
selp.f32 %f2870, 0fBE2AAAA8, 0fBEFFFFFF, %p354;
fma.rn.f32 %f2871, %f2869, %f438, %f2870;
mov.f32 %f2872, 0f00000000;
fma.rn.f32 %f2873, %f438, %f437, %f2872;
fma.rn.f32 %f5374, %f2871, %f2873, %f437;
and.b32 %r3659, %r517, 2;
setp.eq.s32 %p356, %r3659, 0;
@%p356 bra $L__BB0_394;
mov.f32 %f2875, 0fBF800000;
fma.rn.f32 %f5374, %f5374, %f2875, %f2872;
$L__BB0_394:
add.f32 %f5396, %f5371, %f5374;
mul.f32 %f2876, %f317, 0f3F22F983;
cvt.rni.s32.f32 %r8074, %f2876;
cvt.rn.f32.s32 %f2877, %r8074;
mov.f32 %f2878, 0fBFC90FDA;
fma.rn.f32 %f2879, %f2877, %f2878, %f317;
mov.f32 %f2880, 0fB3A22168;
fma.rn.f32 %f2881, %f2877, %f2880, %f2879;
mov.f32 %f2882, 0fA7C234C5;
fma.rn.f32 %f5375, %f2877, %f2882, %f2881;
abs.f32 %f446, %f317;
setp.ltu.f32 %p357, %f446, 0f47CE4780;
@%p357 bra $L__BB0_402;
setp.eq.f32 %p358, %f446, 0f7F800000;
@%p358 bra $L__BB0_401;
bra.uni $L__BB0_396;
$L__BB0_401:
mov.f32 %f2885, 0f00000000;
mul.rn.f32 %f5375, %f317, %f2885;
mov.u32 %r8074, 0;
bra.uni $L__BB0_402;
$L__BB0_396:
mov.b32 %r520, %f317;
shr.u32 %r3661, %r520, 23;
and.b32 %r3662, %r3661, 255;
add.s32 %r521, %r3662, -128;
shl.b32 %r3663, %r520, 8;
or.b32 %r522, %r3663, -2147483648;
shr.u32 %r523, %r521, 5;
mov.u64 %rd2557, 0;
mov.u32 %r8071, 0;
mov.u64 %rd1012, __cudart_i2opi_f;
mov.u64 %rd2558, %rd2557;
$L__BB0_397:
.pragma "nounroll";
shl.b64 %rd1011, %rd2557, 2;
add.s64 %rd1013, %rd1012, %rd1011;
ld.global.nc.u32 %r3664, [%rd1013];
mad.wide.u32 %rd1014, %r3664, %r522, %rd2558;
shr.u64 %rd2558, %rd1014, 32;
add.s64 %rd1015, %rd1, %rd1011;
st.local.u32 [%rd1015], %rd1014;
add.s32 %r8071, %r8071, 1;
cvt.s64.s32 %rd2557, %r8071;
setp.ne.s32 %p359, %r8071, 6;
@%p359 bra $L__BB0_397;
st.local.u32 [%rd4], %rd2558;
mov.u32 %r3665, 4;
sub.s32 %r526, %r3665, %r523;
mov.u32 %r3666, 6;
sub.s32 %r3667, %r3666, %r523;
mul.wide.s32 %rd1016, %r3667, 4;
add.s64 %rd1017, %rd1, %rd1016;
ld.local.u32 %r8072, [%rd1017];
ld.local.u32 %r8073, [%rd1017+-4];
and.b32 %r529, %r521, 31;
setp.eq.s32 %p360, %r529, 0;
@%p360 bra $L__BB0_400;
mov.u32 %r3668, 32;
sub.s32 %r3669, %r3668, %r529;
shr.u32 %r3670, %r8073, %r3669;
shl.b32 %r3671, %r8072, %r529;
add.s32 %r8072, %r3670, %r3671;
mul.wide.s32 %rd1018, %r526, 4;
add.s64 %rd1019, %rd1, %rd1018;
ld.local.u32 %r3672, [%rd1019];
shr.u32 %r3673, %r3672, %r3669;
shl.b32 %r3674, %r8073, %r529;
add.s32 %r8073, %r3673, %r3674;
$L__BB0_400:
and.b32 %r3675, %r520, -2147483648;
shr.u32 %r3676, %r8073, 30;
shl.b32 %r3677, %r8072, 2;
or.b32 %r3678, %r3676, %r3677;
shr.u32 %r3679, %r3678, 31;
shr.u32 %r3680, %r8072, 30;
add.s32 %r3681, %r3679, %r3680;
neg.s32 %r3682, %r3681;
setp.eq.s32 %p361, %r3675, 0;
selp.b32 %r8074, %r3681, %r3682, %p361;
setp.ne.s32 %p362, %r3679, 0;
xor.b32 %r3683, %r3675, -2147483648;
selp.b32 %r3684, %r3683, %r3675, %p362;
selp.b32 %r3685, -1, 0, %p362;
xor.b32 %r3686, %r3678, %r3685;
shl.b32 %r3687, %r8073, 2;
xor.b32 %r3688, %r3687, %r3685;
cvt.u64.u32 %rd1020, %r3686;
cvt.u64.u32 %rd1021, %r3688;
bfi.b64 %rd1022, %rd1020, %rd1021, 32, 32;
cvt.rn.f64.s64 %fd53, %rd1022;
mul.f64 %fd54, %fd53, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2883, %fd54;
setp.eq.s32 %p363, %r3684, 0;
neg.f32 %f2884, %f2883;
selp.f32 %f5375, %f2883, %f2884, %p363;
$L__BB0_402:
and.b32 %r536, %r8074, 1;
setp.eq.s32 %p364, %r536, 0;
selp.f32 %f450, %f5375, 0f3F800000, %p364;
mul.rn.f32 %f451, %f5375, %f5375;
mov.f32 %f5376, 0fB94D4153;
@%p364 bra $L__BB0_404;
mov.f32 %f2887, 0fBAB607ED;
mov.f32 %f2888, 0f37CBAC00;
fma.rn.f32 %f5376, %f2888, %f451, %f2887;
$L__BB0_404:
selp.f32 %f2889, 0f3C0885E4, 0f3D2AAABB, %p364;
fma.rn.f32 %f2890, %f5376, %f451, %f2889;
selp.f32 %f2891, 0fBE2AAAA8, 0fBEFFFFFF, %p364;
fma.rn.f32 %f2892, %f2890, %f451, %f2891;
mov.f32 %f2893, 0f00000000;
fma.rn.f32 %f2894, %f451, %f450, %f2893;
fma.rn.f32 %f5377, %f2892, %f2894, %f450;
and.b32 %r3690, %r8074, 2;
setp.eq.s32 %p366, %r3690, 0;
@%p366 bra $L__BB0_406;
mov.f32 %f2896, 0fBF800000;
fma.rn.f32 %f5377, %f5377, %f2896, %f2893;
$L__BB0_406:
mul.f32 %f2897, %f309, 0f3F22F983;
cvt.rni.s32.f32 %r8078, %f2897;
cvt.rn.f32.s32 %f2898, %r8078;
mov.f32 %f2899, 0fBFC90FDA;
fma.rn.f32 %f2900, %f2898, %f2899, %f309;
mov.f32 %f2901, 0fB3A22168;
fma.rn.f32 %f2902, %f2898, %f2901, %f2900;
mov.f32 %f2903, 0fA7C234C5;
fma.rn.f32 %f5378, %f2898, %f2903, %f2902;
abs.f32 %f458, %f309;
setp.ltu.f32 %p367, %f458, 0f47CE4780;
@%p367 bra $L__BB0_414;
setp.eq.f32 %p368, %f458, 0f7F800000;
@%p368 bra $L__BB0_413;
bra.uni $L__BB0_408;
$L__BB0_413:
mov.f32 %f2906, 0f00000000;
mul.rn.f32 %f5378, %f309, %f2906;
mov.u32 %r8078, 0;
bra.uni $L__BB0_414;
$L__BB0_408:
mov.b32 %r538, %f309;
shr.u32 %r3692, %r538, 23;
and.b32 %r3693, %r3692, 255;
add.s32 %r539, %r3693, -128;
shl.b32 %r3694, %r538, 8;
or.b32 %r540, %r3694, -2147483648;
shr.u32 %r541, %r539, 5;
mov.u64 %rd2559, 0;
mov.u32 %r8075, 0;
mov.u64 %rd1026, __cudart_i2opi_f;
mov.u64 %rd2560, %rd2559;
$L__BB0_409:
.pragma "nounroll";
shl.b64 %rd1025, %rd2559, 2;
add.s64 %rd1027, %rd1026, %rd1025;
ld.global.nc.u32 %r3695, [%rd1027];
mad.wide.u32 %rd1028, %r3695, %r540, %rd2560;
shr.u64 %rd2560, %rd1028, 32;
add.s64 %rd1029, %rd1, %rd1025;
st.local.u32 [%rd1029], %rd1028;
add.s32 %r8075, %r8075, 1;
cvt.s64.s32 %rd2559, %r8075;
setp.ne.s32 %p369, %r8075, 6;
@%p369 bra $L__BB0_409;
st.local.u32 [%rd4], %rd2560;
mov.u32 %r3696, 4;
sub.s32 %r544, %r3696, %r541;
mov.u32 %r3697, 6;
sub.s32 %r3698, %r3697, %r541;
mul.wide.s32 %rd1030, %r3698, 4;
add.s64 %rd1031, %rd1, %rd1030;
ld.local.u32 %r8076, [%rd1031];
ld.local.u32 %r8077, [%rd1031+-4];
and.b32 %r547, %r539, 31;
setp.eq.s32 %p370, %r547, 0;
@%p370 bra $L__BB0_412;
mov.u32 %r3699, 32;
sub.s32 %r3700, %r3699, %r547;
shr.u32 %r3701, %r8077, %r3700;
shl.b32 %r3702, %r8076, %r547;
add.s32 %r8076, %r3701, %r3702;
mul.wide.s32 %rd1032, %r544, 4;
add.s64 %rd1033, %rd1, %rd1032;
ld.local.u32 %r3703, [%rd1033];
shr.u32 %r3704, %r3703, %r3700;
shl.b32 %r3705, %r8077, %r547;
add.s32 %r8077, %r3704, %r3705;
$L__BB0_412:
and.b32 %r3706, %r538, -2147483648;
shr.u32 %r3707, %r8077, 30;
shl.b32 %r3708, %r8076, 2;
or.b32 %r3709, %r3707, %r3708;
shr.u32 %r3710, %r3709, 31;
shr.u32 %r3711, %r8076, 30;
add.s32 %r3712, %r3710, %r3711;
neg.s32 %r3713, %r3712;
setp.eq.s32 %p371, %r3706, 0;
selp.b32 %r8078, %r3712, %r3713, %p371;
setp.ne.s32 %p372, %r3710, 0;
xor.b32 %r3714, %r3706, -2147483648;
selp.b32 %r3715, %r3714, %r3706, %p372;
selp.b32 %r3716, -1, 0, %p372;
xor.b32 %r3717, %r3709, %r3716;
shl.b32 %r3718, %r8077, 2;
xor.b32 %r3719, %r3718, %r3716;
cvt.u64.u32 %rd1034, %r3717;
cvt.u64.u32 %rd1035, %r3719;
bfi.b64 %rd1036, %rd1034, %rd1035, 32, 32;
cvt.rn.f64.s64 %fd55, %rd1036;
mul.f64 %fd56, %fd55, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2904, %fd56;
setp.eq.s32 %p373, %r3715, 0;
neg.f32 %f2905, %f2904;
selp.f32 %f5378, %f2904, %f2905, %p373;
$L__BB0_414:
add.s32 %r554, %r8078, 1;
and.b32 %r555, %r554, 1;
setp.eq.s32 %p374, %r555, 0;
selp.f32 %f462, %f5378, 0f3F800000, %p374;
mul.rn.f32 %f463, %f5378, %f5378;
mov.f32 %f5379, 0fB94D4153;
@%p374 bra $L__BB0_416;
mov.f32 %f2908, 0fBAB607ED;
mov.f32 %f2909, 0f37CBAC00;
fma.rn.f32 %f5379, %f2909, %f463, %f2908;
$L__BB0_416:
selp.f32 %f2910, 0f3C0885E4, 0f3D2AAABB, %p374;
fma.rn.f32 %f2911, %f5379, %f463, %f2910;
selp.f32 %f2912, 0fBE2AAAA8, 0fBEFFFFFF, %p374;
fma.rn.f32 %f2913, %f2911, %f463, %f2912;
mov.f32 %f2914, 0f00000000;
fma.rn.f32 %f2915, %f463, %f462, %f2914;
fma.rn.f32 %f5380, %f2913, %f2915, %f462;
and.b32 %r3721, %r554, 2;
setp.eq.s32 %p376, %r3721, 0;
@%p376 bra $L__BB0_418;
mov.f32 %f2917, 0fBF800000;
fma.rn.f32 %f5380, %f5380, %f2917, %f2914;
$L__BB0_418:
add.f32 %f5395, %f5377, %f5380;
mul.f32 %f2918, %f318, 0f3F22F983;
cvt.rni.s32.f32 %r8082, %f2918;
cvt.rn.f32.s32 %f2919, %r8082;
mov.f32 %f2920, 0fBFC90FDA;
fma.rn.f32 %f2921, %f2919, %f2920, %f318;
mov.f32 %f2922, 0fB3A22168;
fma.rn.f32 %f2923, %f2919, %f2922, %f2921;
mov.f32 %f2924, 0fA7C234C5;
fma.rn.f32 %f5381, %f2919, %f2924, %f2923;
abs.f32 %f471, %f318;
setp.ltu.f32 %p377, %f471, 0f47CE4780;
@%p377 bra $L__BB0_426;
setp.eq.f32 %p378, %f471, 0f7F800000;
@%p378 bra $L__BB0_425;
bra.uni $L__BB0_420;
$L__BB0_425:
mov.f32 %f2927, 0f00000000;
mul.rn.f32 %f5381, %f318, %f2927;
mov.u32 %r8082, 0;
bra.uni $L__BB0_426;
$L__BB0_420:
mov.b32 %r557, %f318;
shr.u32 %r3723, %r557, 23;
and.b32 %r3724, %r3723, 255;
add.s32 %r558, %r3724, -128;
shl.b32 %r3725, %r557, 8;
or.b32 %r559, %r3725, -2147483648;
shr.u32 %r560, %r558, 5;
mov.u64 %rd2561, 0;
mov.u32 %r8079, 0;
mov.u64 %rd1040, __cudart_i2opi_f;
mov.u64 %rd2562, %rd2561;
$L__BB0_421:
.pragma "nounroll";
shl.b64 %rd1039, %rd2561, 2;
add.s64 %rd1041, %rd1040, %rd1039;
ld.global.nc.u32 %r3726, [%rd1041];
mad.wide.u32 %rd1042, %r3726, %r559, %rd2562;
shr.u64 %rd2562, %rd1042, 32;
add.s64 %rd1043, %rd1, %rd1039;
st.local.u32 [%rd1043], %rd1042;
add.s32 %r8079, %r8079, 1;
cvt.s64.s32 %rd2561, %r8079;
setp.ne.s32 %p379, %r8079, 6;
@%p379 bra $L__BB0_421;
st.local.u32 [%rd4], %rd2562;
mov.u32 %r3727, 4;
sub.s32 %r563, %r3727, %r560;
mov.u32 %r3728, 6;
sub.s32 %r3729, %r3728, %r560;
mul.wide.s32 %rd1044, %r3729, 4;
add.s64 %rd1045, %rd1, %rd1044;
ld.local.u32 %r8080, [%rd1045];
ld.local.u32 %r8081, [%rd1045+-4];
and.b32 %r566, %r558, 31;
setp.eq.s32 %p380, %r566, 0;
@%p380 bra $L__BB0_424;
mov.u32 %r3730, 32;
sub.s32 %r3731, %r3730, %r566;
shr.u32 %r3732, %r8081, %r3731;
shl.b32 %r3733, %r8080, %r566;
add.s32 %r8080, %r3732, %r3733;
mul.wide.s32 %rd1046, %r563, 4;
add.s64 %rd1047, %rd1, %rd1046;
ld.local.u32 %r3734, [%rd1047];
shr.u32 %r3735, %r3734, %r3731;
shl.b32 %r3736, %r8081, %r566;
add.s32 %r8081, %r3735, %r3736;
$L__BB0_424:
and.b32 %r3737, %r557, -2147483648;
shr.u32 %r3738, %r8081, 30;
shl.b32 %r3739, %r8080, 2;
or.b32 %r3740, %r3738, %r3739;
shr.u32 %r3741, %r3740, 31;
shr.u32 %r3742, %r8080, 30;
add.s32 %r3743, %r3741, %r3742;
neg.s32 %r3744, %r3743;
setp.eq.s32 %p381, %r3737, 0;
selp.b32 %r8082, %r3743, %r3744, %p381;
setp.ne.s32 %p382, %r3741, 0;
xor.b32 %r3745, %r3737, -2147483648;
selp.b32 %r3746, %r3745, %r3737, %p382;
selp.b32 %r3747, -1, 0, %p382;
xor.b32 %r3748, %r3740, %r3747;
shl.b32 %r3749, %r8081, 2;
xor.b32 %r3750, %r3749, %r3747;
cvt.u64.u32 %rd1048, %r3748;
cvt.u64.u32 %rd1049, %r3750;
bfi.b64 %rd1050, %rd1048, %rd1049, 32, 32;
cvt.rn.f64.s64 %fd57, %rd1050;
mul.f64 %fd58, %fd57, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2925, %fd58;
setp.eq.s32 %p383, %r3746, 0;
neg.f32 %f2926, %f2925;
selp.f32 %f5381, %f2925, %f2926, %p383;
$L__BB0_426:
and.b32 %r573, %r8082, 1;
setp.eq.s32 %p384, %r573, 0;
selp.f32 %f475, %f5381, 0f3F800000, %p384;
mul.rn.f32 %f476, %f5381, %f5381;
mov.f32 %f5382, 0fB94D4153;
@%p384 bra $L__BB0_428;
mov.f32 %f2929, 0fBAB607ED;
mov.f32 %f2930, 0f37CBAC00;
fma.rn.f32 %f5382, %f2930, %f476, %f2929;
$L__BB0_428:
selp.f32 %f2931, 0f3C0885E4, 0f3D2AAABB, %p384;
fma.rn.f32 %f2932, %f5382, %f476, %f2931;
selp.f32 %f2933, 0fBE2AAAA8, 0fBEFFFFFF, %p384;
fma.rn.f32 %f2934, %f2932, %f476, %f2933;
mov.f32 %f2935, 0f00000000;
fma.rn.f32 %f2936, %f476, %f475, %f2935;
fma.rn.f32 %f5383, %f2934, %f2936, %f475;
and.b32 %r3752, %r8082, 2;
setp.eq.s32 %p386, %r3752, 0;
@%p386 bra $L__BB0_430;
mov.f32 %f2938, 0fBF800000;
fma.rn.f32 %f5383, %f5383, %f2938, %f2935;
$L__BB0_430:
mul.f32 %f2939, %f310, 0f3F22F983;
cvt.rni.s32.f32 %r8086, %f2939;
cvt.rn.f32.s32 %f2940, %r8086;
mov.f32 %f2941, 0fBFC90FDA;
fma.rn.f32 %f2942, %f2940, %f2941, %f310;
mov.f32 %f2943, 0fB3A22168;
fma.rn.f32 %f2944, %f2940, %f2943, %f2942;
mov.f32 %f2945, 0fA7C234C5;
fma.rn.f32 %f5384, %f2940, %f2945, %f2944;
abs.f32 %f483, %f310;
setp.ltu.f32 %p387, %f483, 0f47CE4780;
@%p387 bra $L__BB0_438;
setp.eq.f32 %p388, %f483, 0f7F800000;
@%p388 bra $L__BB0_437;
bra.uni $L__BB0_432;
$L__BB0_437:
mov.f32 %f2948, 0f00000000;
mul.rn.f32 %f5384, %f310, %f2948;
mov.u32 %r8086, 0;
bra.uni $L__BB0_438;
$L__BB0_432:
mov.b32 %r575, %f310;
shr.u32 %r3754, %r575, 23;
and.b32 %r3755, %r3754, 255;
add.s32 %r576, %r3755, -128;
shl.b32 %r3756, %r575, 8;
or.b32 %r577, %r3756, -2147483648;
shr.u32 %r578, %r576, 5;
mov.u64 %rd2563, 0;
mov.u32 %r8083, 0;
mov.u64 %rd1054, __cudart_i2opi_f;
mov.u64 %rd2564, %rd2563;
$L__BB0_433:
.pragma "nounroll";
shl.b64 %rd1053, %rd2563, 2;
add.s64 %rd1055, %rd1054, %rd1053;
ld.global.nc.u32 %r3757, [%rd1055];
mad.wide.u32 %rd1056, %r3757, %r577, %rd2564;
shr.u64 %rd2564, %rd1056, 32;
add.s64 %rd1057, %rd1, %rd1053;
st.local.u32 [%rd1057], %rd1056;
add.s32 %r8083, %r8083, 1;
cvt.s64.s32 %rd2563, %r8083;
setp.ne.s32 %p389, %r8083, 6;
@%p389 bra $L__BB0_433;
st.local.u32 [%rd4], %rd2564;
mov.u32 %r3758, 4;
sub.s32 %r581, %r3758, %r578;
mov.u32 %r3759, 6;
sub.s32 %r3760, %r3759, %r578;
mul.wide.s32 %rd1058, %r3760, 4;
add.s64 %rd1059, %rd1, %rd1058;
ld.local.u32 %r8084, [%rd1059];
ld.local.u32 %r8085, [%rd1059+-4];
and.b32 %r584, %r576, 31;
setp.eq.s32 %p390, %r584, 0;
@%p390 bra $L__BB0_436;
mov.u32 %r3761, 32;
sub.s32 %r3762, %r3761, %r584;
shr.u32 %r3763, %r8085, %r3762;
shl.b32 %r3764, %r8084, %r584;
add.s32 %r8084, %r3763, %r3764;
mul.wide.s32 %rd1060, %r581, 4;
add.s64 %rd1061, %rd1, %rd1060;
ld.local.u32 %r3765, [%rd1061];
shr.u32 %r3766, %r3765, %r3762;
shl.b32 %r3767, %r8085, %r584;
add.s32 %r8085, %r3766, %r3767;
$L__BB0_436:
and.b32 %r3768, %r575, -2147483648;
shr.u32 %r3769, %r8085, 30;
shl.b32 %r3770, %r8084, 2;
or.b32 %r3771, %r3769, %r3770;
shr.u32 %r3772, %r3771, 31;
shr.u32 %r3773, %r8084, 30;
add.s32 %r3774, %r3772, %r3773;
neg.s32 %r3775, %r3774;
setp.eq.s32 %p391, %r3768, 0;
selp.b32 %r8086, %r3774, %r3775, %p391;
setp.ne.s32 %p392, %r3772, 0;
xor.b32 %r3776, %r3768, -2147483648;
selp.b32 %r3777, %r3776, %r3768, %p392;
selp.b32 %r3778, -1, 0, %p392;
xor.b32 %r3779, %r3771, %r3778;
shl.b32 %r3780, %r8085, 2;
xor.b32 %r3781, %r3780, %r3778;
cvt.u64.u32 %rd1062, %r3779;
cvt.u64.u32 %rd1063, %r3781;
bfi.b64 %rd1064, %rd1062, %rd1063, 32, 32;
cvt.rn.f64.s64 %fd59, %rd1064;
mul.f64 %fd60, %fd59, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2946, %fd60;
setp.eq.s32 %p393, %r3777, 0;
neg.f32 %f2947, %f2946;
selp.f32 %f5384, %f2946, %f2947, %p393;
$L__BB0_438:
add.s32 %r591, %r8086, 1;
and.b32 %r592, %r591, 1;
setp.eq.s32 %p394, %r592, 0;
selp.f32 %f487, %f5384, 0f3F800000, %p394;
mul.rn.f32 %f488, %f5384, %f5384;
mov.f32 %f5385, 0fB94D4153;
@%p394 bra $L__BB0_440;
mov.f32 %f2950, 0fBAB607ED;
mov.f32 %f2951, 0f37CBAC00;
fma.rn.f32 %f5385, %f2951, %f488, %f2950;
$L__BB0_440:
selp.f32 %f2952, 0f3C0885E4, 0f3D2AAABB, %p394;
fma.rn.f32 %f2953, %f5385, %f488, %f2952;
selp.f32 %f2954, 0fBE2AAAA8, 0fBEFFFFFF, %p394;
fma.rn.f32 %f2955, %f2953, %f488, %f2954;
mov.f32 %f2956, 0f00000000;
fma.rn.f32 %f2957, %f488, %f487, %f2956;
fma.rn.f32 %f5386, %f2955, %f2957, %f487;
and.b32 %r3783, %r591, 2;
setp.eq.s32 %p396, %r3783, 0;
@%p396 bra $L__BB0_442;
mov.f32 %f2959, 0fBF800000;
fma.rn.f32 %f5386, %f5386, %f2959, %f2956;
$L__BB0_442:
add.f32 %f5394, %f5383, %f5386;
mul.f32 %f2960, %f319, 0f3F22F983;
cvt.rni.s32.f32 %r8090, %f2960;
cvt.rn.f32.s32 %f2961, %r8090;
mov.f32 %f2962, 0fBFC90FDA;
fma.rn.f32 %f2963, %f2961, %f2962, %f319;
mov.f32 %f2964, 0fB3A22168;
fma.rn.f32 %f2965, %f2961, %f2964, %f2963;
mov.f32 %f2966, 0fA7C234C5;
fma.rn.f32 %f5387, %f2961, %f2966, %f2965;
abs.f32 %f496, %f319;
setp.ltu.f32 %p397, %f496, 0f47CE4780;
@%p397 bra $L__BB0_450;
setp.eq.f32 %p398, %f496, 0f7F800000;
@%p398 bra $L__BB0_449;
bra.uni $L__BB0_444;
$L__BB0_449:
mov.f32 %f2969, 0f00000000;
mul.rn.f32 %f5387, %f319, %f2969;
mov.u32 %r8090, 0;
bra.uni $L__BB0_450;
$L__BB0_444:
mov.b32 %r594, %f319;
shr.u32 %r3785, %r594, 23;
and.b32 %r3786, %r3785, 255;
add.s32 %r595, %r3786, -128;
shl.b32 %r3787, %r594, 8;
or.b32 %r596, %r3787, -2147483648;
shr.u32 %r597, %r595, 5;
mov.u64 %rd2565, 0;
mov.u32 %r8087, 0;
mov.u64 %rd1068, __cudart_i2opi_f;
mov.u64 %rd2566, %rd2565;
$L__BB0_445:
.pragma "nounroll";
shl.b64 %rd1067, %rd2565, 2;
add.s64 %rd1069, %rd1068, %rd1067;
ld.global.nc.u32 %r3788, [%rd1069];
mad.wide.u32 %rd1070, %r3788, %r596, %rd2566;
shr.u64 %rd2566, %rd1070, 32;
add.s64 %rd1071, %rd1, %rd1067;
st.local.u32 [%rd1071], %rd1070;
add.s32 %r8087, %r8087, 1;
cvt.s64.s32 %rd2565, %r8087;
setp.ne.s32 %p399, %r8087, 6;
@%p399 bra $L__BB0_445;
st.local.u32 [%rd4], %rd2566;
mov.u32 %r3789, 4;
sub.s32 %r600, %r3789, %r597;
mov.u32 %r3790, 6;
sub.s32 %r3791, %r3790, %r597;
mul.wide.s32 %rd1072, %r3791, 4;
add.s64 %rd1073, %rd1, %rd1072;
ld.local.u32 %r8088, [%rd1073];
ld.local.u32 %r8089, [%rd1073+-4];
and.b32 %r603, %r595, 31;
setp.eq.s32 %p400, %r603, 0;
@%p400 bra $L__BB0_448;
mov.u32 %r3792, 32;
sub.s32 %r3793, %r3792, %r603;
shr.u32 %r3794, %r8089, %r3793;
shl.b32 %r3795, %r8088, %r603;
add.s32 %r8088, %r3794, %r3795;
mul.wide.s32 %rd1074, %r600, 4;
add.s64 %rd1075, %rd1, %rd1074;
ld.local.u32 %r3796, [%rd1075];
shr.u32 %r3797, %r3796, %r3793;
shl.b32 %r3798, %r8089, %r603;
add.s32 %r8089, %r3797, %r3798;
$L__BB0_448:
and.b32 %r3799, %r594, -2147483648;
shr.u32 %r3800, %r8089, 30;
shl.b32 %r3801, %r8088, 2;
or.b32 %r3802, %r3800, %r3801;
shr.u32 %r3803, %r3802, 31;
shr.u32 %r3804, %r8088, 30;
add.s32 %r3805, %r3803, %r3804;
neg.s32 %r3806, %r3805;
setp.eq.s32 %p401, %r3799, 0;
selp.b32 %r8090, %r3805, %r3806, %p401;
setp.ne.s32 %p402, %r3803, 0;
xor.b32 %r3807, %r3799, -2147483648;
selp.b32 %r3808, %r3807, %r3799, %p402;
selp.b32 %r3809, -1, 0, %p402;
xor.b32 %r3810, %r3802, %r3809;
shl.b32 %r3811, %r8089, 2;
xor.b32 %r3812, %r3811, %r3809;
cvt.u64.u32 %rd1076, %r3810;
cvt.u64.u32 %rd1077, %r3812;
bfi.b64 %rd1078, %rd1076, %rd1077, 32, 32;
cvt.rn.f64.s64 %fd61, %rd1078;
mul.f64 %fd62, %fd61, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2967, %fd62;
setp.eq.s32 %p403, %r3808, 0;
neg.f32 %f2968, %f2967;
selp.f32 %f5387, %f2967, %f2968, %p403;
$L__BB0_450:
and.b32 %r610, %r8090, 1;
setp.eq.s32 %p404, %r610, 0;
selp.f32 %f500, %f5387, 0f3F800000, %p404;
mul.rn.f32 %f501, %f5387, %f5387;
mov.f32 %f5388, 0fB94D4153;
@%p404 bra $L__BB0_452;
mov.f32 %f2971, 0fBAB607ED;
mov.f32 %f2972, 0f37CBAC00;
fma.rn.f32 %f5388, %f2972, %f501, %f2971;
$L__BB0_452:
selp.f32 %f2973, 0f3C0885E4, 0f3D2AAABB, %p404;
fma.rn.f32 %f2974, %f5388, %f501, %f2973;
selp.f32 %f2975, 0fBE2AAAA8, 0fBEFFFFFF, %p404;
fma.rn.f32 %f2976, %f2974, %f501, %f2975;
mov.f32 %f2977, 0f00000000;
fma.rn.f32 %f2978, %f501, %f500, %f2977;
fma.rn.f32 %f5389, %f2976, %f2978, %f500;
and.b32 %r3814, %r8090, 2;
setp.eq.s32 %p406, %r3814, 0;
@%p406 bra $L__BB0_454;
mov.f32 %f2980, 0fBF800000;
fma.rn.f32 %f5389, %f5389, %f2980, %f2977;
$L__BB0_454:
mul.f32 %f2981, %f311, 0f3F22F983;
cvt.rni.s32.f32 %r8094, %f2981;
cvt.rn.f32.s32 %f2982, %r8094;
mov.f32 %f2983, 0fBFC90FDA;
fma.rn.f32 %f2984, %f2982, %f2983, %f311;
mov.f32 %f2985, 0fB3A22168;
fma.rn.f32 %f2986, %f2982, %f2985, %f2984;
mov.f32 %f2987, 0fA7C234C5;
fma.rn.f32 %f5390, %f2982, %f2987, %f2986;
abs.f32 %f508, %f311;
setp.ltu.f32 %p407, %f508, 0f47CE4780;
@%p407 bra $L__BB0_462;
setp.eq.f32 %p408, %f508, 0f7F800000;
@%p408 bra $L__BB0_461;
bra.uni $L__BB0_456;
$L__BB0_461:
mov.f32 %f2990, 0f00000000;
mul.rn.f32 %f5390, %f311, %f2990;
mov.u32 %r8094, 0;
bra.uni $L__BB0_462;
$L__BB0_456:
mov.b32 %r612, %f311;
shr.u32 %r3816, %r612, 23;
and.b32 %r3817, %r3816, 255;
add.s32 %r613, %r3817, -128;
shl.b32 %r3818, %r612, 8;
or.b32 %r614, %r3818, -2147483648;
shr.u32 %r615, %r613, 5;
mov.u64 %rd2567, 0;
mov.u32 %r8091, 0;
mov.u64 %rd1082, __cudart_i2opi_f;
mov.u64 %rd2568, %rd2567;
$L__BB0_457:
.pragma "nounroll";
shl.b64 %rd1081, %rd2567, 2;
add.s64 %rd1083, %rd1082, %rd1081;
ld.global.nc.u32 %r3819, [%rd1083];
mad.wide.u32 %rd1084, %r3819, %r614, %rd2568;
shr.u64 %rd2568, %rd1084, 32;
add.s64 %rd1085, %rd1, %rd1081;
st.local.u32 [%rd1085], %rd1084;
add.s32 %r8091, %r8091, 1;
cvt.s64.s32 %rd2567, %r8091;
setp.ne.s32 %p409, %r8091, 6;
@%p409 bra $L__BB0_457;
st.local.u32 [%rd4], %rd2568;
mov.u32 %r3820, 4;
sub.s32 %r618, %r3820, %r615;
mov.u32 %r3821, 6;
sub.s32 %r3822, %r3821, %r615;
mul.wide.s32 %rd1086, %r3822, 4;
add.s64 %rd1087, %rd1, %rd1086;
ld.local.u32 %r8092, [%rd1087];
ld.local.u32 %r8093, [%rd1087+-4];
and.b32 %r621, %r613, 31;
setp.eq.s32 %p410, %r621, 0;
@%p410 bra $L__BB0_460;
mov.u32 %r3823, 32;
sub.s32 %r3824, %r3823, %r621;
shr.u32 %r3825, %r8093, %r3824;
shl.b32 %r3826, %r8092, %r621;
add.s32 %r8092, %r3825, %r3826;
mul.wide.s32 %rd1088, %r618, 4;
add.s64 %rd1089, %rd1, %rd1088;
ld.local.u32 %r3827, [%rd1089];
shr.u32 %r3828, %r3827, %r3824;
shl.b32 %r3829, %r8093, %r621;
add.s32 %r8093, %r3828, %r3829;
$L__BB0_460:
and.b32 %r3830, %r612, -2147483648;
shr.u32 %r3831, %r8093, 30;
shl.b32 %r3832, %r8092, 2;
or.b32 %r3833, %r3831, %r3832;
shr.u32 %r3834, %r3833, 31;
shr.u32 %r3835, %r8092, 30;
add.s32 %r3836, %r3834, %r3835;
neg.s32 %r3837, %r3836;
setp.eq.s32 %p411, %r3830, 0;
selp.b32 %r8094, %r3836, %r3837, %p411;
setp.ne.s32 %p412, %r3834, 0;
xor.b32 %r3838, %r3830, -2147483648;
selp.b32 %r3839, %r3838, %r3830, %p412;
selp.b32 %r3840, -1, 0, %p412;
xor.b32 %r3841, %r3833, %r3840;
shl.b32 %r3842, %r8093, 2;
xor.b32 %r3843, %r3842, %r3840;
cvt.u64.u32 %rd1090, %r3841;
cvt.u64.u32 %rd1091, %r3843;
bfi.b64 %rd1092, %rd1090, %rd1091, 32, 32;
cvt.rn.f64.s64 %fd63, %rd1092;
mul.f64 %fd64, %fd63, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2988, %fd64;
setp.eq.s32 %p413, %r3839, 0;
neg.f32 %f2989, %f2988;
selp.f32 %f5390, %f2988, %f2989, %p413;
$L__BB0_462:
add.s32 %r628, %r8094, 1;
and.b32 %r629, %r628, 1;
setp.eq.s32 %p414, %r629, 0;
selp.f32 %f512, %f5390, 0f3F800000, %p414;
mul.rn.f32 %f513, %f5390, %f5390;
mov.f32 %f5391, 0fB94D4153;
@%p414 bra $L__BB0_464;
mov.f32 %f2992, 0fBAB607ED;
mov.f32 %f2993, 0f37CBAC00;
fma.rn.f32 %f5391, %f2993, %f513, %f2992;
$L__BB0_464:
selp.f32 %f2994, 0f3C0885E4, 0f3D2AAABB, %p414;
fma.rn.f32 %f2995, %f5391, %f513, %f2994;
selp.f32 %f2996, 0fBE2AAAA8, 0fBEFFFFFF, %p414;
fma.rn.f32 %f2997, %f2995, %f513, %f2996;
mov.f32 %f2998, 0f00000000;
fma.rn.f32 %f2999, %f513, %f512, %f2998;
fma.rn.f32 %f5392, %f2997, %f2999, %f512;
and.b32 %r3845, %r628, 2;
setp.eq.s32 %p416, %r3845, 0;
@%p416 bra $L__BB0_466;
mov.f32 %f3001, 0fBF800000;
fma.rn.f32 %f5392, %f5392, %f3001, %f2998;
$L__BB0_466:
add.f32 %f5393, %f5389, %f5392;
bra.uni $L__BB0_467;
$L__BB0_47:
mov.b32 %r2786, %f5416;
shl.b32 %r2787, %r2786, 8;
or.b32 %r34, %r2787, -2147483648;
mov.u64 %rd2505, 0;
mov.u32 %r7967, 0;
mov.u64 %rd624, __cudart_i2opi_f;
mov.u64 %rd2506, %rd2505;
$L__BB0_48:
.pragma "nounroll";
shl.b64 %rd623, %rd2505, 2;
add.s64 %rd625, %rd624, %rd623;
ld.global.nc.u32 %r2788, [%rd625];
mad.wide.u32 %rd626, %r2788, %r34, %rd2506;
shr.u64 %rd2506, %rd626, 32;
add.s64 %rd627, %rd1, %rd623;
st.local.u32 [%rd627], %rd626;
add.s32 %r7967, %r7967, 1;
cvt.s64.s32 %rd2505, %r7967;
setp.ne.s32 %p63, %r7967, 6;
@%p63 bra $L__BB0_48;
mov.b32 %r7823, %f5416;
shr.u32 %r2789, %r7823, 23;
and.b32 %r2790, %r2789, 255;
add.s32 %r2791, %r2790, -128;
shr.u32 %r2792, %r2791, 5;
st.local.u32 [%rd4], %rd2506;
and.b32 %r39, %r2791, 31;
mov.u32 %r2794, 6;
sub.s32 %r2795, %r2794, %r2792;
mul.wide.s32 %rd628, %r2795, 4;
add.s64 %rd629, %rd1, %rd628;
ld.local.u32 %r7968, [%rd629];
ld.local.u32 %r7969, [%rd629+-4];
setp.eq.s32 %p64, %r39, 0;
@%p64 bra $L__BB0_51;
mov.b32 %r7831, %f5416;
shr.u32 %r7830, %r7831, 23;
and.b32 %r7829, %r7830, 255;
add.s32 %r7828, %r7829, -128;
shr.u32 %r7827, %r7828, 5;
mov.u32 %r7826, 4;
sub.s32 %r7825, %r7826, %r7827;
mov.u32 %r2796, 32;
sub.s32 %r2797, %r2796, %r39;
shr.u32 %r2798, %r7969, %r2797;
shl.b32 %r2799, %r7968, %r39;
add.s32 %r7968, %r2798, %r2799;
mul.wide.s32 %rd630, %r7825, 4;
add.s64 %rd631, %rd1, %rd630;
ld.local.u32 %r2800, [%rd631];
shr.u32 %r2801, %r2800, %r2797;
shl.b32 %r2802, %r7969, %r39;
add.s32 %r7969, %r2801, %r2802;
$L__BB0_51:
mov.b32 %r7824, %f5416;
and.b32 %r2803, %r7824, -2147483648;
shr.u32 %r2804, %r7969, 30;
shl.b32 %r2805, %r7968, 2;
or.b32 %r2806, %r2804, %r2805;
shr.u32 %r2807, %r2806, 31;
shr.u32 %r2808, %r7968, 30;
add.s32 %r2809, %r2807, %r2808;
neg.s32 %r2810, %r2809;
setp.eq.s32 %p65, %r2803, 0;
selp.b32 %r7970, %r2809, %r2810, %p65;
setp.ne.s32 %p66, %r2807, 0;
xor.b32 %r2811, %r2803, -2147483648;
selp.b32 %r2812, %r2811, %r2803, %p66;
selp.b32 %r2813, -1, 0, %p66;
xor.b32 %r2814, %r2806, %r2813;
shl.b32 %r2815, %r7969, 2;
xor.b32 %r2816, %r2815, %r2813;
cvt.u64.u32 %rd632, %r2814;
cvt.u64.u32 %rd633, %r2816;
bfi.b64 %rd634, %rd632, %rd633, 32, 32;
cvt.rn.f64.s64 %fd1, %rd634;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2322, %fd2;
setp.eq.s32 %p67, %r2812, 0;
neg.f32 %f2323, %f2322;
selp.f32 %f5248, %f2322, %f2323, %p67;
$L__BB0_53:
and.b32 %r48, %r7970, 1;
setp.eq.s32 %p68, %r48, 0;
mul.rn.f32 %f47, %f5248, %f5248;
mov.f32 %f5249, 0fB94D4153;
@%p68 bra $L__BB0_55;
mov.f32 %f2326, 0fBAB607ED;
mov.f32 %f2327, 0f37CBAC00;
fma.rn.f32 %f5249, %f2327, %f47, %f2326;
$L__BB0_55:
and.b32 %r7832, %r7970, 1;
setp.eq.s32 %p1774, %r7832, 0;
selp.f32 %f5221, %f5248, 0f3F800000, %p1774;
selp.f32 %f2328, 0f3C0885E4, 0f3D2AAABB, %p1774;
fma.rn.f32 %f2329, %f5249, %f47, %f2328;
selp.f32 %f2330, 0fBE2AAAA8, 0fBEFFFFFF, %p1774;
fma.rn.f32 %f2331, %f2329, %f47, %f2330;
mov.f32 %f2332, 0f00000000;
fma.rn.f32 %f2333, %f47, %f5221, %f2332;
fma.rn.f32 %f5282, %f2331, %f2333, %f5221;
and.b32 %r2818, %r7970, 2;
setp.eq.s32 %p70, %r2818, 0;
@%p70 bra $L__BB0_57;
mov.f32 %f2335, 0fBF800000;
fma.rn.f32 %f5282, %f5282, %f2335, %f2332;
$L__BB0_57:
shl.b32 %r7822, %r12, 5;
neg.s32 %r7821, %r7822;
setp.ge.s32 %p1773, %r11, %r7821;
@%p1773 bra $L__BB0_70;
mul.f32 %f2337, %f5607, 0f3F22F983;
cvt.rni.s32.f32 %r7974, %f2337;
cvt.rn.f32.s32 %f2338, %r7974;
mov.f32 %f2339, 0fBFC90FDA;
fma.rn.f32 %f2340, %f2338, %f2339, %f5607;
mov.f32 %f2341, 0fB3A22168;
fma.rn.f32 %f2342, %f2338, %f2341, %f2340;
mov.f32 %f2343, 0fA7C234C5;
fma.rn.f32 %f5252, %f2338, %f2343, %f2342;
abs.f32 %f55, %f5607;
setp.ltu.f32 %p72, %f55, 0f47CE4780;
@%p72 bra $L__BB0_66;
setp.eq.f32 %p73, %f55, 0f7F800000;
@%p73 bra $L__BB0_65;
bra.uni $L__BB0_60;
$L__BB0_65:
mov.f32 %f2346, 0f00000000;
mul.rn.f32 %f5252, %f5607, %f2346;
mov.u32 %r7974, 0;
bra.uni $L__BB0_66;
$L__BB0_60:
mov.b32 %r50, %f5607;
shr.u32 %r2822, %r50, 23;
and.b32 %r2823, %r2822, 255;
shl.b32 %r2824, %r50, 8;
or.b32 %r52, %r2824, -2147483648;
mov.u64 %rd2507, 0;
mov.u32 %r7971, 0;
mov.u64 %rd638, __cudart_i2opi_f;
mov.u64 %rd2508, %rd2507;
$L__BB0_61:
.pragma "nounroll";
shl.b64 %rd637, %rd2507, 2;
add.s64 %rd639, %rd638, %rd637;
ld.global.nc.u32 %r2825, [%rd639];
mad.wide.u32 %rd640, %r2825, %r52, %rd2508;
shr.u64 %rd2508, %rd640, 32;
add.s64 %rd641, %rd1, %rd637;
st.local.u32 [%rd641], %rd640;
add.s32 %r7971, %r7971, 1;
cvt.s64.s32 %rd2507, %r7971;
setp.ne.s32 %p74, %r7971, 6;
@%p74 bra $L__BB0_61;
add.s32 %r7775, %r2823, -128;
mov.b32 %r7774, %f5607;
shr.u32 %r7773, %r7774, 23;
and.b32 %r7772, %r7773, 255;
add.s32 %r7771, %r7772, -128;
shr.u32 %r7770, %r7771, 5;
st.local.u32 [%rd4], %rd2508;
mov.u32 %r2827, 6;
sub.s32 %r2828, %r2827, %r7770;
mul.wide.s32 %rd642, %r2828, 4;
add.s64 %rd643, %rd1, %rd642;
ld.local.u32 %r7972, [%rd643];
ld.local.u32 %r7973, [%rd643+-4];
and.b32 %r59, %r7771, 31;
setp.eq.s32 %p75, %r59, 0;
@%p75 bra $L__BB0_64;
mov.b32 %r7839, %f5607;
shr.u32 %r7838, %r7839, 23;
and.b32 %r7837, %r7838, 255;
add.s32 %r7836, %r7837, -128;
shr.u32 %r7835, %r7836, 5;
mov.u32 %r7834, 4;
sub.s32 %r7833, %r7834, %r7835;
mov.u32 %r2829, 32;
sub.s32 %r2830, %r2829, %r59;
shr.u32 %r2831, %r7973, %r2830;
shl.b32 %r2832, %r7972, %r59;
add.s32 %r7972, %r2831, %r2832;
mul.wide.s32 %rd644, %r7833, 4;
add.s64 %rd645, %rd1, %rd644;
ld.local.u32 %r2833, [%rd645];
shr.u32 %r2834, %r2833, %r2830;
shl.b32 %r2835, %r7973, %r59;
add.s32 %r7973, %r2834, %r2835;
$L__BB0_64:
mov.b32 %r7776, %f5607;
and.b32 %r2836, %r7776, -2147483648;
shr.u32 %r2837, %r7973, 30;
shl.b32 %r2838, %r7972, 2;
or.b32 %r2839, %r2837, %r2838;
shr.u32 %r2840, %r2839, 31;
shr.u32 %r2841, %r7972, 30;
add.s32 %r2842, %r2840, %r2841;
neg.s32 %r2843, %r2842;
setp.eq.s32 %p76, %r2836, 0;
selp.b32 %r7974, %r2842, %r2843, %p76;
setp.ne.s32 %p77, %r2840, 0;
xor.b32 %r2844, %r2836, -2147483648;
selp.b32 %r2845, %r2844, %r2836, %p77;
selp.b32 %r2846, -1, 0, %p77;
xor.b32 %r2847, %r2839, %r2846;
shl.b32 %r2848, %r7973, 2;
xor.b32 %r2849, %r2848, %r2846;
cvt.u64.u32 %rd646, %r2847;
cvt.u64.u32 %rd647, %r2849;
bfi.b64 %rd648, %rd646, %rd647, 32, 32;
cvt.rn.f64.s64 %fd3, %rd648;
mul.f64 %fd4, %fd3, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2344, %fd4;
setp.eq.s32 %p78, %r2845, 0;
neg.f32 %f2345, %f2344;
selp.f32 %f5252, %f2344, %f2345, %p78;
$L__BB0_66:
add.s32 %r66, %r7974, 1;
and.b32 %r67, %r66, 1;
setp.eq.s32 %p79, %r67, 0;
selp.f32 %f59, %f5252, 0f3F800000, %p79;
mul.rn.f32 %f60, %f5252, %f5252;
mov.f32 %f5253, 0fB94D4153;
@%p79 bra $L__BB0_68;
mov.f32 %f2348, 0fBAB607ED;
mov.f32 %f2349, 0f37CBAC00;
fma.rn.f32 %f5253, %f2349, %f60, %f2348;
$L__BB0_68:
add.s32 %r7842, %r7974, 1;
add.s32 %r7841, %r7974, 1;
and.b32 %r7840, %r7841, 1;
setp.eq.s32 %p1775, %r7840, 0;
selp.f32 %f2350, 0f3C0885E4, 0f3D2AAABB, %p1775;
fma.rn.f32 %f2351, %f5253, %f60, %f2350;
selp.f32 %f2352, 0fBE2AAAA8, 0fBEFFFFFF, %p1775;
fma.rn.f32 %f2353, %f2351, %f60, %f2352;
mov.f32 %f2354, 0f00000000;
fma.rn.f32 %f2355, %f60, %f59, %f2354;
fma.rn.f32 %f5284, %f2353, %f2355, %f59;
and.b32 %r2851, %r7841, 2;
setp.eq.s32 %p81, %r2851, 0;
@%p81 bra $L__BB0_70;
mov.f32 %f2357, 0fBF800000;
fma.rn.f32 %f5284, %f5284, %f2357, %f2354;
$L__BB0_70:
add.f32 %f5400, %f5282, %f5284;
$L__BB0_71:
setp.gt.s32 %p1762, %r12, 14;
mov.f32 %f5281, %f5282;
mov.f32 %f5283, %f5284;
@%p1762 bra $L__BB0_100;
shl.b32 %r2852, %r12, 5;
mov.u32 %r2853, -32;
sub.s32 %r68, %r2853, %r2852;
setp.ge.s32 %p83, %r11, %r68;
mov.f32 %f5281, %f5282;
@%p83 bra $L__BB0_85;
mul.f32 %f2359, %f5415, 0f3F22F983;
cvt.rni.s32.f32 %r7978, %f2359;
cvt.rn.f32.s32 %f2360, %r7978;
mov.f32 %f2361, 0fBFC90FDA;
fma.rn.f32 %f2362, %f2360, %f2361, %f5415;
mov.f32 %f2363, 0fB3A22168;
fma.rn.f32 %f2364, %f2360, %f2363, %f2362;
mov.f32 %f2365, 0fA7C234C5;
fma.rn.f32 %f5259, %f2360, %f2365, %f2364;
abs.f32 %f72, %f5415;
setp.ltu.f32 %p84, %f72, 0f47CE4780;
@%p84 bra $L__BB0_81;
setp.eq.f32 %p85, %f72, 0f7F800000;
@%p85 bra $L__BB0_80;
bra.uni $L__BB0_75;
$L__BB0_80:
mov.f32 %f2368, 0f00000000;
mul.rn.f32 %f5259, %f5415, %f2368;
mov.u32 %r7978, 0;
bra.uni $L__BB0_81;
$L__BB0_75:
mov.b32 %r70, %f5415;
shr.u32 %r2855, %r70, 23;
and.b32 %r2856, %r2855, 255;
shl.b32 %r2857, %r70, 8;
or.b32 %r72, %r2857, -2147483648;
mov.u64 %rd2509, 0;
mov.u32 %r7975, 0;
mov.u64 %rd652, __cudart_i2opi_f;
mov.u64 %rd2510, %rd2509;
$L__BB0_76:
.pragma "nounroll";
shl.b64 %rd651, %rd2509, 2;
add.s64 %rd653, %rd652, %rd651;
ld.global.nc.u32 %r2858, [%rd653];
mad.wide.u32 %rd654, %r2858, %r72, %rd2510;
shr.u64 %rd2510, %rd654, 32;
add.s64 %rd655, %rd1, %rd651;
st.local.u32 [%rd655], %rd654;
add.s32 %r7975, %r7975, 1;
cvt.s64.s32 %rd2509, %r7975;
setp.ne.s32 %p86, %r7975, 6;
@%p86 bra $L__BB0_76;
add.s32 %r7851, %r2856, -128;
mov.b32 %r7850, %f5415;
shr.u32 %r7849, %r7850, 23;
and.b32 %r7848, %r7849, 255;
add.s32 %r7847, %r7848, -128;
shr.u32 %r7846, %r7847, 5;
st.local.u32 [%rd4], %rd2510;
mov.u32 %r2860, 6;
sub.s32 %r2861, %r2860, %r7846;
mul.wide.s32 %rd656, %r2861, 4;
add.s64 %rd657, %rd1, %rd656;
ld.local.u32 %r7976, [%rd657];
ld.local.u32 %r7977, [%rd657+-4];
and.b32 %r79, %r7847, 31;
setp.eq.s32 %p87, %r79, 0;
@%p87 bra $L__BB0_79;
mov.b32 %r7859, %f5415;
shr.u32 %r7858, %r7859, 23;
and.b32 %r7857, %r7858, 255;
add.s32 %r7856, %r7857, -128;
shr.u32 %r7855, %r7856, 5;
mov.u32 %r7854, 4;
sub.s32 %r7853, %r7854, %r7855;
mov.u32 %r2862, 32;
sub.s32 %r2863, %r2862, %r79;
shr.u32 %r2864, %r7977, %r2863;
shl.b32 %r2865, %r7976, %r79;
add.s32 %r7976, %r2864, %r2865;
mul.wide.s32 %rd658, %r7853, 4;
add.s64 %rd659, %rd1, %rd658;
ld.local.u32 %r2866, [%rd659];
shr.u32 %r2867, %r2866, %r2863;
shl.b32 %r2868, %r7977, %r79;
add.s32 %r7977, %r2867, %r2868;
$L__BB0_79:
mov.b32 %r7852, %f5415;
and.b32 %r2869, %r7852, -2147483648;
shr.u32 %r2870, %r7977, 30;
shl.b32 %r2871, %r7976, 2;
or.b32 %r2872, %r2870, %r2871;
shr.u32 %r2873, %r2872, 31;
shr.u32 %r2874, %r7976, 30;
add.s32 %r2875, %r2873, %r2874;
neg.s32 %r2876, %r2875;
setp.eq.s32 %p88, %r2869, 0;
selp.b32 %r7978, %r2875, %r2876, %p88;
setp.ne.s32 %p89, %r2873, 0;
xor.b32 %r2877, %r2869, -2147483648;
selp.b32 %r2878, %r2877, %r2869, %p89;
selp.b32 %r2879, -1, 0, %p89;
xor.b32 %r2880, %r2872, %r2879;
shl.b32 %r2881, %r7977, 2;
xor.b32 %r2882, %r2881, %r2879;
cvt.u64.u32 %rd660, %r2880;
cvt.u64.u32 %rd661, %r2882;
bfi.b64 %rd662, %rd660, %rd661, 32, 32;
cvt.rn.f64.s64 %fd5, %rd662;
mul.f64 %fd6, %fd5, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2366, %fd6;
setp.eq.s32 %p90, %r2878, 0;
neg.f32 %f2367, %f2366;
selp.f32 %f5259, %f2366, %f2367, %p90;
$L__BB0_81:
and.b32 %r86, %r7978, 1;
setp.eq.s32 %p91, %r86, 0;
mul.rn.f32 %f77, %f5259, %f5259;
mov.f32 %f5260, 0fB94D4153;
@%p91 bra $L__BB0_83;
mov.f32 %f2370, 0fBAB607ED;
mov.f32 %f2371, 0f37CBAC00;
fma.rn.f32 %f5260, %f2371, %f77, %f2370;
$L__BB0_83:
and.b32 %r7860, %r7978, 1;
setp.eq.s32 %p1778, %r7860, 0;
selp.f32 %f5222, %f5259, 0f3F800000, %p1778;
selp.f32 %f2372, 0f3C0885E4, 0f3D2AAABB, %p1778;
fma.rn.f32 %f2373, %f5260, %f77, %f2372;
selp.f32 %f2374, 0fBE2AAAA8, 0fBEFFFFFF, %p1778;
fma.rn.f32 %f2375, %f2373, %f77, %f2374;
mov.f32 %f2376, 0f00000000;
fma.rn.f32 %f2377, %f77, %f5222, %f2376;
fma.rn.f32 %f5281, %f2375, %f2377, %f5222;
and.b32 %r2884, %r7978, 2;
setp.eq.s32 %p93, %r2884, 0;
@%p93 bra $L__BB0_85;
mov.f32 %f2379, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f2379, %f2376;
$L__BB0_85:
shl.b32 %r7845, %r12, 5;
mov.u32 %r7844, -32;
sub.s32 %r7843, %r7844, %r7845;
setp.ge.s32 %p1776, %r11, %r7843;
mov.f32 %f5283, %f5284;
@%p1776 bra $L__BB0_98;
mul.f32 %f2380, %f5606, 0f3F22F983;
cvt.rni.s32.f32 %r7982, %f2380;
cvt.rn.f32.s32 %f2381, %r7982;
mov.f32 %f2382, 0fBFC90FDA;
fma.rn.f32 %f2383, %f2381, %f2382, %f5606;
mov.f32 %f2384, 0fB3A22168;
fma.rn.f32 %f2385, %f2381, %f2384, %f2383;
mov.f32 %f2386, 0fA7C234C5;
fma.rn.f32 %f5263, %f2381, %f2386, %f2385;
abs.f32 %f85, %f5606;
setp.ltu.f32 %p95, %f85, 0f47CE4780;
@%p95 bra $L__BB0_94;
setp.eq.f32 %p96, %f85, 0f7F800000;
@%p96 bra $L__BB0_93;
bra.uni $L__BB0_88;
$L__BB0_93:
mov.f32 %f2389, 0f00000000;
mul.rn.f32 %f5263, %f5606, %f2389;
mov.u32 %r7982, 0;
bra.uni $L__BB0_94;
$L__BB0_88:
mov.b32 %r88, %f5606;
shr.u32 %r2886, %r88, 23;
and.b32 %r2887, %r2886, 255;
shl.b32 %r2888, %r88, 8;
or.b32 %r90, %r2888, -2147483648;
mov.u64 %rd2511, 0;
mov.u32 %r7979, 0;
mov.u64 %rd666, __cudart_i2opi_f;
mov.u64 %rd2512, %rd2511;
$L__BB0_89:
.pragma "nounroll";
shl.b64 %rd665, %rd2511, 2;
add.s64 %rd667, %rd666, %rd665;
ld.global.nc.u32 %r2889, [%rd667];
mad.wide.u32 %rd668, %r2889, %r90, %rd2512;
shr.u64 %rd2512, %rd668, 32;
add.s64 %rd669, %rd1, %rd665;
st.local.u32 [%rd669], %rd668;
add.s32 %r7979, %r7979, 1;
cvt.s64.s32 %rd2511, %r7979;
setp.ne.s32 %p97, %r7979, 6;
@%p97 bra $L__BB0_89;
add.s32 %r7869, %r2887, -128;
mov.b32 %r7868, %f5606;
shr.u32 %r7867, %r7868, 23;
and.b32 %r7866, %r7867, 255;
add.s32 %r7865, %r7866, -128;
shr.u32 %r7864, %r7865, 5;
st.local.u32 [%rd4], %rd2512;
mov.u32 %r2891, 6;
sub.s32 %r2892, %r2891, %r7864;
mul.wide.s32 %rd670, %r2892, 4;
add.s64 %rd671, %rd1, %rd670;
ld.local.u32 %r7980, [%rd671];
ld.local.u32 %r7981, [%rd671+-4];
and.b32 %r97, %r7865, 31;
setp.eq.s32 %p98, %r97, 0;
@%p98 bra $L__BB0_92;
mov.b32 %r7877, %f5606;
shr.u32 %r7876, %r7877, 23;
and.b32 %r7875, %r7876, 255;
add.s32 %r7874, %r7875, -128;
shr.u32 %r7873, %r7874, 5;
mov.u32 %r7872, 4;
sub.s32 %r7871, %r7872, %r7873;
mov.u32 %r2893, 32;
sub.s32 %r2894, %r2893, %r97;
shr.u32 %r2895, %r7981, %r2894;
shl.b32 %r2896, %r7980, %r97;
add.s32 %r7980, %r2895, %r2896;
mul.wide.s32 %rd672, %r7871, 4;
add.s64 %rd673, %rd1, %rd672;
ld.local.u32 %r2897, [%rd673];
shr.u32 %r2898, %r2897, %r2894;
shl.b32 %r2899, %r7981, %r97;
add.s32 %r7981, %r2898, %r2899;
$L__BB0_92:
mov.b32 %r7870, %f5606;
and.b32 %r2900, %r7870, -2147483648;
shr.u32 %r2901, %r7981, 30;
shl.b32 %r2902, %r7980, 2;
or.b32 %r2903, %r2901, %r2902;
shr.u32 %r2904, %r2903, 31;
shr.u32 %r2905, %r7980, 30;
add.s32 %r2906, %r2904, %r2905;
neg.s32 %r2907, %r2906;
setp.eq.s32 %p99, %r2900, 0;
selp.b32 %r7982, %r2906, %r2907, %p99;
setp.ne.s32 %p100, %r2904, 0;
xor.b32 %r2908, %r2900, -2147483648;
selp.b32 %r2909, %r2908, %r2900, %p100;
selp.b32 %r2910, -1, 0, %p100;
xor.b32 %r2911, %r2903, %r2910;
shl.b32 %r2912, %r7981, 2;
xor.b32 %r2913, %r2912, %r2910;
cvt.u64.u32 %rd674, %r2911;
cvt.u64.u32 %rd675, %r2913;
bfi.b64 %rd676, %rd674, %rd675, 32, 32;
cvt.rn.f64.s64 %fd7, %rd676;
mul.f64 %fd8, %fd7, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2387, %fd8;
setp.eq.s32 %p101, %r2909, 0;
neg.f32 %f2388, %f2387;
selp.f32 %f5263, %f2387, %f2388, %p101;
$L__BB0_94:
add.s32 %r104, %r7982, 1;
and.b32 %r105, %r104, 1;
setp.eq.s32 %p102, %r105, 0;
selp.f32 %f89, %f5263, 0f3F800000, %p102;
mul.rn.f32 %f90, %f5263, %f5263;
mov.f32 %f5264, 0fB94D4153;
@%p102 bra $L__BB0_96;
mov.f32 %f2391, 0fBAB607ED;
mov.f32 %f2392, 0f37CBAC00;
fma.rn.f32 %f5264, %f2392, %f90, %f2391;
$L__BB0_96:
add.s32 %r7880, %r7982, 1;
add.s32 %r7879, %r7982, 1;
and.b32 %r7878, %r7879, 1;
setp.eq.s32 %p1780, %r7878, 0;
selp.f32 %f2393, 0f3C0885E4, 0f3D2AAABB, %p1780;
fma.rn.f32 %f2394, %f5264, %f90, %f2393;
selp.f32 %f2395, 0fBE2AAAA8, 0fBEFFFFFF, %p1780;
fma.rn.f32 %f2396, %f2394, %f90, %f2395;
mov.f32 %f2397, 0f00000000;
fma.rn.f32 %f2398, %f90, %f89, %f2397;
fma.rn.f32 %f5283, %f2396, %f2398, %f89;
and.b32 %r2915, %r7879, 2;
setp.eq.s32 %p104, %r2915, 0;
@%p104 bra $L__BB0_98;
mov.f32 %f2400, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f2400, %f2397;
$L__BB0_98:
shl.b32 %r7863, %r12, 5;
mov.u32 %r7862, -32;
sub.s32 %r7861, %r7862, %r7863;
setp.lt.s32 %p1779, %r11, %r7861;
setp.ge.s32 %p1777, %r11, %r68;
selp.f32 %f97, %f5283, %f5284, %p1779;
selp.f32 %f98, %f5281, %f5282, %p1779;
@%p1777 bra $L__BB0_100;
add.f32 %f5399, %f98, %f97;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_100:
add.s32 %r7804, %r12, 1;
setp.gt.s32 %p1771, %r7804, 14;
@%p1771 bra $L__BB0_129;
shl.b32 %r2917, %r12, 5;
neg.s32 %r106, %r2917;
setp.ge.s32 %p108, %r11, %r106;
@%p108 bra $L__BB0_114;
mul.f32 %f2403, %f5414, 0f3F22F983;
cvt.rni.s32.f32 %r7986, %f2403;
cvt.rn.f32.s32 %f2404, %r7986;
mov.f32 %f2405, 0fBFC90FDA;
fma.rn.f32 %f2406, %f2404, %f2405, %f5414;
mov.f32 %f2407, 0fB3A22168;
fma.rn.f32 %f2408, %f2404, %f2407, %f2406;
mov.f32 %f2409, 0fA7C234C5;
fma.rn.f32 %f5272, %f2404, %f2409, %f2408;
abs.f32 %f106, %f5414;
setp.ltu.f32 %p109, %f106, 0f47CE4780;
@%p109 bra $L__BB0_110;
setp.eq.f32 %p110, %f106, 0f7F800000;
@%p110 bra $L__BB0_109;
bra.uni $L__BB0_104;
$L__BB0_109:
mov.f32 %f2412, 0f00000000;
mul.rn.f32 %f5272, %f5414, %f2412;
mov.u32 %r7986, 0;
bra.uni $L__BB0_110;
$L__BB0_104:
mov.b32 %r108, %f5414;
shr.u32 %r2919, %r108, 23;
and.b32 %r2920, %r2919, 255;
shl.b32 %r2921, %r108, 8;
or.b32 %r110, %r2921, -2147483648;
mov.u64 %rd2513, 0;
mov.u32 %r7983, 0;
mov.u64 %rd680, __cudart_i2opi_f;
mov.u64 %rd2514, %rd2513;
$L__BB0_105:
.pragma "nounroll";
shl.b64 %rd679, %rd2513, 2;
add.s64 %rd681, %rd680, %rd679;
ld.global.nc.u32 %r2922, [%rd681];
mad.wide.u32 %rd682, %r2922, %r110, %rd2514;
shr.u64 %rd2514, %rd682, 32;
add.s64 %rd683, %rd1, %rd679;
st.local.u32 [%rd683], %rd682;
add.s32 %r7983, %r7983, 1;
cvt.s64.s32 %rd2513, %r7983;
setp.ne.s32 %p111, %r7983, 6;
@%p111 bra $L__BB0_105;
add.s32 %r7886, %r2920, -128;
mov.b32 %r7885, %f5414;
shr.u32 %r7884, %r7885, 23;
and.b32 %r7883, %r7884, 255;
add.s32 %r7882, %r7883, -128;
shr.u32 %r7881, %r7882, 5;
st.local.u32 [%rd4], %rd2514;
mov.u32 %r2924, 6;
sub.s32 %r2925, %r2924, %r7881;
mul.wide.s32 %rd684, %r2925, 4;
add.s64 %rd685, %rd1, %rd684;
ld.local.u32 %r7984, [%rd685];
ld.local.u32 %r7985, [%rd685+-4];
and.b32 %r117, %r7882, 31;
setp.eq.s32 %p112, %r117, 0;
@%p112 bra $L__BB0_108;
mov.b32 %r7896, %f5414;
shr.u32 %r7895, %r7896, 23;
and.b32 %r7894, %r7895, 255;
add.s32 %r7893, %r7894, -128;
shr.u32 %r7892, %r7893, 5;
mov.u32 %r7891, 4;
sub.s32 %r7890, %r7891, %r7892;
mov.u32 %r2926, 32;
sub.s32 %r2927, %r2926, %r117;
shr.u32 %r2928, %r7985, %r2927;
shl.b32 %r2929, %r7984, %r117;
add.s32 %r7984, %r2928, %r2929;
mul.wide.s32 %rd686, %r7890, 4;
add.s64 %rd687, %rd1, %rd686;
ld.local.u32 %r2930, [%rd687];
shr.u32 %r2931, %r2930, %r2927;
shl.b32 %r2932, %r7985, %r117;
add.s32 %r7985, %r2931, %r2932;
$L__BB0_108:
mov.b32 %r7887, %f5414;
and.b32 %r2933, %r7887, -2147483648;
shr.u32 %r2934, %r7985, 30;
shl.b32 %r2935, %r7984, 2;
or.b32 %r2936, %r2934, %r2935;
shr.u32 %r2937, %r2936, 31;
shr.u32 %r2938, %r7984, 30;
add.s32 %r2939, %r2937, %r2938;
neg.s32 %r2940, %r2939;
setp.eq.s32 %p113, %r2933, 0;
selp.b32 %r7986, %r2939, %r2940, %p113;
setp.ne.s32 %p114, %r2937, 0;
xor.b32 %r2941, %r2933, -2147483648;
selp.b32 %r2942, %r2941, %r2933, %p114;
selp.b32 %r2943, -1, 0, %p114;
xor.b32 %r2944, %r2936, %r2943;
shl.b32 %r2945, %r7985, 2;
xor.b32 %r2946, %r2945, %r2943;
cvt.u64.u32 %rd688, %r2944;
cvt.u64.u32 %rd689, %r2946;
bfi.b64 %rd690, %rd688, %rd689, 32, 32;
cvt.rn.f64.s64 %fd9, %rd690;
mul.f64 %fd10, %fd9, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2410, %fd10;
setp.eq.s32 %p115, %r2942, 0;
neg.f32 %f2411, %f2410;
selp.f32 %f5272, %f2410, %f2411, %p115;
$L__BB0_110:
and.b32 %r124, %r7986, 1;
setp.eq.s32 %p116, %r124, 0;
mul.rn.f32 %f111, %f5272, %f5272;
mov.f32 %f5273, 0fB94D4153;
@%p116 bra $L__BB0_112;
mov.f32 %f2414, 0fBAB607ED;
mov.f32 %f2415, 0f37CBAC00;
fma.rn.f32 %f5273, %f2415, %f111, %f2414;
$L__BB0_112:
and.b32 %r7952, %r7986, 1;
setp.eq.s32 %p1787, %r7952, 0;
selp.f32 %f5223, %f5272, 0f3F800000, %p1787;
selp.f32 %f2416, 0f3C0885E4, 0f3D2AAABB, %p1787;
fma.rn.f32 %f2417, %f5273, %f111, %f2416;
selp.f32 %f2418, 0fBE2AAAA8, 0fBEFFFFFF, %p1787;
fma.rn.f32 %f2419, %f2417, %f111, %f2418;
mov.f32 %f2420, 0f00000000;
fma.rn.f32 %f2421, %f111, %f5223, %f2420;
fma.rn.f32 %f5281, %f2419, %f2421, %f5223;
and.b32 %r2948, %r7986, 2;
setp.eq.s32 %p118, %r2948, 0;
@%p118 bra $L__BB0_114;
mov.f32 %f2423, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f2423, %f2420;
$L__BB0_114:
shl.b32 %r7951, %r12, 5;
neg.s32 %r7950, %r7951;
setp.ge.s32 %p1786, %r11, %r7950;
@%p1786 bra $L__BB0_127;
mul.f32 %f2424, %f5406, 0f3F22F983;
cvt.rni.s32.f32 %r7990, %f2424;
cvt.rn.f32.s32 %f2425, %r7990;
mov.f32 %f2426, 0fBFC90FDA;
fma.rn.f32 %f2427, %f2425, %f2426, %f5406;
mov.f32 %f2428, 0fB3A22168;
fma.rn.f32 %f2429, %f2425, %f2428, %f2427;
mov.f32 %f2430, 0fA7C234C5;
fma.rn.f32 %f5276, %f2425, %f2430, %f2429;
abs.f32 %f119, %f5406;
setp.ltu.f32 %p120, %f119, 0f47CE4780;
@%p120 bra $L__BB0_123;
setp.eq.f32 %p121, %f119, 0f7F800000;
@%p121 bra $L__BB0_122;
bra.uni $L__BB0_117;
$L__BB0_122:
mov.f32 %f2433, 0f00000000;
mul.rn.f32 %f5276, %f5406, %f2433;
mov.u32 %r7990, 0;
bra.uni $L__BB0_123;
$L__BB0_117:
mov.b32 %r126, %f5406;
shr.u32 %r2950, %r126, 23;
and.b32 %r2951, %r2950, 255;
shl.b32 %r2952, %r126, 8;
or.b32 %r128, %r2952, -2147483648;
mov.u64 %rd2515, 0;
mov.u32 %r7987, 0;
mov.u64 %rd694, __cudart_i2opi_f;
mov.u64 %rd2516, %rd2515;
$L__BB0_118:
.pragma "nounroll";
shl.b64 %rd693, %rd2515, 2;
add.s64 %rd695, %rd694, %rd693;
ld.global.nc.u32 %r2953, [%rd695];
mad.wide.u32 %rd696, %r2953, %r128, %rd2516;
shr.u64 %rd2516, %rd696, 32;
add.s64 %rd697, %rd1, %rd693;
st.local.u32 [%rd697], %rd696;
add.s32 %r7987, %r7987, 1;
cvt.s64.s32 %rd2515, %r7987;
setp.ne.s32 %p122, %r7987, 6;
@%p122 bra $L__BB0_118;
add.s32 %r7902, %r2951, -128;
mov.b32 %r7901, %f5406;
shr.u32 %r7900, %r7901, 23;
and.b32 %r7899, %r7900, 255;
add.s32 %r7898, %r7899, -128;
shr.u32 %r7897, %r7898, 5;
st.local.u32 [%rd4], %rd2516;
mov.u32 %r2955, 6;
sub.s32 %r2956, %r2955, %r7897;
mul.wide.s32 %rd698, %r2956, 4;
add.s64 %rd699, %rd1, %rd698;
ld.local.u32 %r7988, [%rd699];
ld.local.u32 %r7989, [%rd699+-4];
and.b32 %r135, %r7898, 31;
setp.eq.s32 %p123, %r135, 0;
@%p123 bra $L__BB0_121;
mov.b32 %r7914, %f5406;
shr.u32 %r7913, %r7914, 23;
and.b32 %r7912, %r7913, 255;
add.s32 %r7911, %r7912, -128;
shr.u32 %r7910, %r7911, 5;
mov.u32 %r7909, 4;
sub.s32 %r7908, %r7909, %r7910;
mov.u32 %r2957, 32;
sub.s32 %r2958, %r2957, %r135;
shr.u32 %r2959, %r7989, %r2958;
shl.b32 %r2960, %r7988, %r135;
add.s32 %r7988, %r2959, %r2960;
mul.wide.s32 %rd700, %r7908, 4;
add.s64 %rd701, %rd1, %rd700;
ld.local.u32 %r2961, [%rd701];
shr.u32 %r2962, %r2961, %r2958;
shl.b32 %r2963, %r7989, %r135;
add.s32 %r7989, %r2962, %r2963;
$L__BB0_121:
mov.b32 %r7903, %f5406;
and.b32 %r2964, %r7903, -2147483648;
shr.u32 %r2965, %r7989, 30;
shl.b32 %r2966, %r7988, 2;
or.b32 %r2967, %r2965, %r2966;
shr.u32 %r2968, %r2967, 31;
shr.u32 %r2969, %r7988, 30;
add.s32 %r2970, %r2968, %r2969;
neg.s32 %r2971, %r2970;
setp.eq.s32 %p124, %r2964, 0;
selp.b32 %r7990, %r2970, %r2971, %p124;
setp.ne.s32 %p125, %r2968, 0;
xor.b32 %r2972, %r2964, -2147483648;
selp.b32 %r2973, %r2972, %r2964, %p125;
selp.b32 %r2974, -1, 0, %p125;
xor.b32 %r2975, %r2967, %r2974;
shl.b32 %r2976, %r7989, 2;
xor.b32 %r2977, %r2976, %r2974;
cvt.u64.u32 %rd702, %r2975;
cvt.u64.u32 %rd703, %r2977;
bfi.b64 %rd704, %rd702, %rd703, 32, 32;
cvt.rn.f64.s64 %fd11, %rd704;
mul.f64 %fd12, %fd11, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2431, %fd12;
setp.eq.s32 %p126, %r2973, 0;
neg.f32 %f2432, %f2431;
selp.f32 %f5276, %f2431, %f2432, %p126;
$L__BB0_123:
add.s32 %r142, %r7990, 1;
and.b32 %r143, %r142, 1;
setp.eq.s32 %p127, %r143, 0;
selp.f32 %f123, %f5276, 0f3F800000, %p127;
mul.rn.f32 %f124, %f5276, %f5276;
mov.f32 %f5277, 0fB94D4153;
@%p127 bra $L__BB0_125;
mov.f32 %f2435, 0fBAB607ED;
mov.f32 %f2436, 0f37CBAC00;
fma.rn.f32 %f5277, %f2436, %f124, %f2435;
$L__BB0_125:
selp.f32 %f2437, 0f3C0885E4, 0f3D2AAABB, %p127;
fma.rn.f32 %f2438, %f5277, %f124, %f2437;
selp.f32 %f2439, 0fBE2AAAA8, 0fBEFFFFFF, %p127;
fma.rn.f32 %f2440, %f2438, %f124, %f2439;
mov.f32 %f2441, 0f00000000;
fma.rn.f32 %f2442, %f124, %f123, %f2441;
fma.rn.f32 %f5283, %f2440, %f2442, %f123;
and.b32 %r2979, %r142, 2;
setp.eq.s32 %p129, %r2979, 0;
@%p129 bra $L__BB0_127;
mov.f32 %f2444, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f2444, %f2441;
$L__BB0_127:
shl.b32 %r7907, %r12, 5;
neg.s32 %r7906, %r7907;
setp.lt.s32 %p1782, %r11, %r7906;
shl.b32 %r7905, %r12, 5;
neg.s32 %r7904, %r7905;
setp.ge.s32 %p1781, %r11, %r7904;
selp.f32 %f131, %f5283, %f5284, %p1782;
selp.f32 %f132, %f5281, %f5282, %p1782;
@%p1781 bra $L__BB0_129;
add.f32 %f5398, %f132, %f131;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_129:
add.s32 %r7805, %r12, 1;
setp.gt.s32 %p1772, %r7805, 14;
@%p1772 bra $L__BB0_158;
shl.b32 %r2981, %r12, 5;
mov.u32 %r2982, -32;
sub.s32 %r144, %r2982, %r2981;
setp.ge.s32 %p133, %r11, %r144;
@%p133 bra $L__BB0_143;
mul.f32 %f2447, %f5413, 0f3F22F983;
cvt.rni.s32.f32 %r7994, %f2447;
cvt.rn.f32.s32 %f2448, %r7994;
mov.f32 %f2449, 0fBFC90FDA;
fma.rn.f32 %f2450, %f2448, %f2449, %f5413;
mov.f32 %f2451, 0fB3A22168;
fma.rn.f32 %f2452, %f2448, %f2451, %f2450;
mov.f32 %f2453, 0fA7C234C5;
fma.rn.f32 %f5285, %f2448, %f2453, %f2452;
abs.f32 %f140, %f5413;
setp.ltu.f32 %p134, %f140, 0f47CE4780;
@%p134 bra $L__BB0_139;
setp.eq.f32 %p135, %f140, 0f7F800000;
@%p135 bra $L__BB0_138;
bra.uni $L__BB0_133;
$L__BB0_138:
mov.f32 %f2456, 0f00000000;
mul.rn.f32 %f5285, %f5413, %f2456;
mov.u32 %r7994, 0;
bra.uni $L__BB0_139;
$L__BB0_133:
mov.b32 %r146, %f5413;
shr.u32 %r2984, %r146, 23;
and.b32 %r2985, %r2984, 255;
shl.b32 %r2986, %r146, 8;
or.b32 %r148, %r2986, -2147483648;
mov.u64 %rd2517, 0;
mov.u32 %r7991, 0;
mov.u64 %rd708, __cudart_i2opi_f;
mov.u64 %rd2518, %rd2517;
$L__BB0_134:
.pragma "nounroll";
shl.b64 %rd707, %rd2517, 2;
add.s64 %rd709, %rd708, %rd707;
ld.global.nc.u32 %r2987, [%rd709];
mad.wide.u32 %rd710, %r2987, %r148, %rd2518;
shr.u64 %rd2518, %rd710, 32;
add.s64 %rd711, %rd1, %rd707;
st.local.u32 [%rd711], %rd710;
add.s32 %r7991, %r7991, 1;
cvt.s64.s32 %rd2517, %r7991;
setp.ne.s32 %p136, %r7991, 6;
@%p136 bra $L__BB0_134;
add.s32 %r7920, %r2985, -128;
mov.b32 %r7919, %f5413;
shr.u32 %r7918, %r7919, 23;
and.b32 %r7917, %r7918, 255;
add.s32 %r7916, %r7917, -128;
shr.u32 %r7915, %r7916, 5;
st.local.u32 [%rd4], %rd2518;
mov.u32 %r2988, 4;
sub.s32 %r152, %r2988, %r7915;
mov.u32 %r2989, 6;
sub.s32 %r2990, %r2989, %r7915;
mul.wide.s32 %rd712, %r2990, 4;
add.s64 %rd713, %rd1, %rd712;
ld.local.u32 %r7992, [%rd713];
ld.local.u32 %r7993, [%rd713+-4];
and.b32 %r155, %r7916, 31;
setp.eq.s32 %p137, %r155, 0;
@%p137 bra $L__BB0_137;
mov.u32 %r2991, 32;
sub.s32 %r2992, %r2991, %r155;
shr.u32 %r2993, %r7993, %r2992;
shl.b32 %r2994, %r7992, %r155;
add.s32 %r7992, %r2993, %r2994;
mul.wide.s32 %rd714, %r152, 4;
add.s64 %rd715, %rd1, %rd714;
ld.local.u32 %r2995, [%rd715];
shr.u32 %r2996, %r2995, %r2992;
shl.b32 %r2997, %r7993, %r155;
add.s32 %r7993, %r2996, %r2997;
$L__BB0_137:
mov.b32 %r7921, %f5413;
and.b32 %r2998, %r7921, -2147483648;
shr.u32 %r2999, %r7993, 30;
shl.b32 %r3000, %r7992, 2;
or.b32 %r3001, %r2999, %r3000;
shr.u32 %r3002, %r3001, 31;
shr.u32 %r3003, %r7992, 30;
add.s32 %r3004, %r3002, %r3003;
neg.s32 %r3005, %r3004;
setp.eq.s32 %p138, %r2998, 0;
selp.b32 %r7994, %r3004, %r3005, %p138;
setp.ne.s32 %p139, %r3002, 0;
xor.b32 %r3006, %r2998, -2147483648;
selp.b32 %r3007, %r3006, %r2998, %p139;
selp.b32 %r3008, -1, 0, %p139;
xor.b32 %r3009, %r3001, %r3008;
shl.b32 %r3010, %r7993, 2;
xor.b32 %r3011, %r3010, %r3008;
cvt.u64.u32 %rd716, %r3009;
cvt.u64.u32 %rd717, %r3011;
bfi.b64 %rd718, %rd716, %rd717, 32, 32;
cvt.rn.f64.s64 %fd13, %rd718;
mul.f64 %fd14, %fd13, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2454, %fd14;
setp.eq.s32 %p140, %r3007, 0;
neg.f32 %f2455, %f2454;
selp.f32 %f5285, %f2454, %f2455, %p140;
$L__BB0_139:
and.b32 %r162, %r7994, 1;
setp.eq.s32 %p141, %r162, 0;
selp.f32 %f144, %f5285, 0f3F800000, %p141;
mul.rn.f32 %f145, %f5285, %f5285;
mov.f32 %f5286, 0fB94D4153;
@%p141 bra $L__BB0_141;
mov.f32 %f2458, 0fBAB607ED;
mov.f32 %f2459, 0f37CBAC00;
fma.rn.f32 %f5286, %f2459, %f145, %f2458;
$L__BB0_141:
selp.f32 %f2460, 0f3C0885E4, 0f3D2AAABB, %p141;
fma.rn.f32 %f2461, %f5286, %f145, %f2460;
selp.f32 %f2462, 0fBE2AAAA8, 0fBEFFFFFF, %p141;
fma.rn.f32 %f2463, %f2461, %f145, %f2462;
mov.f32 %f2464, 0f00000000;
fma.rn.f32 %f2465, %f145, %f144, %f2464;
fma.rn.f32 %f5281, %f2463, %f2465, %f144;
and.b32 %r3013, %r7994, 2;
setp.eq.s32 %p143, %r3013, 0;
@%p143 bra $L__BB0_143;
mov.f32 %f2467, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f2467, %f2464;
$L__BB0_143:
shl.b32 %r7955, %r12, 5;
mov.u32 %r7954, -32;
sub.s32 %r7953, %r7954, %r7955;
setp.ge.s32 %p1788, %r11, %r7953;
@%p1788 bra $L__BB0_156;
mul.f32 %f2468, %f5405, 0f3F22F983;
cvt.rni.s32.f32 %r7998, %f2468;
cvt.rn.f32.s32 %f2469, %r7998;
mov.f32 %f2470, 0fBFC90FDA;
fma.rn.f32 %f2471, %f2469, %f2470, %f5405;
mov.f32 %f2472, 0fB3A22168;
fma.rn.f32 %f2473, %f2469, %f2472, %f2471;
mov.f32 %f2474, 0fA7C234C5;
fma.rn.f32 %f5289, %f2469, %f2474, %f2473;
abs.f32 %f153, %f5405;
setp.ltu.f32 %p145, %f153, 0f47CE4780;
@%p145 bra $L__BB0_152;
setp.eq.f32 %p146, %f153, 0f7F800000;
@%p146 bra $L__BB0_151;
bra.uni $L__BB0_146;
$L__BB0_151:
mov.f32 %f2477, 0f00000000;
mul.rn.f32 %f5289, %f5405, %f2477;
mov.u32 %r7998, 0;
bra.uni $L__BB0_152;
$L__BB0_146:
mov.b32 %r164, %f5405;
shr.u32 %r3015, %r164, 23;
and.b32 %r3016, %r3015, 255;
shl.b32 %r3017, %r164, 8;
or.b32 %r166, %r3017, -2147483648;
mov.u64 %rd2519, 0;
mov.u32 %r7995, 0;
mov.u64 %rd722, __cudart_i2opi_f;
mov.u64 %rd2520, %rd2519;
$L__BB0_147:
.pragma "nounroll";
shl.b64 %rd721, %rd2519, 2;
add.s64 %rd723, %rd722, %rd721;
ld.global.nc.u32 %r3018, [%rd723];
mad.wide.u32 %rd724, %r3018, %r166, %rd2520;
shr.u64 %rd2520, %rd724, 32;
add.s64 %rd725, %rd1, %rd721;
st.local.u32 [%rd725], %rd724;
add.s32 %r7995, %r7995, 1;
cvt.s64.s32 %rd2519, %r7995;
setp.ne.s32 %p147, %r7995, 6;
@%p147 bra $L__BB0_147;
add.s32 %r7930, %r3016, -128;
mov.b32 %r7929, %f5405;
shr.u32 %r7928, %r7929, 23;
and.b32 %r7927, %r7928, 255;
add.s32 %r7926, %r7927, -128;
shr.u32 %r7925, %r7926, 5;
st.local.u32 [%rd4], %rd2520;
mov.u32 %r3019, 4;
sub.s32 %r170, %r3019, %r7925;
mov.u32 %r3020, 6;
sub.s32 %r3021, %r3020, %r7925;
mul.wide.s32 %rd726, %r3021, 4;
add.s64 %rd727, %rd1, %rd726;
ld.local.u32 %r7996, [%rd727];
ld.local.u32 %r7997, [%rd727+-4];
and.b32 %r173, %r7926, 31;
setp.eq.s32 %p148, %r173, 0;
@%p148 bra $L__BB0_150;
mov.u32 %r3022, 32;
sub.s32 %r3023, %r3022, %r173;
shr.u32 %r3024, %r7997, %r3023;
shl.b32 %r3025, %r7996, %r173;
add.s32 %r7996, %r3024, %r3025;
mul.wide.s32 %rd728, %r170, 4;
add.s64 %rd729, %rd1, %rd728;
ld.local.u32 %r3026, [%rd729];
shr.u32 %r3027, %r3026, %r3023;
shl.b32 %r3028, %r7997, %r173;
add.s32 %r7997, %r3027, %r3028;
$L__BB0_150:
mov.b32 %r7931, %f5405;
and.b32 %r3029, %r7931, -2147483648;
shr.u32 %r3030, %r7997, 30;
shl.b32 %r3031, %r7996, 2;
or.b32 %r3032, %r3030, %r3031;
shr.u32 %r3033, %r3032, 31;
shr.u32 %r3034, %r7996, 30;
add.s32 %r3035, %r3033, %r3034;
neg.s32 %r3036, %r3035;
setp.eq.s32 %p149, %r3029, 0;
selp.b32 %r7998, %r3035, %r3036, %p149;
setp.ne.s32 %p150, %r3033, 0;
xor.b32 %r3037, %r3029, -2147483648;
selp.b32 %r3038, %r3037, %r3029, %p150;
selp.b32 %r3039, -1, 0, %p150;
xor.b32 %r3040, %r3032, %r3039;
shl.b32 %r3041, %r7997, 2;
xor.b32 %r3042, %r3041, %r3039;
cvt.u64.u32 %rd730, %r3040;
cvt.u64.u32 %rd731, %r3042;
bfi.b64 %rd732, %rd730, %rd731, 32, 32;
cvt.rn.f64.s64 %fd15, %rd732;
mul.f64 %fd16, %fd15, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2475, %fd16;
setp.eq.s32 %p151, %r3038, 0;
neg.f32 %f2476, %f2475;
selp.f32 %f5289, %f2475, %f2476, %p151;
$L__BB0_152:
add.s32 %r180, %r7998, 1;
and.b32 %r181, %r180, 1;
setp.eq.s32 %p152, %r181, 0;
selp.f32 %f157, %f5289, 0f3F800000, %p152;
mul.rn.f32 %f158, %f5289, %f5289;
mov.f32 %f5290, 0fB94D4153;
@%p152 bra $L__BB0_154;
mov.f32 %f2479, 0fBAB607ED;
mov.f32 %f2480, 0f37CBAC00;
fma.rn.f32 %f5290, %f2480, %f158, %f2479;
$L__BB0_154:
selp.f32 %f2481, 0f3C0885E4, 0f3D2AAABB, %p152;
fma.rn.f32 %f2482, %f5290, %f158, %f2481;
selp.f32 %f2483, 0fBE2AAAA8, 0fBEFFFFFF, %p152;
fma.rn.f32 %f2484, %f2482, %f158, %f2483;
mov.f32 %f2485, 0f00000000;
fma.rn.f32 %f2486, %f158, %f157, %f2485;
fma.rn.f32 %f5283, %f2484, %f2486, %f157;
and.b32 %r3044, %r180, 2;
setp.eq.s32 %p154, %r3044, 0;
@%p154 bra $L__BB0_156;
mov.f32 %f2488, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f2488, %f2485;
$L__BB0_156:
shl.b32 %r7937, %r12, 5;
mov.u32 %r7936, -32;
sub.s32 %r7935, %r7936, %r7937;
setp.lt.s32 %p1784, %r11, %r7935;
shl.b32 %r7934, %r12, 5;
mov.u32 %r7933, -32;
sub.s32 %r7932, %r7933, %r7934;
setp.ge.s32 %p1783, %r11, %r7932;
selp.f32 %f165, %f5283, %f5284, %p1784;
selp.f32 %f166, %f5281, %f5282, %p1784;
@%p1783 bra $L__BB0_158;
add.f32 %f5397, %f166, %f165;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_158:
add.s32 %r7777, %r12, 2;
setp.gt.s32 %p1763, %r7777, 14;
@%p1763 bra $L__BB0_187;
shl.b32 %r3046, %r12, 5;
neg.s32 %r182, %r3046;
setp.ge.s32 %p158, %r11, %r182;
@%p158 bra $L__BB0_172;
mul.f32 %f2491, %f5412, 0f3F22F983;
cvt.rni.s32.f32 %r8002, %f2491;
cvt.rn.f32.s32 %f2492, %r8002;
mov.f32 %f2493, 0fBFC90FDA;
fma.rn.f32 %f2494, %f2492, %f2493, %f5412;
mov.f32 %f2495, 0fB3A22168;
fma.rn.f32 %f2496, %f2492, %f2495, %f2494;
mov.f32 %f2497, 0fA7C234C5;
fma.rn.f32 %f5298, %f2492, %f2497, %f2496;
abs.f32 %f174, %f5412;
setp.ltu.f32 %p159, %f174, 0f47CE4780;
@%p159 bra $L__BB0_168;
setp.eq.f32 %p160, %f174, 0f7F800000;
@%p160 bra $L__BB0_167;
bra.uni $L__BB0_162;
$L__BB0_167:
mov.f32 %f2500, 0f00000000;
mul.rn.f32 %f5298, %f5412, %f2500;
mov.u32 %r8002, 0;
bra.uni $L__BB0_168;
$L__BB0_162:
mov.b32 %r184, %f5412;
shr.u32 %r3048, %r184, 23;
and.b32 %r3049, %r3048, 255;
shl.b32 %r3050, %r184, 8;
or.b32 %r186, %r3050, -2147483648;
mov.u64 %rd2521, 0;
mov.u32 %r7999, 0;
mov.u64 %rd736, __cudart_i2opi_f;
mov.u64 %rd2522, %rd2521;
$L__BB0_163:
.pragma "nounroll";
shl.b64 %rd735, %rd2521, 2;
add.s64 %rd737, %rd736, %rd735;
ld.global.nc.u32 %r3051, [%rd737];
mad.wide.u32 %rd738, %r3051, %r186, %rd2522;
shr.u64 %rd2522, %rd738, 32;
add.s64 %rd739, %rd1, %rd735;
st.local.u32 [%rd739], %rd738;
add.s32 %r7999, %r7999, 1;
cvt.s64.s32 %rd2521, %r7999;
setp.ne.s32 %p161, %r7999, 6;
@%p161 bra $L__BB0_163;
add.s32 %r7961, %r3049, -128;
mov.b32 %r7960, %f5412;
shr.u32 %r7959, %r7960, 23;
and.b32 %r7958, %r7959, 255;
add.s32 %r7957, %r7958, -128;
shr.u32 %r7956, %r7957, 5;
st.local.u32 [%rd4], %rd2522;
mov.u32 %r3052, 4;
sub.s32 %r190, %r3052, %r7956;
mov.u32 %r3053, 6;
sub.s32 %r3054, %r3053, %r7956;
mul.wide.s32 %rd740, %r3054, 4;
add.s64 %rd741, %rd1, %rd740;
ld.local.u32 %r8000, [%rd741];
ld.local.u32 %r8001, [%rd741+-4];
and.b32 %r193, %r7957, 31;
setp.eq.s32 %p162, %r193, 0;
@%p162 bra $L__BB0_166;
mov.u32 %r3055, 32;
sub.s32 %r3056, %r3055, %r193;
shr.u32 %r3057, %r8001, %r3056;
shl.b32 %r3058, %r8000, %r193;
add.s32 %r8000, %r3057, %r3058;
mul.wide.s32 %rd742, %r190, 4;
add.s64 %rd743, %rd1, %rd742;
ld.local.u32 %r3059, [%rd743];
shr.u32 %r3060, %r3059, %r3056;
shl.b32 %r3061, %r8001, %r193;
add.s32 %r8001, %r3060, %r3061;
$L__BB0_166:
mov.b32 %r7962, %f5412;
and.b32 %r3062, %r7962, -2147483648;
shr.u32 %r3063, %r8001, 30;
shl.b32 %r3064, %r8000, 2;
or.b32 %r3065, %r3063, %r3064;
shr.u32 %r3066, %r3065, 31;
shr.u32 %r3067, %r8000, 30;
add.s32 %r3068, %r3066, %r3067;
neg.s32 %r3069, %r3068;
setp.eq.s32 %p163, %r3062, 0;
selp.b32 %r8002, %r3068, %r3069, %p163;
setp.ne.s32 %p164, %r3066, 0;
xor.b32 %r3070, %r3062, -2147483648;
selp.b32 %r3071, %r3070, %r3062, %p164;
selp.b32 %r3072, -1, 0, %p164;
xor.b32 %r3073, %r3065, %r3072;
shl.b32 %r3074, %r8001, 2;
xor.b32 %r3075, %r3074, %r3072;
cvt.u64.u32 %rd744, %r3073;
cvt.u64.u32 %rd745, %r3075;
bfi.b64 %rd746, %rd744, %rd745, 32, 32;
cvt.rn.f64.s64 %fd17, %rd746;
mul.f64 %fd18, %fd17, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2498, %fd18;
setp.eq.s32 %p165, %r3071, 0;
neg.f32 %f2499, %f2498;
selp.f32 %f5298, %f2498, %f2499, %p165;
$L__BB0_168:
and.b32 %r200, %r8002, 1;
setp.eq.s32 %p166, %r200, 0;
selp.f32 %f178, %f5298, 0f3F800000, %p166;
mul.rn.f32 %f179, %f5298, %f5298;
mov.f32 %f5299, 0fB94D4153;
@%p166 bra $L__BB0_170;
mov.f32 %f2502, 0fBAB607ED;
mov.f32 %f2503, 0f37CBAC00;
fma.rn.f32 %f5299, %f2503, %f179, %f2502;
$L__BB0_170:
selp.f32 %f2504, 0f3C0885E4, 0f3D2AAABB, %p166;
fma.rn.f32 %f2505, %f5299, %f179, %f2504;
selp.f32 %f2506, 0fBE2AAAA8, 0fBEFFFFFF, %p166;
fma.rn.f32 %f2507, %f2505, %f179, %f2506;
mov.f32 %f2508, 0f00000000;
fma.rn.f32 %f2509, %f179, %f178, %f2508;
fma.rn.f32 %f5281, %f2507, %f2509, %f178;
and.b32 %r3077, %r8002, 2;
setp.eq.s32 %p168, %r3077, 0;
@%p168 bra $L__BB0_172;
mov.f32 %f2511, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f2511, %f2508;
$L__BB0_172:
shl.b32 %r7964, %r12, 5;
neg.s32 %r7963, %r7964;
setp.lt.s32 %p4, %r11, %r7963;
@%p158 bra $L__BB0_185;
mul.f32 %f2512, %f5404, 0f3F22F983;
cvt.rni.s32.f32 %r8006, %f2512;
cvt.rn.f32.s32 %f2513, %r8006;
mov.f32 %f2514, 0fBFC90FDA;
fma.rn.f32 %f2515, %f2513, %f2514, %f5404;
mov.f32 %f2516, 0fB3A22168;
fma.rn.f32 %f2517, %f2513, %f2516, %f2515;
mov.f32 %f2518, 0fA7C234C5;
fma.rn.f32 %f5302, %f2513, %f2518, %f2517;
abs.f32 %f187, %f5404;
setp.ltu.f32 %p170, %f187, 0f47CE4780;
@%p170 bra $L__BB0_181;
setp.eq.f32 %p171, %f187, 0f7F800000;
@%p171 bra $L__BB0_180;
bra.uni $L__BB0_175;
$L__BB0_180:
mov.f32 %f2521, 0f00000000;
mul.rn.f32 %f5302, %f5404, %f2521;
mov.u32 %r8006, 0;
bra.uni $L__BB0_181;
$L__BB0_175:
mov.b32 %r202, %f5404;
shr.u32 %r3079, %r202, 23;
and.b32 %r3080, %r3079, 255;
add.s32 %r203, %r3080, -128;
shl.b32 %r3081, %r202, 8;
or.b32 %r204, %r3081, -2147483648;
shr.u32 %r205, %r203, 5;
mov.u64 %rd2523, 0;
mov.u32 %r8003, 0;
mov.u64 %rd750, __cudart_i2opi_f;
mov.u64 %rd2524, %rd2523;
$L__BB0_176:
.pragma "nounroll";
shl.b64 %rd749, %rd2523, 2;
add.s64 %rd751, %rd750, %rd749;
ld.global.nc.u32 %r3082, [%rd751];
mad.wide.u32 %rd752, %r3082, %r204, %rd2524;
shr.u64 %rd2524, %rd752, 32;
add.s64 %rd753, %rd1, %rd749;
st.local.u32 [%rd753], %rd752;
add.s32 %r8003, %r8003, 1;
cvt.s64.s32 %rd2523, %r8003;
setp.ne.s32 %p172, %r8003, 6;
@%p172 bra $L__BB0_176;
st.local.u32 [%rd4], %rd2524;
mov.u32 %r3083, 4;
sub.s32 %r208, %r3083, %r205;
mov.u32 %r3084, 6;
sub.s32 %r3085, %r3084, %r205;
mul.wide.s32 %rd754, %r3085, 4;
add.s64 %rd755, %rd1, %rd754;
ld.local.u32 %r8004, [%rd755];
ld.local.u32 %r8005, [%rd755+-4];
and.b32 %r211, %r203, 31;
setp.eq.s32 %p173, %r211, 0;
@%p173 bra $L__BB0_179;
mov.u32 %r3086, 32;
sub.s32 %r3087, %r3086, %r211;
shr.u32 %r3088, %r8005, %r3087;
shl.b32 %r3089, %r8004, %r211;
add.s32 %r8004, %r3088, %r3089;
mul.wide.s32 %rd756, %r208, 4;
add.s64 %rd757, %rd1, %rd756;
ld.local.u32 %r3090, [%rd757];
shr.u32 %r3091, %r3090, %r3087;
shl.b32 %r3092, %r8005, %r211;
add.s32 %r8005, %r3091, %r3092;
$L__BB0_179:
and.b32 %r3093, %r202, -2147483648;
shr.u32 %r3094, %r8005, 30;
shl.b32 %r3095, %r8004, 2;
or.b32 %r3096, %r3094, %r3095;
shr.u32 %r3097, %r3096, 31;
shr.u32 %r3098, %r8004, 30;
add.s32 %r3099, %r3097, %r3098;
neg.s32 %r3100, %r3099;
setp.eq.s32 %p174, %r3093, 0;
selp.b32 %r8006, %r3099, %r3100, %p174;
setp.ne.s32 %p175, %r3097, 0;
xor.b32 %r3101, %r3093, -2147483648;
selp.b32 %r3102, %r3101, %r3093, %p175;
selp.b32 %r3103, -1, 0, %p175;
xor.b32 %r3104, %r3096, %r3103;
shl.b32 %r3105, %r8005, 2;
xor.b32 %r3106, %r3105, %r3103;
cvt.u64.u32 %rd758, %r3104;
cvt.u64.u32 %rd759, %r3106;
bfi.b64 %rd760, %rd758, %rd759, 32, 32;
cvt.rn.f64.s64 %fd19, %rd760;
mul.f64 %fd20, %fd19, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2519, %fd20;
setp.eq.s32 %p176, %r3102, 0;
neg.f32 %f2520, %f2519;
selp.f32 %f5302, %f2519, %f2520, %p176;
$L__BB0_181:
add.s32 %r218, %r8006, 1;
and.b32 %r219, %r218, 1;
setp.eq.s32 %p177, %r219, 0;
selp.f32 %f191, %f5302, 0f3F800000, %p177;
mul.rn.f32 %f192, %f5302, %f5302;
mov.f32 %f5303, 0fB94D4153;
@%p177 bra $L__BB0_183;
mov.f32 %f2523, 0fBAB607ED;
mov.f32 %f2524, 0f37CBAC00;
fma.rn.f32 %f5303, %f2524, %f192, %f2523;
$L__BB0_183:
selp.f32 %f2525, 0f3C0885E4, 0f3D2AAABB, %p177;
fma.rn.f32 %f2526, %f5303, %f192, %f2525;
selp.f32 %f2527, 0fBE2AAAA8, 0fBEFFFFFF, %p177;
fma.rn.f32 %f2528, %f2526, %f192, %f2527;
mov.f32 %f2529, 0f00000000;
fma.rn.f32 %f2530, %f192, %f191, %f2529;
fma.rn.f32 %f5283, %f2528, %f2530, %f191;
and.b32 %r3108, %r218, 2;
setp.eq.s32 %p179, %r3108, 0;
@%p179 bra $L__BB0_185;
mov.f32 %f2532, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f2532, %f2529;
$L__BB0_185:
selp.f32 %f199, %f5283, %f5284, %p4;
selp.f32 %f200, %f5281, %f5282, %p4;
@%p158 bra $L__BB0_187;
add.f32 %f5396, %f200, %f199;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_187:
add.s32 %r7778, %r12, 2;
setp.gt.s32 %p1764, %r7778, 14;
@%p1764 bra $L__BB0_216;
shl.b32 %r3110, %r12, 5;
mov.u32 %r3111, -32;
sub.s32 %r220, %r3111, %r3110;
setp.ge.s32 %p183, %r11, %r220;
@%p183 bra $L__BB0_201;
mul.f32 %f2535, %f5411, 0f3F22F983;
cvt.rni.s32.f32 %r8010, %f2535;
cvt.rn.f32.s32 %f2536, %r8010;
mov.f32 %f2537, 0fBFC90FDA;
fma.rn.f32 %f2538, %f2536, %f2537, %f5411;
mov.f32 %f2539, 0fB3A22168;
fma.rn.f32 %f2540, %f2536, %f2539, %f2538;
mov.f32 %f2541, 0fA7C234C5;
fma.rn.f32 %f5311, %f2536, %f2541, %f2540;
abs.f32 %f208, %f5411;
setp.ltu.f32 %p184, %f208, 0f47CE4780;
@%p184 bra $L__BB0_197;
setp.eq.f32 %p185, %f208, 0f7F800000;
@%p185 bra $L__BB0_196;
bra.uni $L__BB0_191;
$L__BB0_196:
mov.f32 %f2544, 0f00000000;
mul.rn.f32 %f5311, %f5411, %f2544;
mov.u32 %r8010, 0;
bra.uni $L__BB0_197;
$L__BB0_191:
mov.b32 %r222, %f5411;
shr.u32 %r3113, %r222, 23;
and.b32 %r3114, %r3113, 255;
add.s32 %r223, %r3114, -128;
shl.b32 %r3115, %r222, 8;
or.b32 %r224, %r3115, -2147483648;
shr.u32 %r225, %r223, 5;
mov.u64 %rd2525, 0;
mov.u32 %r8007, 0;
mov.u64 %rd764, __cudart_i2opi_f;
mov.u64 %rd2526, %rd2525;
$L__BB0_192:
.pragma "nounroll";
shl.b64 %rd763, %rd2525, 2;
add.s64 %rd765, %rd764, %rd763;
ld.global.nc.u32 %r3116, [%rd765];
mad.wide.u32 %rd766, %r3116, %r224, %rd2526;
shr.u64 %rd2526, %rd766, 32;
add.s64 %rd767, %rd1, %rd763;
st.local.u32 [%rd767], %rd766;
add.s32 %r8007, %r8007, 1;
cvt.s64.s32 %rd2525, %r8007;
setp.ne.s32 %p186, %r8007, 6;
@%p186 bra $L__BB0_192;
st.local.u32 [%rd4], %rd2526;
mov.u32 %r3117, 4;
sub.s32 %r228, %r3117, %r225;
mov.u32 %r3118, 6;
sub.s32 %r3119, %r3118, %r225;
mul.wide.s32 %rd768, %r3119, 4;
add.s64 %rd769, %rd1, %rd768;
ld.local.u32 %r8008, [%rd769];
ld.local.u32 %r8009, [%rd769+-4];
and.b32 %r231, %r223, 31;
setp.eq.s32 %p187, %r231, 0;
@%p187 bra $L__BB0_195;
mov.u32 %r3120, 32;
sub.s32 %r3121, %r3120, %r231;
shr.u32 %r3122, %r8009, %r3121;
shl.b32 %r3123, %r8008, %r231;
add.s32 %r8008, %r3122, %r3123;
mul.wide.s32 %rd770, %r228, 4;
add.s64 %rd771, %rd1, %rd770;
ld.local.u32 %r3124, [%rd771];
shr.u32 %r3125, %r3124, %r3121;
shl.b32 %r3126, %r8009, %r231;
add.s32 %r8009, %r3125, %r3126;
$L__BB0_195:
and.b32 %r3127, %r222, -2147483648;
shr.u32 %r3128, %r8009, 30;
shl.b32 %r3129, %r8008, 2;
or.b32 %r3130, %r3128, %r3129;
shr.u32 %r3131, %r3130, 31;
shr.u32 %r3132, %r8008, 30;
add.s32 %r3133, %r3131, %r3132;
neg.s32 %r3134, %r3133;
setp.eq.s32 %p188, %r3127, 0;
selp.b32 %r8010, %r3133, %r3134, %p188;
setp.ne.s32 %p189, %r3131, 0;
xor.b32 %r3135, %r3127, -2147483648;
selp.b32 %r3136, %r3135, %r3127, %p189;
selp.b32 %r3137, -1, 0, %p189;
xor.b32 %r3138, %r3130, %r3137;
shl.b32 %r3139, %r8009, 2;
xor.b32 %r3140, %r3139, %r3137;
cvt.u64.u32 %rd772, %r3138;
cvt.u64.u32 %rd773, %r3140;
bfi.b64 %rd774, %rd772, %rd773, 32, 32;
cvt.rn.f64.s64 %fd21, %rd774;
mul.f64 %fd22, %fd21, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2542, %fd22;
setp.eq.s32 %p190, %r3136, 0;
neg.f32 %f2543, %f2542;
selp.f32 %f5311, %f2542, %f2543, %p190;
$L__BB0_197:
and.b32 %r238, %r8010, 1;
setp.eq.s32 %p191, %r238, 0;
selp.f32 %f212, %f5311, 0f3F800000, %p191;
mul.rn.f32 %f213, %f5311, %f5311;
mov.f32 %f5312, 0fB94D4153;
@%p191 bra $L__BB0_199;
mov.f32 %f2546, 0fBAB607ED;
mov.f32 %f2547, 0f37CBAC00;
fma.rn.f32 %f5312, %f2547, %f213, %f2546;
$L__BB0_199:
selp.f32 %f2548, 0f3C0885E4, 0f3D2AAABB, %p191;
fma.rn.f32 %f2549, %f5312, %f213, %f2548;
selp.f32 %f2550, 0fBE2AAAA8, 0fBEFFFFFF, %p191;
fma.rn.f32 %f2551, %f2549, %f213, %f2550;
mov.f32 %f2552, 0f00000000;
fma.rn.f32 %f2553, %f213, %f212, %f2552;
fma.rn.f32 %f5281, %f2551, %f2553, %f212;
and.b32 %r3142, %r8010, 2;
setp.eq.s32 %p193, %r3142, 0;
@%p193 bra $L__BB0_201;
mov.f32 %f2555, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f2555, %f2552;
$L__BB0_201:
setp.lt.s32 %p5, %r11, %r220;
@%p183 bra $L__BB0_214;
mul.f32 %f2556, %f5403, 0f3F22F983;
cvt.rni.s32.f32 %r8014, %f2556;
cvt.rn.f32.s32 %f2557, %r8014;
mov.f32 %f2558, 0fBFC90FDA;
fma.rn.f32 %f2559, %f2557, %f2558, %f5403;
mov.f32 %f2560, 0fB3A22168;
fma.rn.f32 %f2561, %f2557, %f2560, %f2559;
mov.f32 %f2562, 0fA7C234C5;
fma.rn.f32 %f5315, %f2557, %f2562, %f2561;
abs.f32 %f221, %f5403;
setp.ltu.f32 %p195, %f221, 0f47CE4780;
@%p195 bra $L__BB0_210;
setp.eq.f32 %p196, %f221, 0f7F800000;
@%p196 bra $L__BB0_209;
bra.uni $L__BB0_204;
$L__BB0_209:
mov.f32 %f2565, 0f00000000;
mul.rn.f32 %f5315, %f5403, %f2565;
mov.u32 %r8014, 0;
bra.uni $L__BB0_210;
$L__BB0_204:
mov.b32 %r240, %f5403;
shr.u32 %r3144, %r240, 23;
and.b32 %r3145, %r3144, 255;
add.s32 %r241, %r3145, -128;
shl.b32 %r3146, %r240, 8;
or.b32 %r242, %r3146, -2147483648;
shr.u32 %r243, %r241, 5;
mov.u64 %rd2527, 0;
mov.u32 %r8011, 0;
mov.u64 %rd778, __cudart_i2opi_f;
mov.u64 %rd2528, %rd2527;
$L__BB0_205:
.pragma "nounroll";
shl.b64 %rd777, %rd2527, 2;
add.s64 %rd779, %rd778, %rd777;
ld.global.nc.u32 %r3147, [%rd779];
mad.wide.u32 %rd780, %r3147, %r242, %rd2528;
shr.u64 %rd2528, %rd780, 32;
add.s64 %rd781, %rd1, %rd777;
st.local.u32 [%rd781], %rd780;
add.s32 %r8011, %r8011, 1;
cvt.s64.s32 %rd2527, %r8011;
setp.ne.s32 %p197, %r8011, 6;
@%p197 bra $L__BB0_205;
st.local.u32 [%rd4], %rd2528;
mov.u32 %r3148, 4;
sub.s32 %r246, %r3148, %r243;
mov.u32 %r3149, 6;
sub.s32 %r3150, %r3149, %r243;
mul.wide.s32 %rd782, %r3150, 4;
add.s64 %rd783, %rd1, %rd782;
ld.local.u32 %r8012, [%rd783];
ld.local.u32 %r8013, [%rd783+-4];
and.b32 %r249, %r241, 31;
setp.eq.s32 %p198, %r249, 0;
@%p198 bra $L__BB0_208;
mov.u32 %r3151, 32;
sub.s32 %r3152, %r3151, %r249;
shr.u32 %r3153, %r8013, %r3152;
shl.b32 %r3154, %r8012, %r249;
add.s32 %r8012, %r3153, %r3154;
mul.wide.s32 %rd784, %r246, 4;
add.s64 %rd785, %rd1, %rd784;
ld.local.u32 %r3155, [%rd785];
shr.u32 %r3156, %r3155, %r3152;
shl.b32 %r3157, %r8013, %r249;
add.s32 %r8013, %r3156, %r3157;
$L__BB0_208:
and.b32 %r3158, %r240, -2147483648;
shr.u32 %r3159, %r8013, 30;
shl.b32 %r3160, %r8012, 2;
or.b32 %r3161, %r3159, %r3160;
shr.u32 %r3162, %r3161, 31;
shr.u32 %r3163, %r8012, 30;
add.s32 %r3164, %r3162, %r3163;
neg.s32 %r3165, %r3164;
setp.eq.s32 %p199, %r3158, 0;
selp.b32 %r8014, %r3164, %r3165, %p199;
setp.ne.s32 %p200, %r3162, 0;
xor.b32 %r3166, %r3158, -2147483648;
selp.b32 %r3167, %r3166, %r3158, %p200;
selp.b32 %r3168, -1, 0, %p200;
xor.b32 %r3169, %r3161, %r3168;
shl.b32 %r3170, %r8013, 2;
xor.b32 %r3171, %r3170, %r3168;
cvt.u64.u32 %rd786, %r3169;
cvt.u64.u32 %rd787, %r3171;
bfi.b64 %rd788, %rd786, %rd787, 32, 32;
cvt.rn.f64.s64 %fd23, %rd788;
mul.f64 %fd24, %fd23, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2563, %fd24;
setp.eq.s32 %p201, %r3167, 0;
neg.f32 %f2564, %f2563;
selp.f32 %f5315, %f2563, %f2564, %p201;
$L__BB0_210:
add.s32 %r256, %r8014, 1;
and.b32 %r257, %r256, 1;
setp.eq.s32 %p202, %r257, 0;
selp.f32 %f225, %f5315, 0f3F800000, %p202;
mul.rn.f32 %f226, %f5315, %f5315;
mov.f32 %f5316, 0fB94D4153;
@%p202 bra $L__BB0_212;
mov.f32 %f2567, 0fBAB607ED;
mov.f32 %f2568, 0f37CBAC00;
fma.rn.f32 %f5316, %f2568, %f226, %f2567;
$L__BB0_212:
selp.f32 %f2569, 0f3C0885E4, 0f3D2AAABB, %p202;
fma.rn.f32 %f2570, %f5316, %f226, %f2569;
selp.f32 %f2571, 0fBE2AAAA8, 0fBEFFFFFF, %p202;
fma.rn.f32 %f2572, %f2570, %f226, %f2571;
mov.f32 %f2573, 0f00000000;
fma.rn.f32 %f2574, %f226, %f225, %f2573;
fma.rn.f32 %f5283, %f2572, %f2574, %f225;
and.b32 %r3173, %r256, 2;
setp.eq.s32 %p204, %r3173, 0;
@%p204 bra $L__BB0_214;
mov.f32 %f2576, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f2576, %f2573;
$L__BB0_214:
selp.f32 %f233, %f5283, %f5284, %p5;
selp.f32 %f234, %f5281, %f5282, %p5;
@%p183 bra $L__BB0_216;
add.f32 %f5395, %f234, %f233;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_216:
add.s32 %r7779, %r12, 3;
setp.gt.s32 %p1765, %r7779, 14;
@%p1765 bra $L__BB0_245;
shl.b32 %r3175, %r12, 5;
neg.s32 %r258, %r3175;
setp.ge.s32 %p208, %r11, %r258;
@%p208 bra $L__BB0_230;
mul.f32 %f2579, %f5410, 0f3F22F983;
cvt.rni.s32.f32 %r8018, %f2579;
cvt.rn.f32.s32 %f2580, %r8018;
mov.f32 %f2581, 0fBFC90FDA;
fma.rn.f32 %f2582, %f2580, %f2581, %f5410;
mov.f32 %f2583, 0fB3A22168;
fma.rn.f32 %f2584, %f2580, %f2583, %f2582;
mov.f32 %f2585, 0fA7C234C5;
fma.rn.f32 %f5324, %f2580, %f2585, %f2584;
abs.f32 %f242, %f5410;
setp.ltu.f32 %p209, %f242, 0f47CE4780;
@%p209 bra $L__BB0_226;
setp.eq.f32 %p210, %f242, 0f7F800000;
@%p210 bra $L__BB0_225;
bra.uni $L__BB0_220;
$L__BB0_225:
mov.f32 %f2588, 0f00000000;
mul.rn.f32 %f5324, %f5410, %f2588;
mov.u32 %r8018, 0;
bra.uni $L__BB0_226;
$L__BB0_220:
mov.b32 %r260, %f5410;
shr.u32 %r3177, %r260, 23;
and.b32 %r3178, %r3177, 255;
add.s32 %r261, %r3178, -128;
shl.b32 %r3179, %r260, 8;
or.b32 %r262, %r3179, -2147483648;
shr.u32 %r263, %r261, 5;
mov.u64 %rd2529, 0;
mov.u32 %r8015, 0;
mov.u64 %rd792, __cudart_i2opi_f;
mov.u64 %rd2530, %rd2529;
$L__BB0_221:
.pragma "nounroll";
shl.b64 %rd791, %rd2529, 2;
add.s64 %rd793, %rd792, %rd791;
ld.global.nc.u32 %r3180, [%rd793];
mad.wide.u32 %rd794, %r3180, %r262, %rd2530;
shr.u64 %rd2530, %rd794, 32;
add.s64 %rd795, %rd1, %rd791;
st.local.u32 [%rd795], %rd794;
add.s32 %r8015, %r8015, 1;
cvt.s64.s32 %rd2529, %r8015;
setp.ne.s32 %p211, %r8015, 6;
@%p211 bra $L__BB0_221;
st.local.u32 [%rd4], %rd2530;
mov.u32 %r3181, 4;
sub.s32 %r266, %r3181, %r263;
mov.u32 %r3182, 6;
sub.s32 %r3183, %r3182, %r263;
mul.wide.s32 %rd796, %r3183, 4;
add.s64 %rd797, %rd1, %rd796;
ld.local.u32 %r8016, [%rd797];
ld.local.u32 %r8017, [%rd797+-4];
and.b32 %r269, %r261, 31;
setp.eq.s32 %p212, %r269, 0;
@%p212 bra $L__BB0_224;
mov.u32 %r3184, 32;
sub.s32 %r3185, %r3184, %r269;
shr.u32 %r3186, %r8017, %r3185;
shl.b32 %r3187, %r8016, %r269;
add.s32 %r8016, %r3186, %r3187;
mul.wide.s32 %rd798, %r266, 4;
add.s64 %rd799, %rd1, %rd798;
ld.local.u32 %r3188, [%rd799];
shr.u32 %r3189, %r3188, %r3185;
shl.b32 %r3190, %r8017, %r269;
add.s32 %r8017, %r3189, %r3190;
$L__BB0_224:
and.b32 %r3191, %r260, -2147483648;
shr.u32 %r3192, %r8017, 30;
shl.b32 %r3193, %r8016, 2;
or.b32 %r3194, %r3192, %r3193;
shr.u32 %r3195, %r3194, 31;
shr.u32 %r3196, %r8016, 30;
add.s32 %r3197, %r3195, %r3196;
neg.s32 %r3198, %r3197;
setp.eq.s32 %p213, %r3191, 0;
selp.b32 %r8018, %r3197, %r3198, %p213;
setp.ne.s32 %p214, %r3195, 0;
xor.b32 %r3199, %r3191, -2147483648;
selp.b32 %r3200, %r3199, %r3191, %p214;
selp.b32 %r3201, -1, 0, %p214;
xor.b32 %r3202, %r3194, %r3201;
shl.b32 %r3203, %r8017, 2;
xor.b32 %r3204, %r3203, %r3201;
cvt.u64.u32 %rd800, %r3202;
cvt.u64.u32 %rd801, %r3204;
bfi.b64 %rd802, %rd800, %rd801, 32, 32;
cvt.rn.f64.s64 %fd25, %rd802;
mul.f64 %fd26, %fd25, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2586, %fd26;
setp.eq.s32 %p215, %r3200, 0;
neg.f32 %f2587, %f2586;
selp.f32 %f5324, %f2586, %f2587, %p215;
$L__BB0_226:
and.b32 %r276, %r8018, 1;
setp.eq.s32 %p216, %r276, 0;
selp.f32 %f246, %f5324, 0f3F800000, %p216;
mul.rn.f32 %f247, %f5324, %f5324;
mov.f32 %f5325, 0fB94D4153;
@%p216 bra $L__BB0_228;
mov.f32 %f2590, 0fBAB607ED;
mov.f32 %f2591, 0f37CBAC00;
fma.rn.f32 %f5325, %f2591, %f247, %f2590;
$L__BB0_228:
selp.f32 %f2592, 0f3C0885E4, 0f3D2AAABB, %p216;
fma.rn.f32 %f2593, %f5325, %f247, %f2592;
selp.f32 %f2594, 0fBE2AAAA8, 0fBEFFFFFF, %p216;
fma.rn.f32 %f2595, %f2593, %f247, %f2594;
mov.f32 %f2596, 0f00000000;
fma.rn.f32 %f2597, %f247, %f246, %f2596;
fma.rn.f32 %f5281, %f2595, %f2597, %f246;
and.b32 %r3206, %r8018, 2;
setp.eq.s32 %p218, %r3206, 0;
@%p218 bra $L__BB0_230;
mov.f32 %f2599, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f2599, %f2596;
$L__BB0_230:
setp.lt.s32 %p6, %r11, %r258;
@%p208 bra $L__BB0_243;
mul.f32 %f2600, %f5402, 0f3F22F983;
cvt.rni.s32.f32 %r8022, %f2600;
cvt.rn.f32.s32 %f2601, %r8022;
mov.f32 %f2602, 0fBFC90FDA;
fma.rn.f32 %f2603, %f2601, %f2602, %f5402;
mov.f32 %f2604, 0fB3A22168;
fma.rn.f32 %f2605, %f2601, %f2604, %f2603;
mov.f32 %f2606, 0fA7C234C5;
fma.rn.f32 %f5328, %f2601, %f2606, %f2605;
abs.f32 %f255, %f5402;
setp.ltu.f32 %p220, %f255, 0f47CE4780;
@%p220 bra $L__BB0_239;
setp.eq.f32 %p221, %f255, 0f7F800000;
@%p221 bra $L__BB0_238;
bra.uni $L__BB0_233;
$L__BB0_238:
mov.f32 %f2609, 0f00000000;
mul.rn.f32 %f5328, %f5402, %f2609;
mov.u32 %r8022, 0;
bra.uni $L__BB0_239;
$L__BB0_233:
mov.b32 %r278, %f5402;
shr.u32 %r3208, %r278, 23;
and.b32 %r3209, %r3208, 255;
add.s32 %r279, %r3209, -128;
shl.b32 %r3210, %r278, 8;
or.b32 %r280, %r3210, -2147483648;
shr.u32 %r281, %r279, 5;
mov.u64 %rd2531, 0;
mov.u32 %r8019, 0;
mov.u64 %rd806, __cudart_i2opi_f;
mov.u64 %rd2532, %rd2531;
$L__BB0_234:
.pragma "nounroll";
shl.b64 %rd805, %rd2531, 2;
add.s64 %rd807, %rd806, %rd805;
ld.global.nc.u32 %r3211, [%rd807];
mad.wide.u32 %rd808, %r3211, %r280, %rd2532;
shr.u64 %rd2532, %rd808, 32;
add.s64 %rd809, %rd1, %rd805;
st.local.u32 [%rd809], %rd808;
add.s32 %r8019, %r8019, 1;
cvt.s64.s32 %rd2531, %r8019;
setp.ne.s32 %p222, %r8019, 6;
@%p222 bra $L__BB0_234;
st.local.u32 [%rd4], %rd2532;
mov.u32 %r3212, 4;
sub.s32 %r284, %r3212, %r281;
mov.u32 %r3213, 6;
sub.s32 %r3214, %r3213, %r281;
mul.wide.s32 %rd810, %r3214, 4;
add.s64 %rd811, %rd1, %rd810;
ld.local.u32 %r8020, [%rd811];
ld.local.u32 %r8021, [%rd811+-4];
and.b32 %r287, %r279, 31;
setp.eq.s32 %p223, %r287, 0;
@%p223 bra $L__BB0_237;
mov.u32 %r3215, 32;
sub.s32 %r3216, %r3215, %r287;
shr.u32 %r3217, %r8021, %r3216;
shl.b32 %r3218, %r8020, %r287;
add.s32 %r8020, %r3217, %r3218;
mul.wide.s32 %rd812, %r284, 4;
add.s64 %rd813, %rd1, %rd812;
ld.local.u32 %r3219, [%rd813];
shr.u32 %r3220, %r3219, %r3216;
shl.b32 %r3221, %r8021, %r287;
add.s32 %r8021, %r3220, %r3221;
$L__BB0_237:
and.b32 %r3222, %r278, -2147483648;
shr.u32 %r3223, %r8021, 30;
shl.b32 %r3224, %r8020, 2;
or.b32 %r3225, %r3223, %r3224;
shr.u32 %r3226, %r3225, 31;
shr.u32 %r3227, %r8020, 30;
add.s32 %r3228, %r3226, %r3227;
neg.s32 %r3229, %r3228;
setp.eq.s32 %p224, %r3222, 0;
selp.b32 %r8022, %r3228, %r3229, %p224;
setp.ne.s32 %p225, %r3226, 0;
xor.b32 %r3230, %r3222, -2147483648;
selp.b32 %r3231, %r3230, %r3222, %p225;
selp.b32 %r3232, -1, 0, %p225;
xor.b32 %r3233, %r3225, %r3232;
shl.b32 %r3234, %r8021, 2;
xor.b32 %r3235, %r3234, %r3232;
cvt.u64.u32 %rd814, %r3233;
cvt.u64.u32 %rd815, %r3235;
bfi.b64 %rd816, %rd814, %rd815, 32, 32;
cvt.rn.f64.s64 %fd27, %rd816;
mul.f64 %fd28, %fd27, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2607, %fd28;
setp.eq.s32 %p226, %r3231, 0;
neg.f32 %f2608, %f2607;
selp.f32 %f5328, %f2607, %f2608, %p226;
$L__BB0_239:
add.s32 %r294, %r8022, 1;
and.b32 %r295, %r294, 1;
setp.eq.s32 %p227, %r295, 0;
selp.f32 %f259, %f5328, 0f3F800000, %p227;
mul.rn.f32 %f260, %f5328, %f5328;
mov.f32 %f5329, 0fB94D4153;
@%p227 bra $L__BB0_241;
mov.f32 %f2611, 0fBAB607ED;
mov.f32 %f2612, 0f37CBAC00;
fma.rn.f32 %f5329, %f2612, %f260, %f2611;
$L__BB0_241:
selp.f32 %f2613, 0f3C0885E4, 0f3D2AAABB, %p227;
fma.rn.f32 %f2614, %f5329, %f260, %f2613;
selp.f32 %f2615, 0fBE2AAAA8, 0fBEFFFFFF, %p227;
fma.rn.f32 %f2616, %f2614, %f260, %f2615;
mov.f32 %f2617, 0f00000000;
fma.rn.f32 %f2618, %f260, %f259, %f2617;
fma.rn.f32 %f5283, %f2616, %f2618, %f259;
and.b32 %r3237, %r294, 2;
setp.eq.s32 %p229, %r3237, 0;
@%p229 bra $L__BB0_243;
mov.f32 %f2620, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f2620, %f2617;
$L__BB0_243:
selp.f32 %f267, %f5283, %f5284, %p6;
selp.f32 %f268, %f5281, %f5282, %p6;
@%p208 bra $L__BB0_245;
add.f32 %f5394, %f268, %f267;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_245:
add.s32 %r7780, %r12, 3;
setp.gt.s32 %p1766, %r7780, 14;
@%p1766 bra $L__BB0_467;
shl.b32 %r3239, %r12, 5;
mov.u32 %r3240, -32;
sub.s32 %r296, %r3240, %r3239;
setp.ge.s32 %p233, %r11, %r296;
@%p233 bra $L__BB0_259;
mul.f32 %f2623, %f5409, 0f3F22F983;
cvt.rni.s32.f32 %r8026, %f2623;
cvt.rn.f32.s32 %f2624, %r8026;
mov.f32 %f2625, 0fBFC90FDA;
fma.rn.f32 %f2626, %f2624, %f2625, %f5409;
mov.f32 %f2627, 0fB3A22168;
fma.rn.f32 %f2628, %f2624, %f2627, %f2626;
mov.f32 %f2629, 0fA7C234C5;
fma.rn.f32 %f5337, %f2624, %f2629, %f2628;
abs.f32 %f276, %f5409;
setp.ltu.f32 %p234, %f276, 0f47CE4780;
@%p234 bra $L__BB0_255;
setp.eq.f32 %p235, %f276, 0f7F800000;
@%p235 bra $L__BB0_254;
bra.uni $L__BB0_249;
$L__BB0_254:
mov.f32 %f2632, 0f00000000;
mul.rn.f32 %f5337, %f5409, %f2632;
mov.u32 %r8026, 0;
bra.uni $L__BB0_255;
$L__BB0_249:
mov.b32 %r298, %f5409;
shr.u32 %r3242, %r298, 23;
and.b32 %r3243, %r3242, 255;
add.s32 %r299, %r3243, -128;
shl.b32 %r3244, %r298, 8;
or.b32 %r300, %r3244, -2147483648;
shr.u32 %r301, %r299, 5;
mov.u64 %rd2533, 0;
mov.u32 %r8023, 0;
mov.u64 %rd820, __cudart_i2opi_f;
mov.u64 %rd2534, %rd2533;
$L__BB0_250:
.pragma "nounroll";
shl.b64 %rd819, %rd2533, 2;
add.s64 %rd821, %rd820, %rd819;
ld.global.nc.u32 %r3245, [%rd821];
mad.wide.u32 %rd822, %r3245, %r300, %rd2534;
shr.u64 %rd2534, %rd822, 32;
add.s64 %rd823, %rd1, %rd819;
st.local.u32 [%rd823], %rd822;
add.s32 %r8023, %r8023, 1;
cvt.s64.s32 %rd2533, %r8023;
setp.ne.s32 %p236, %r8023, 6;
@%p236 bra $L__BB0_250;
st.local.u32 [%rd4], %rd2534;
mov.u32 %r3246, 4;
sub.s32 %r304, %r3246, %r301;
mov.u32 %r3247, 6;
sub.s32 %r3248, %r3247, %r301;
mul.wide.s32 %rd824, %r3248, 4;
add.s64 %rd825, %rd1, %rd824;
ld.local.u32 %r8024, [%rd825];
ld.local.u32 %r8025, [%rd825+-4];
and.b32 %r307, %r299, 31;
setp.eq.s32 %p237, %r307, 0;
@%p237 bra $L__BB0_253;
mov.u32 %r3249, 32;
sub.s32 %r3250, %r3249, %r307;
shr.u32 %r3251, %r8025, %r3250;
shl.b32 %r3252, %r8024, %r307;
add.s32 %r8024, %r3251, %r3252;
mul.wide.s32 %rd826, %r304, 4;
add.s64 %rd827, %rd1, %rd826;
ld.local.u32 %r3253, [%rd827];
shr.u32 %r3254, %r3253, %r3250;
shl.b32 %r3255, %r8025, %r307;
add.s32 %r8025, %r3254, %r3255;
$L__BB0_253:
and.b32 %r3256, %r298, -2147483648;
shr.u32 %r3257, %r8025, 30;
shl.b32 %r3258, %r8024, 2;
or.b32 %r3259, %r3257, %r3258;
shr.u32 %r3260, %r3259, 31;
shr.u32 %r3261, %r8024, 30;
add.s32 %r3262, %r3260, %r3261;
neg.s32 %r3263, %r3262;
setp.eq.s32 %p238, %r3256, 0;
selp.b32 %r8026, %r3262, %r3263, %p238;
setp.ne.s32 %p239, %r3260, 0;
xor.b32 %r3264, %r3256, -2147483648;
selp.b32 %r3265, %r3264, %r3256, %p239;
selp.b32 %r3266, -1, 0, %p239;
xor.b32 %r3267, %r3259, %r3266;
shl.b32 %r3268, %r8025, 2;
xor.b32 %r3269, %r3268, %r3266;
cvt.u64.u32 %rd828, %r3267;
cvt.u64.u32 %rd829, %r3269;
bfi.b64 %rd830, %rd828, %rd829, 32, 32;
cvt.rn.f64.s64 %fd29, %rd830;
mul.f64 %fd30, %fd29, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2630, %fd30;
setp.eq.s32 %p240, %r3265, 0;
neg.f32 %f2631, %f2630;
selp.f32 %f5337, %f2630, %f2631, %p240;
$L__BB0_255:
and.b32 %r314, %r8026, 1;
setp.eq.s32 %p241, %r314, 0;
selp.f32 %f280, %f5337, 0f3F800000, %p241;
mul.rn.f32 %f281, %f5337, %f5337;
mov.f32 %f5338, 0fB94D4153;
@%p241 bra $L__BB0_257;
mov.f32 %f2634, 0fBAB607ED;
mov.f32 %f2635, 0f37CBAC00;
fma.rn.f32 %f5338, %f2635, %f281, %f2634;
$L__BB0_257:
selp.f32 %f2636, 0f3C0885E4, 0f3D2AAABB, %p241;
fma.rn.f32 %f2637, %f5338, %f281, %f2636;
selp.f32 %f2638, 0fBE2AAAA8, 0fBEFFFFFF, %p241;
fma.rn.f32 %f2639, %f2637, %f281, %f2638;
mov.f32 %f2640, 0f00000000;
fma.rn.f32 %f2641, %f281, %f280, %f2640;
fma.rn.f32 %f5281, %f2639, %f2641, %f280;
and.b32 %r3271, %r8026, 2;
setp.eq.s32 %p243, %r3271, 0;
@%p243 bra $L__BB0_259;
mov.f32 %f2643, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f2643, %f2640;
$L__BB0_259:
setp.lt.s32 %p7, %r11, %r296;
@%p233 bra $L__BB0_272;
mul.f32 %f2644, %f5401, 0f3F22F983;
cvt.rni.s32.f32 %r8030, %f2644;
cvt.rn.f32.s32 %f2645, %r8030;
mov.f32 %f2646, 0fBFC90FDA;
fma.rn.f32 %f2647, %f2645, %f2646, %f5401;
mov.f32 %f2648, 0fB3A22168;
fma.rn.f32 %f2649, %f2645, %f2648, %f2647;
mov.f32 %f2650, 0fA7C234C5;
fma.rn.f32 %f5341, %f2645, %f2650, %f2649;
abs.f32 %f289, %f5401;
setp.ltu.f32 %p245, %f289, 0f47CE4780;
@%p245 bra $L__BB0_268;
setp.eq.f32 %p246, %f289, 0f7F800000;
@%p246 bra $L__BB0_267;
bra.uni $L__BB0_262;
$L__BB0_267:
mov.f32 %f2653, 0f00000000;
mul.rn.f32 %f5341, %f5401, %f2653;
mov.u32 %r8030, 0;
bra.uni $L__BB0_268;
$L__BB0_262:
mov.b32 %r316, %f5401;
shr.u32 %r3273, %r316, 23;
and.b32 %r3274, %r3273, 255;
add.s32 %r317, %r3274, -128;
shl.b32 %r3275, %r316, 8;
or.b32 %r318, %r3275, -2147483648;
shr.u32 %r319, %r317, 5;
mov.u64 %rd2535, 0;
mov.u32 %r8027, 0;
mov.u64 %rd834, __cudart_i2opi_f;
mov.u64 %rd2536, %rd2535;
$L__BB0_263:
.pragma "nounroll";
shl.b64 %rd833, %rd2535, 2;
add.s64 %rd835, %rd834, %rd833;
ld.global.nc.u32 %r3276, [%rd835];
mad.wide.u32 %rd836, %r3276, %r318, %rd2536;
shr.u64 %rd2536, %rd836, 32;
add.s64 %rd837, %rd1, %rd833;
st.local.u32 [%rd837], %rd836;
add.s32 %r8027, %r8027, 1;
cvt.s64.s32 %rd2535, %r8027;
setp.ne.s32 %p247, %r8027, 6;
@%p247 bra $L__BB0_263;
st.local.u32 [%rd4], %rd2536;
mov.u32 %r3277, 4;
sub.s32 %r322, %r3277, %r319;
mov.u32 %r3278, 6;
sub.s32 %r3279, %r3278, %r319;
mul.wide.s32 %rd838, %r3279, 4;
add.s64 %rd839, %rd1, %rd838;
ld.local.u32 %r8028, [%rd839];
ld.local.u32 %r8029, [%rd839+-4];
and.b32 %r325, %r317, 31;
setp.eq.s32 %p248, %r325, 0;
@%p248 bra $L__BB0_266;
mov.u32 %r3280, 32;
sub.s32 %r3281, %r3280, %r325;
shr.u32 %r3282, %r8029, %r3281;
shl.b32 %r3283, %r8028, %r325;
add.s32 %r8028, %r3282, %r3283;
mul.wide.s32 %rd840, %r322, 4;
add.s64 %rd841, %rd1, %rd840;
ld.local.u32 %r3284, [%rd841];
shr.u32 %r3285, %r3284, %r3281;
shl.b32 %r3286, %r8029, %r325;
add.s32 %r8029, %r3285, %r3286;
$L__BB0_266:
and.b32 %r3287, %r316, -2147483648;
shr.u32 %r3288, %r8029, 30;
shl.b32 %r3289, %r8028, 2;
or.b32 %r3290, %r3288, %r3289;
shr.u32 %r3291, %r3290, 31;
shr.u32 %r3292, %r8028, 30;
add.s32 %r3293, %r3291, %r3292;
neg.s32 %r3294, %r3293;
setp.eq.s32 %p249, %r3287, 0;
selp.b32 %r8030, %r3293, %r3294, %p249;
setp.ne.s32 %p250, %r3291, 0;
xor.b32 %r3295, %r3287, -2147483648;
selp.b32 %r3296, %r3295, %r3287, %p250;
selp.b32 %r3297, -1, 0, %p250;
xor.b32 %r3298, %r3290, %r3297;
shl.b32 %r3299, %r8029, 2;
xor.b32 %r3300, %r3299, %r3297;
cvt.u64.u32 %rd842, %r3298;
cvt.u64.u32 %rd843, %r3300;
bfi.b64 %rd844, %rd842, %rd843, 32, 32;
cvt.rn.f64.s64 %fd31, %rd844;
mul.f64 %fd32, %fd31, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f2651, %fd32;
setp.eq.s32 %p251, %r3296, 0;
neg.f32 %f2652, %f2651;
selp.f32 %f5341, %f2651, %f2652, %p251;
$L__BB0_268:
add.s32 %r332, %r8030, 1;
and.b32 %r333, %r332, 1;
setp.eq.s32 %p252, %r333, 0;
selp.f32 %f293, %f5341, 0f3F800000, %p252;
mul.rn.f32 %f294, %f5341, %f5341;
mov.f32 %f5342, 0fB94D4153;
@%p252 bra $L__BB0_270;
mov.f32 %f2655, 0fBAB607ED;
mov.f32 %f2656, 0f37CBAC00;
fma.rn.f32 %f5342, %f2656, %f294, %f2655;
$L__BB0_270:
selp.f32 %f2657, 0f3C0885E4, 0f3D2AAABB, %p252;
fma.rn.f32 %f2658, %f5342, %f294, %f2657;
selp.f32 %f2659, 0fBE2AAAA8, 0fBEFFFFFF, %p252;
fma.rn.f32 %f2660, %f2658, %f294, %f2659;
mov.f32 %f2661, 0f00000000;
fma.rn.f32 %f2662, %f294, %f293, %f2661;
fma.rn.f32 %f5283, %f2660, %f2662, %f293;
and.b32 %r3302, %r332, 2;
setp.eq.s32 %p254, %r3302, 0;
@%p254 bra $L__BB0_272;
mov.f32 %f2664, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f2664, %f2661;
$L__BB0_272:
selp.f32 %f301, %f5283, %f5284, %p7;
selp.f32 %f302, %f5281, %f5282, %p7;
@%p233 bra $L__BB0_467;
add.f32 %f5393, %f302, %f301;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_467:
setp.lt.s32 %p417, %r12, 2;
and.pred %p419, %p33, %p417;
@%p419 bra $L__BB0_740;
bra.uni $L__BB0_468;
$L__BB0_740:
mov.u32 %r7787, %ctaid.x;
shl.b32 %r4542, %r12, 5;
add.s32 %r4543, %r4542, %r1;
mul.hi.s32 %r4544, %r4543, -1840700269;
add.s32 %r4545, %r4544, %r4543;
shr.u32 %r4546, %r4545, 31;
shr.s32 %r4547, %r4545, 2;
add.s32 %r4548, %r4547, %r4546;
mul.lo.s32 %r4549, %r4548, %r2589;
shl.b32 %r4550, %r2587, 2;
add.s32 %r4551, %r14, %r4550;
add.s32 %r4552, %r4551, %r4549;
mul.lo.s32 %r4553, %r4548, 7;
sub.s32 %r4554, %r4543, %r4553;
mul.lo.s32 %r4555, %r4554, %r2590;
add.s32 %r4556, %r4552, %r4555;
mul.wide.s32 %rd1349, %r4556, 4;
add.s64 %rd1350, %rd3, %rd1349;
ld.global.f32 %f855, [%rd1350];
add.s32 %r4557, %r4543, 32;
mul.hi.s32 %r4558, %r4557, -1840700269;
add.s32 %r4559, %r4558, %r4557;
shr.u32 %r4560, %r4559, 31;
shr.s32 %r4561, %r4559, 2;
add.s32 %r4562, %r4561, %r4560;
mul.lo.s32 %r4563, %r4562, %r2589;
add.s32 %r4564, %r4551, %r4563;
mul.lo.s32 %r4565, %r4562, 7;
sub.s32 %r4566, %r4557, %r4565;
mul.lo.s32 %r4567, %r4566, %r2590;
add.s32 %r4568, %r4564, %r4567;
mul.wide.s32 %rd1351, %r4568, 4;
add.s64 %rd1352, %rd3, %rd1351;
ld.global.f32 %f856, [%rd1352];
add.s32 %r4569, %r2586, %r14;
add.s32 %r4570, %r4569, %r4549;
add.s32 %r4571, %r4570, %r4555;
mul.wide.s32 %rd1353, %r4571, 4;
add.s64 %rd1354, %rd3, %rd1353;
ld.global.f32 %f857, [%rd1354];
add.s32 %r4572, %r4569, %r4563;
add.s32 %r4573, %r4572, %r4567;
mul.wide.s32 %rd1355, %r4573, 4;
add.s64 %rd1356, %rd3, %rd1355;
ld.global.f32 %f858, [%rd1356];
mul.wide.s32 %rd1357, %r2587, 4;
add.s64 %rd1358, %rd1354, %rd1357;
ld.global.f32 %f859, [%rd1358];
add.s64 %rd1359, %rd1356, %rd1357;
ld.global.f32 %f860, [%rd1359];
add.s64 %rd1360, %rd1358, %rd1357;
ld.global.f32 %f861, [%rd1360];
add.s64 %rd1361, %rd1359, %rd1357;
ld.global.f32 %f862, [%rd1361];
mul.hi.s32 %r4575, %r4543, 954437177;
shr.u32 %r4576, %r4575, 31;
shr.s32 %r4577, %r4575, 1;
add.s32 %r4578, %r4577, %r4576;
mul.lo.s32 %r4579, %r4578, %r2579;
mad.lo.s32 %r4580, %r2578, %r7787, %r2576;
add.s32 %r4581, %r4580, %r2577;
add.s32 %r4582, %r4581, %r4579;
mul.lo.s32 %r4583, %r4578, 9;
sub.s32 %r4584, %r4543, %r4583;
mul.lo.s32 %r4585, %r4584, %r2580;
add.s32 %r4586, %r4582, %r4585;
mul.wide.s32 %rd1362, %r4586, 4;
add.s64 %rd1363, %rd2, %rd1362;
ld.global.f32 %f863, [%rd1363];
mul.hi.s32 %r4587, %r4557, 954437177;
shr.u32 %r4588, %r4587, 31;
shr.s32 %r4589, %r4587, 1;
add.s32 %r4590, %r4589, %r4588;
mul.lo.s32 %r4591, %r4590, %r2579;
add.s32 %r4592, %r4581, %r4591;
mul.lo.s32 %r4593, %r4590, 9;
sub.s32 %r4594, %r4557, %r4593;
mul.lo.s32 %r4595, %r4594, %r2580;
add.s32 %r4596, %r4592, %r4595;
mul.wide.s32 %rd1364, %r4596, 4;
add.s64 %rd1365, %rd2, %rd1364;
ld.global.f32 %f864, [%rd1365];
add.s32 %r4597, %r4581, %r2577;
add.s32 %r4598, %r4597, %r4579;
add.s32 %r4599, %r4598, %r4585;
mul.wide.s32 %rd1366, %r4599, 4;
add.s64 %rd1367, %rd2, %rd1366;
ld.global.f32 %f865, [%rd1367];
add.s32 %r4600, %r4597, %r4591;
add.s32 %r4601, %r4600, %r4595;
mul.wide.s32 %rd1368, %r4601, 4;
add.s64 %rd1369, %rd2, %rd1368;
ld.global.f32 %f866, [%rd1369];
add.s32 %r4602, %r4580, %r2576;
add.s32 %r4603, %r4602, %r4579;
add.s32 %r4604, %r4603, %r4585;
mul.wide.s32 %rd1370, %r4604, 4;
add.s64 %rd1371, %rd2, %rd1370;
ld.global.f32 %f867, [%rd1371];
add.s32 %r4605, %r4602, %r4591;
add.s32 %r4606, %r4605, %r4595;
mul.wide.s32 %rd1372, %r4606, 4;
add.s64 %rd1373, %rd2, %rd1372;
ld.global.f32 %f868, [%rd1373];
add.s32 %r4607, %r4602, %r2577;
add.s32 %r4608, %r4607, %r4579;
add.s32 %r4609, %r4608, %r4585;
mul.wide.s32 %rd1374, %r4609, 4;
add.s64 %rd1375, %rd2, %rd1374;
ld.global.f32 %f869, [%rd1375];
add.s32 %r4610, %r4607, %r4591;
add.s32 %r4611, %r4610, %r4595;
mul.wide.s32 %rd1376, %r4611, 4;
add.s64 %rd1377, %rd2, %rd1376;
ld.global.f32 %f870, [%rd1377];
mul.f32 %f3355, %f863, 0f3F22F983;
cvt.rni.s32.f32 %r8162, %f3355;
cvt.rn.f32.s32 %f3356, %r8162;
mov.f32 %f3357, 0fBFC90FDA;
fma.rn.f32 %f3358, %f3356, %f3357, %f863;
mov.f32 %f3359, 0fB3A22168;
fma.rn.f32 %f3360, %f3356, %f3359, %f3358;
mov.f32 %f3361, 0fA7C234C5;
fma.rn.f32 %f5544, %f3356, %f3361, %f3360;
abs.f32 %f872, %f863;
setp.ltu.f32 %p644, %f872, 0f47CE4780;
@%p644 bra $L__BB0_748;
setp.eq.f32 %p645, %f872, 0f7F800000;
@%p645 bra $L__BB0_747;
bra.uni $L__BB0_742;
$L__BB0_747:
mov.f32 %f3364, 0f00000000;
mul.rn.f32 %f5544, %f863, %f3364;
mov.u32 %r8162, 0;
bra.uni $L__BB0_748;
$L__BB0_468:
add.s32 %r630, %r12, 4;
setp.gt.s32 %p420, %r630, 14;
shl.b32 %r3846, %r2587, 2;
add.s32 %r631, %r14, %r3846;
@%p420 bra $L__BB0_473;
shl.b32 %r632, %r12, 5;
neg.s32 %r3847, %r632;
setp.ge.s32 %p421, %r11, %r3847;
@%p421 bra $L__BB0_471;
add.s32 %r3848, %r632, %r1;
mul.hi.s32 %r3849, %r3848, -1840700269;
add.s32 %r3850, %r3849, %r3848;
shr.u32 %r3851, %r3850, 31;
shr.s32 %r3852, %r3850, 2;
add.s32 %r3853, %r3852, %r3851;
mad.lo.s32 %r3854, %r3853, %r2589, %r631;
mul.lo.s32 %r3855, %r3853, 7;
sub.s32 %r3856, %r3848, %r3855;
mad.lo.s32 %r3857, %r3856, %r2590, %r3854;
mul.wide.s32 %rd1093, %r3857, 4;
add.s64 %rd1094, %rd3, %rd1093;
ld.global.f32 %f5607, [%rd1094];
$L__BB0_471:
mov.u32 %r3858, -32;
sub.s32 %r3859, %r3858, %r632;
setp.ge.s32 %p422, %r11, %r3859;
@%p422 bra $L__BB0_473;
add.s32 %r3860, %r632, %r1;
add.s32 %r3861, %r3860, 32;
mul.hi.s32 %r3862, %r3861, -1840700269;
add.s32 %r3863, %r3862, %r3861;
shr.u32 %r3864, %r3863, 31;
shr.s32 %r3865, %r3863, 2;
add.s32 %r3866, %r3865, %r3864;
mad.lo.s32 %r3867, %r3866, %r2589, %r631;
mul.lo.s32 %r3868, %r3866, 7;
sub.s32 %r3869, %r3861, %r3868;
mad.lo.s32 %r3870, %r3869, %r2590, %r3867;
mul.wide.s32 %rd1095, %r3870, 4;
add.s64 %rd1096, %rd3, %rd1095;
ld.global.f32 %f5606, [%rd1096];
$L__BB0_473:
add.s32 %r633, %r12, 5;
setp.gt.s32 %p423, %r633, 14;
add.s32 %r634, %r2586, %r14;
@%p423 bra $L__BB0_478;
shl.b32 %r635, %r12, 5;
neg.s32 %r3871, %r635;
setp.ge.s32 %p424, %r11, %r3871;
@%p424 bra $L__BB0_476;
add.s32 %r3872, %r635, %r1;
mul.hi.s32 %r3873, %r3872, -1840700269;
add.s32 %r3874, %r3873, %r3872;
shr.u32 %r3875, %r3874, 31;
shr.s32 %r3876, %r3874, 2;
add.s32 %r3877, %r3876, %r3875;
mad.lo.s32 %r3878, %r3877, %r2589, %r634;
mul.lo.s32 %r3879, %r3877, 7;
sub.s32 %r3880, %r3872, %r3879;
mad.lo.s32 %r3881, %r3880, %r2590, %r3878;
mul.wide.s32 %rd1097, %r3881, 4;
add.s64 %rd1098, %rd3, %rd1097;
ld.global.f32 %f5406, [%rd1098];
$L__BB0_476:
mov.u32 %r3882, -32;
sub.s32 %r3883, %r3882, %r635;
setp.ge.s32 %p425, %r11, %r3883;
@%p425 bra $L__BB0_478;
add.s32 %r3884, %r635, %r1;
add.s32 %r3885, %r3884, 32;
mul.hi.s32 %r3886, %r3885, -1840700269;
add.s32 %r3887, %r3886, %r3885;
shr.u32 %r3888, %r3887, 31;
shr.s32 %r3889, %r3887, 2;
add.s32 %r3890, %r3889, %r3888;
mad.lo.s32 %r3891, %r3890, %r2589, %r634;
mul.lo.s32 %r3892, %r3890, 7;
sub.s32 %r3893, %r3885, %r3892;
mad.lo.s32 %r3894, %r3893, %r2590, %r3891;
mul.wide.s32 %rd1099, %r3894, 4;
add.s64 %rd1100, %rd3, %rd1099;
ld.global.f32 %f5405, [%rd1100];
$L__BB0_478:
add.s32 %r636, %r12, 6;
setp.gt.s32 %p426, %r636, 14;
add.s32 %r637, %r634, %r2587;
@%p426 bra $L__BB0_483;
shl.b32 %r638, %r12, 5;
neg.s32 %r3895, %r638;
setp.ge.s32 %p427, %r11, %r3895;
@%p427 bra $L__BB0_481;
add.s32 %r3896, %r638, %r1;
mul.hi.s32 %r3897, %r3896, -1840700269;
add.s32 %r3898, %r3897, %r3896;
shr.u32 %r3899, %r3898, 31;
shr.s32 %r3900, %r3898, 2;
add.s32 %r3901, %r3900, %r3899;
mad.lo.s32 %r3902, %r3901, %r2589, %r637;
mul.lo.s32 %r3903, %r3901, 7;
sub.s32 %r3904, %r3896, %r3903;
mad.lo.s32 %r3905, %r3904, %r2590, %r3902;
mul.wide.s32 %rd1101, %r3905, 4;
add.s64 %rd1102, %rd3, %rd1101;
ld.global.f32 %f5404, [%rd1102];
$L__BB0_481:
mov.u32 %r3906, -32;
sub.s32 %r3907, %r3906, %r638;
setp.ge.s32 %p428, %r11, %r3907;
@%p428 bra $L__BB0_483;
add.s32 %r3908, %r638, %r1;
add.s32 %r3909, %r3908, 32;
mul.hi.s32 %r3910, %r3909, -1840700269;
add.s32 %r3911, %r3910, %r3909;
shr.u32 %r3912, %r3911, 31;
shr.s32 %r3913, %r3911, 2;
add.s32 %r3914, %r3913, %r3912;
mad.lo.s32 %r3915, %r3914, %r2589, %r637;
mul.lo.s32 %r3916, %r3914, 7;
sub.s32 %r3917, %r3909, %r3916;
mad.lo.s32 %r3918, %r3917, %r2590, %r3915;
mul.wide.s32 %rd1103, %r3918, 4;
add.s64 %rd1104, %rd3, %rd1103;
ld.global.f32 %f5403, [%rd1104];
$L__BB0_483:
add.s32 %r639, %r12, 7;
setp.gt.s32 %p429, %r639, 14;
add.s32 %r640, %r637, %r2587;
@%p429 bra $L__BB0_488;
shl.b32 %r641, %r12, 5;
neg.s32 %r3919, %r641;
setp.ge.s32 %p430, %r11, %r3919;
@%p430 bra $L__BB0_486;
add.s32 %r3920, %r641, %r1;
mul.hi.s32 %r3921, %r3920, -1840700269;
add.s32 %r3922, %r3921, %r3920;
shr.u32 %r3923, %r3922, 31;
shr.s32 %r3924, %r3922, 2;
add.s32 %r3925, %r3924, %r3923;
mad.lo.s32 %r3926, %r3925, %r2589, %r640;
mul.lo.s32 %r3927, %r3925, 7;
sub.s32 %r3928, %r3920, %r3927;
mad.lo.s32 %r3929, %r3928, %r2590, %r3926;
mul.wide.s32 %rd1105, %r3929, 4;
add.s64 %rd1106, %rd3, %rd1105;
ld.global.f32 %f5402, [%rd1106];
$L__BB0_486:
mov.u32 %r3930, -32;
sub.s32 %r3931, %r3930, %r641;
setp.ge.s32 %p431, %r11, %r3931;
@%p431 bra $L__BB0_488;
add.s32 %r3932, %r641, %r1;
add.s32 %r3933, %r3932, 32;
mul.hi.s32 %r3934, %r3933, -1840700269;
add.s32 %r3935, %r3934, %r3933;
shr.u32 %r3936, %r3935, 31;
shr.s32 %r3937, %r3935, 2;
add.s32 %r3938, %r3937, %r3936;
mad.lo.s32 %r3939, %r3938, %r2589, %r640;
mul.lo.s32 %r3940, %r3938, 7;
sub.s32 %r3941, %r3933, %r3940;
mad.lo.s32 %r3942, %r3941, %r2590, %r3939;
mul.wide.s32 %rd1107, %r3942, 4;
add.s64 %rd1108, %rd3, %rd1107;
ld.global.f32 %f5401, [%rd1108];
$L__BB0_488:
mov.u32 %r7781, %ctaid.x;
mul.lo.s32 %r642, %r2578, %r7781;
add.s32 %r3944, %r2576, %r642;
add.s32 %r643, %r3944, %r2577;
@%p420 bra $L__BB0_493;
shl.b32 %r644, %r12, 5;
neg.s32 %r3945, %r644;
setp.ge.s32 %p433, %r11, %r3945;
@%p433 bra $L__BB0_491;
add.s32 %r3946, %r644, %r1;
mul.hi.s32 %r3947, %r3946, 954437177;
shr.u32 %r3948, %r3947, 31;
shr.s32 %r3949, %r3947, 1;
add.s32 %r3950, %r3949, %r3948;
mad.lo.s32 %r3951, %r3950, %r2579, %r643;
mul.lo.s32 %r3952, %r3950, 9;
sub.s32 %r3953, %r3946, %r3952;
mad.lo.s32 %r3954, %r3953, %r2580, %r3951;
mul.wide.s32 %rd1109, %r3954, 4;
add.s64 %rd1110, %rd2, %rd1109;
ld.global.f32 %f5416, [%rd1110];
$L__BB0_491:
mov.u32 %r3955, -32;
sub.s32 %r3956, %r3955, %r644;
setp.ge.s32 %p434, %r11, %r3956;
@%p434 bra $L__BB0_493;
add.s32 %r3957, %r644, %r1;
add.s32 %r3958, %r3957, 32;
mul.hi.s32 %r3959, %r3958, 954437177;
shr.u32 %r3960, %r3959, 31;
shr.s32 %r3961, %r3959, 1;
add.s32 %r3962, %r3961, %r3960;
mad.lo.s32 %r3963, %r3962, %r2579, %r643;
mul.lo.s32 %r3964, %r3962, 9;
sub.s32 %r3965, %r3958, %r3964;
mad.lo.s32 %r3966, %r3965, %r2580, %r3963;
mul.wide.s32 %rd1111, %r3966, 4;
add.s64 %rd1112, %rd2, %rd1111;
ld.global.f32 %f5415, [%rd1112];
$L__BB0_493:
add.s32 %r645, %r643, %r2577;
@%p423 bra $L__BB0_498;
shl.b32 %r646, %r12, 5;
neg.s32 %r3967, %r646;
setp.ge.s32 %p436, %r11, %r3967;
@%p436 bra $L__BB0_496;
add.s32 %r3968, %r646, %r1;
mul.hi.s32 %r3969, %r3968, 954437177;
shr.u32 %r3970, %r3969, 31;
shr.s32 %r3971, %r3969, 1;
add.s32 %r3972, %r3971, %r3970;
mad.lo.s32 %r3973, %r3972, %r2579, %r645;
mul.lo.s32 %r3974, %r3972, 9;
sub.s32 %r3975, %r3968, %r3974;
mad.lo.s32 %r3976, %r3975, %r2580, %r3973;
mul.wide.s32 %rd1113, %r3976, 4;
add.s64 %rd1114, %rd2, %rd1113;
ld.global.f32 %f5414, [%rd1114];
$L__BB0_496:
mov.u32 %r3977, -32;
sub.s32 %r3978, %r3977, %r646;
setp.ge.s32 %p437, %r11, %r3978;
@%p437 bra $L__BB0_498;
add.s32 %r3979, %r646, %r1;
add.s32 %r3980, %r3979, 32;
mul.hi.s32 %r3981, %r3980, 954437177;
shr.u32 %r3982, %r3981, 31;
shr.s32 %r3983, %r3981, 1;
add.s32 %r3984, %r3983, %r3982;
mad.lo.s32 %r3985, %r3984, %r2579, %r645;
mul.lo.s32 %r3986, %r3984, 9;
sub.s32 %r3987, %r3980, %r3986;
mad.lo.s32 %r3988, %r3987, %r2580, %r3985;
mul.wide.s32 %rd1115, %r3988, 4;
add.s64 %rd1116, %rd2, %rd1115;
ld.global.f32 %f5413, [%rd1116];
$L__BB0_498:
shl.b32 %r3989, %r2576, 1;
add.s32 %r647, %r3989, %r642;
@%p426 bra $L__BB0_503;
shl.b32 %r648, %r12, 5;
neg.s32 %r3990, %r648;
setp.ge.s32 %p439, %r11, %r3990;
@%p439 bra $L__BB0_501;
add.s32 %r3991, %r648, %r1;
mul.hi.s32 %r3992, %r3991, 954437177;
shr.u32 %r3993, %r3992, 31;
shr.s32 %r3994, %r3992, 1;
add.s32 %r3995, %r3994, %r3993;
mad.lo.s32 %r3996, %r3995, %r2579, %r647;
mul.lo.s32 %r3997, %r3995, 9;
sub.s32 %r3998, %r3991, %r3997;
mad.lo.s32 %r3999, %r3998, %r2580, %r3996;
mul.wide.s32 %rd1117, %r3999, 4;
add.s64 %rd1118, %rd2, %rd1117;
ld.global.f32 %f5412, [%rd1118];
$L__BB0_501:
mov.u32 %r4000, -32;
sub.s32 %r4001, %r4000, %r648;
setp.ge.s32 %p440, %r11, %r4001;
@%p440 bra $L__BB0_503;
add.s32 %r4002, %r648, %r1;
add.s32 %r4003, %r4002, 32;
mul.hi.s32 %r4004, %r4003, 954437177;
shr.u32 %r4005, %r4004, 31;
shr.s32 %r4006, %r4004, 1;
add.s32 %r4007, %r4006, %r4005;
mad.lo.s32 %r4008, %r4007, %r2579, %r647;
mul.lo.s32 %r4009, %r4007, 9;
sub.s32 %r4010, %r4003, %r4009;
mad.lo.s32 %r4011, %r4010, %r2580, %r4008;
mul.wide.s32 %rd1119, %r4011, 4;
add.s64 %rd1120, %rd2, %rd1119;
ld.global.f32 %f5411, [%rd1120];
$L__BB0_503:
add.s32 %r649, %r647, %r2577;
@%p429 bra $L__BB0_508;
shl.b32 %r650, %r12, 5;
neg.s32 %r4012, %r650;
setp.ge.s32 %p442, %r11, %r4012;
@%p442 bra $L__BB0_506;
add.s32 %r4013, %r650, %r1;
mul.hi.s32 %r4014, %r4013, 954437177;
shr.u32 %r4015, %r4014, 31;
shr.s32 %r4016, %r4014, 1;
add.s32 %r4017, %r4016, %r4015;
mad.lo.s32 %r4018, %r4017, %r2579, %r649;
mul.lo.s32 %r4019, %r4017, 9;
sub.s32 %r4020, %r4013, %r4019;
mad.lo.s32 %r4021, %r4020, %r2580, %r4018;
mul.wide.s32 %rd1121, %r4021, 4;
add.s64 %rd1122, %rd2, %rd1121;
ld.global.f32 %f5410, [%rd1122];
$L__BB0_506:
mov.u32 %r4022, -32;
sub.s32 %r4023, %r4022, %r650;
setp.ge.s32 %p443, %r11, %r4023;
@%p443 bra $L__BB0_508;
add.s32 %r4024, %r650, %r1;
add.s32 %r4025, %r4024, 32;
mul.hi.s32 %r4026, %r4025, 954437177;
shr.u32 %r4027, %r4026, 31;
shr.s32 %r4028, %r4026, 1;
add.s32 %r4029, %r4028, %r4027;
mad.lo.s32 %r4030, %r4029, %r2579, %r649;
mul.lo.s32 %r4031, %r4029, 9;
sub.s32 %r4032, %r4025, %r4031;
mad.lo.s32 %r4033, %r4032, %r2580, %r4030;
mul.wide.s32 %rd1123, %r4033, 4;
add.s64 %rd1124, %rd2, %rd1123;
ld.global.f32 %f5409, [%rd1124];
$L__BB0_508:
@%p420 bra $L__BB0_537;
shl.b32 %r4034, %r12, 5;
neg.s32 %r651, %r4034;
setp.ge.s32 %p445, %r11, %r651;
@%p445 bra $L__BB0_522;
mul.f32 %f3004, %f5416, 0f3F22F983;
cvt.rni.s32.f32 %r8098, %f3004;
cvt.rn.f32.s32 %f3005, %r8098;
mov.f32 %f3006, 0fBFC90FDA;
fma.rn.f32 %f3007, %f3005, %f3006, %f5416;
mov.f32 %f3008, 0fB3A22168;
fma.rn.f32 %f3009, %f3005, %f3008, %f3007;
mov.f32 %f3010, 0fA7C234C5;
fma.rn.f32 %f5445, %f3005, %f3010, %f3009;
abs.f32 %f589, %f5416;
setp.ltu.f32 %p446, %f589, 0f47CE4780;
@%p446 bra $L__BB0_518;
setp.eq.f32 %p447, %f589, 0f7F800000;
@%p447 bra $L__BB0_517;
bra.uni $L__BB0_512;
$L__BB0_517:
mov.f32 %f3013, 0f00000000;
mul.rn.f32 %f5445, %f5416, %f3013;
mov.u32 %r8098, 0;
bra.uni $L__BB0_518;
$L__BB0_742:
mov.b32 %r956, %f863;
shr.u32 %r4613, %r956, 23;
and.b32 %r4614, %r4613, 255;
add.s32 %r957, %r4614, -128;
shl.b32 %r4615, %r956, 8;
or.b32 %r958, %r4615, -2147483648;
shr.u32 %r959, %r957, 5;
mov.u64 %rd2601, 0;
mov.u32 %r8159, 0;
mov.u64 %rd1381, __cudart_i2opi_f;
mov.u64 %rd2602, %rd2601;
$L__BB0_743:
.pragma "nounroll";
shl.b64 %rd1380, %rd2601, 2;
add.s64 %rd1382, %rd1381, %rd1380;
ld.global.nc.u32 %r4616, [%rd1382];
mad.wide.u32 %rd1383, %r4616, %r958, %rd2602;
shr.u64 %rd2602, %rd1383, 32;
add.s64 %rd1384, %rd1, %rd1380;
st.local.u32 [%rd1384], %rd1383;
add.s32 %r8159, %r8159, 1;
cvt.s64.s32 %rd2601, %r8159;
setp.ne.s32 %p646, %r8159, 6;
@%p646 bra $L__BB0_743;
st.local.u32 [%rd4], %rd2602;
mov.u32 %r4617, 4;
sub.s32 %r962, %r4617, %r959;
mov.u32 %r4618, 6;
sub.s32 %r4619, %r4618, %r959;
mul.wide.s32 %rd1385, %r4619, 4;
add.s64 %rd1386, %rd1, %rd1385;
ld.local.u32 %r8160, [%rd1386];
ld.local.u32 %r8161, [%rd1386+-4];
and.b32 %r965, %r957, 31;
setp.eq.s32 %p647, %r965, 0;
@%p647 bra $L__BB0_746;
mov.u32 %r4620, 32;
sub.s32 %r4621, %r4620, %r965;
shr.u32 %r4622, %r8161, %r4621;
shl.b32 %r4623, %r8160, %r965;
add.s32 %r8160, %r4622, %r4623;
mul.wide.s32 %rd1387, %r962, 4;
add.s64 %rd1388, %rd1, %rd1387;
ld.local.u32 %r4624, [%rd1388];
shr.u32 %r4625, %r4624, %r4621;
shl.b32 %r4626, %r8161, %r965;
add.s32 %r8161, %r4625, %r4626;
$L__BB0_746:
and.b32 %r4627, %r956, -2147483648;
shr.u32 %r4628, %r8161, 30;
shl.b32 %r4629, %r8160, 2;
or.b32 %r4630, %r4628, %r4629;
shr.u32 %r4631, %r4630, 31;
shr.u32 %r4632, %r8160, 30;
add.s32 %r4633, %r4631, %r4632;
neg.s32 %r4634, %r4633;
setp.eq.s32 %p648, %r4627, 0;
selp.b32 %r8162, %r4633, %r4634, %p648;
setp.ne.s32 %p649, %r4631, 0;
xor.b32 %r4635, %r4627, -2147483648;
selp.b32 %r4636, %r4635, %r4627, %p649;
selp.b32 %r4637, -1, 0, %p649;
xor.b32 %r4638, %r4630, %r4637;
shl.b32 %r4639, %r8161, 2;
xor.b32 %r4640, %r4639, %r4637;
cvt.u64.u32 %rd1389, %r4638;
cvt.u64.u32 %rd1390, %r4640;
bfi.b64 %rd1391, %rd1389, %rd1390, 32, 32;
cvt.rn.f64.s64 %fd97, %rd1391;
mul.f64 %fd98, %fd97, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3362, %fd98;
setp.eq.s32 %p650, %r4636, 0;
neg.f32 %f3363, %f3362;
selp.f32 %f5544, %f3362, %f3363, %p650;
$L__BB0_748:
and.b32 %r972, %r8162, 1;
setp.eq.s32 %p651, %r972, 0;
selp.f32 %f876, %f5544, 0f3F800000, %p651;
mul.rn.f32 %f877, %f5544, %f5544;
mov.f32 %f5545, 0fB94D4153;
@%p651 bra $L__BB0_750;
mov.f32 %f3366, 0fBAB607ED;
mov.f32 %f3367, 0f37CBAC00;
fma.rn.f32 %f5545, %f3367, %f877, %f3366;
$L__BB0_750:
selp.f32 %f3368, 0f3C0885E4, 0f3D2AAABB, %p651;
fma.rn.f32 %f3369, %f5545, %f877, %f3368;
selp.f32 %f3370, 0fBE2AAAA8, 0fBEFFFFFF, %p651;
fma.rn.f32 %f3371, %f3369, %f877, %f3370;
mov.f32 %f3372, 0f00000000;
fma.rn.f32 %f3373, %f877, %f876, %f3372;
fma.rn.f32 %f5546, %f3371, %f3373, %f876;
and.b32 %r4642, %r8162, 2;
setp.eq.s32 %p653, %r4642, 0;
@%p653 bra $L__BB0_752;
mov.f32 %f3375, 0fBF800000;
fma.rn.f32 %f5546, %f5546, %f3375, %f3372;
$L__BB0_752:
mul.f32 %f3376, %f855, 0f3F22F983;
cvt.rni.s32.f32 %r8166, %f3376;
cvt.rn.f32.s32 %f3377, %r8166;
mov.f32 %f3378, 0fBFC90FDA;
fma.rn.f32 %f3379, %f3377, %f3378, %f855;
mov.f32 %f3380, 0fB3A22168;
fma.rn.f32 %f3381, %f3377, %f3380, %f3379;
mov.f32 %f3382, 0fA7C234C5;
fma.rn.f32 %f5547, %f3377, %f3382, %f3381;
abs.f32 %f884, %f855;
setp.ltu.f32 %p654, %f884, 0f47CE4780;
@%p654 bra $L__BB0_760;
setp.eq.f32 %p655, %f884, 0f7F800000;
@%p655 bra $L__BB0_759;
bra.uni $L__BB0_754;
$L__BB0_759:
mov.f32 %f3385, 0f00000000;
mul.rn.f32 %f5547, %f855, %f3385;
mov.u32 %r8166, 0;
bra.uni $L__BB0_760;
$L__BB0_754:
mov.b32 %r974, %f855;
shr.u32 %r4644, %r974, 23;
and.b32 %r4645, %r4644, 255;
add.s32 %r975, %r4645, -128;
shl.b32 %r4646, %r974, 8;
or.b32 %r976, %r4646, -2147483648;
shr.u32 %r977, %r975, 5;
mov.u64 %rd2603, 0;
mov.u32 %r8163, 0;
mov.u64 %rd1395, __cudart_i2opi_f;
mov.u64 %rd2604, %rd2603;
$L__BB0_755:
.pragma "nounroll";
shl.b64 %rd1394, %rd2603, 2;
add.s64 %rd1396, %rd1395, %rd1394;
ld.global.nc.u32 %r4647, [%rd1396];
mad.wide.u32 %rd1397, %r4647, %r976, %rd2604;
shr.u64 %rd2604, %rd1397, 32;
add.s64 %rd1398, %rd1, %rd1394;
st.local.u32 [%rd1398], %rd1397;
add.s32 %r8163, %r8163, 1;
cvt.s64.s32 %rd2603, %r8163;
setp.ne.s32 %p656, %r8163, 6;
@%p656 bra $L__BB0_755;
st.local.u32 [%rd4], %rd2604;
mov.u32 %r4648, 4;
sub.s32 %r980, %r4648, %r977;
mov.u32 %r4649, 6;
sub.s32 %r4650, %r4649, %r977;
mul.wide.s32 %rd1399, %r4650, 4;
add.s64 %rd1400, %rd1, %rd1399;
ld.local.u32 %r8164, [%rd1400];
ld.local.u32 %r8165, [%rd1400+-4];
and.b32 %r983, %r975, 31;
setp.eq.s32 %p657, %r983, 0;
@%p657 bra $L__BB0_758;
mov.u32 %r4651, 32;
sub.s32 %r4652, %r4651, %r983;
shr.u32 %r4653, %r8165, %r4652;
shl.b32 %r4654, %r8164, %r983;
add.s32 %r8164, %r4653, %r4654;
mul.wide.s32 %rd1401, %r980, 4;
add.s64 %rd1402, %rd1, %rd1401;
ld.local.u32 %r4655, [%rd1402];
shr.u32 %r4656, %r4655, %r4652;
shl.b32 %r4657, %r8165, %r983;
add.s32 %r8165, %r4656, %r4657;
$L__BB0_758:
and.b32 %r4658, %r974, -2147483648;
shr.u32 %r4659, %r8165, 30;
shl.b32 %r4660, %r8164, 2;
or.b32 %r4661, %r4659, %r4660;
shr.u32 %r4662, %r4661, 31;
shr.u32 %r4663, %r8164, 30;
add.s32 %r4664, %r4662, %r4663;
neg.s32 %r4665, %r4664;
setp.eq.s32 %p658, %r4658, 0;
selp.b32 %r8166, %r4664, %r4665, %p658;
setp.ne.s32 %p659, %r4662, 0;
xor.b32 %r4666, %r4658, -2147483648;
selp.b32 %r4667, %r4666, %r4658, %p659;
selp.b32 %r4668, -1, 0, %p659;
xor.b32 %r4669, %r4661, %r4668;
shl.b32 %r4670, %r8165, 2;
xor.b32 %r4671, %r4670, %r4668;
cvt.u64.u32 %rd1403, %r4669;
cvt.u64.u32 %rd1404, %r4671;
bfi.b64 %rd1405, %rd1403, %rd1404, 32, 32;
cvt.rn.f64.s64 %fd99, %rd1405;
mul.f64 %fd100, %fd99, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3383, %fd100;
setp.eq.s32 %p660, %r4667, 0;
neg.f32 %f3384, %f3383;
selp.f32 %f5547, %f3383, %f3384, %p660;
$L__BB0_760:
add.s32 %r990, %r8166, 1;
and.b32 %r991, %r990, 1;
setp.eq.s32 %p661, %r991, 0;
selp.f32 %f888, %f5547, 0f3F800000, %p661;
mul.rn.f32 %f889, %f5547, %f5547;
mov.f32 %f5548, 0fB94D4153;
@%p661 bra $L__BB0_762;
mov.f32 %f3387, 0fBAB607ED;
mov.f32 %f3388, 0f37CBAC00;
fma.rn.f32 %f5548, %f3388, %f889, %f3387;
$L__BB0_762:
selp.f32 %f3389, 0f3C0885E4, 0f3D2AAABB, %p661;
fma.rn.f32 %f3390, %f5548, %f889, %f3389;
selp.f32 %f3391, 0fBE2AAAA8, 0fBEFFFFFF, %p661;
fma.rn.f32 %f3392, %f3390, %f889, %f3391;
mov.f32 %f3393, 0f00000000;
fma.rn.f32 %f3394, %f889, %f888, %f3393;
fma.rn.f32 %f5549, %f3392, %f3394, %f888;
and.b32 %r4673, %r990, 2;
setp.eq.s32 %p663, %r4673, 0;
@%p663 bra $L__BB0_764;
mov.f32 %f3396, 0fBF800000;
fma.rn.f32 %f5549, %f5549, %f3396, %f3393;
$L__BB0_764:
add.f32 %f5599, %f5546, %f5549;
mul.f32 %f3397, %f864, 0f3F22F983;
cvt.rni.s32.f32 %r8170, %f3397;
cvt.rn.f32.s32 %f3398, %r8170;
mov.f32 %f3399, 0fBFC90FDA;
fma.rn.f32 %f3400, %f3398, %f3399, %f864;
mov.f32 %f3401, 0fB3A22168;
fma.rn.f32 %f3402, %f3398, %f3401, %f3400;
mov.f32 %f3403, 0fA7C234C5;
fma.rn.f32 %f5550, %f3398, %f3403, %f3402;
abs.f32 %f897, %f864;
setp.ltu.f32 %p664, %f897, 0f47CE4780;
@%p664 bra $L__BB0_772;
setp.eq.f32 %p665, %f897, 0f7F800000;
@%p665 bra $L__BB0_771;
bra.uni $L__BB0_766;
$L__BB0_771:
mov.f32 %f3406, 0f00000000;
mul.rn.f32 %f5550, %f864, %f3406;
mov.u32 %r8170, 0;
bra.uni $L__BB0_772;
$L__BB0_766:
mov.b32 %r993, %f864;
shr.u32 %r4675, %r993, 23;
and.b32 %r4676, %r4675, 255;
add.s32 %r994, %r4676, -128;
shl.b32 %r4677, %r993, 8;
or.b32 %r995, %r4677, -2147483648;
shr.u32 %r996, %r994, 5;
mov.u64 %rd2605, 0;
mov.u32 %r8167, 0;
mov.u64 %rd1409, __cudart_i2opi_f;
mov.u64 %rd2606, %rd2605;
$L__BB0_767:
.pragma "nounroll";
shl.b64 %rd1408, %rd2605, 2;
add.s64 %rd1410, %rd1409, %rd1408;
ld.global.nc.u32 %r4678, [%rd1410];
mad.wide.u32 %rd1411, %r4678, %r995, %rd2606;
shr.u64 %rd2606, %rd1411, 32;
add.s64 %rd1412, %rd1, %rd1408;
st.local.u32 [%rd1412], %rd1411;
add.s32 %r8167, %r8167, 1;
cvt.s64.s32 %rd2605, %r8167;
setp.ne.s32 %p666, %r8167, 6;
@%p666 bra $L__BB0_767;
st.local.u32 [%rd4], %rd2606;
mov.u32 %r4679, 4;
sub.s32 %r999, %r4679, %r996;
mov.u32 %r4680, 6;
sub.s32 %r4681, %r4680, %r996;
mul.wide.s32 %rd1413, %r4681, 4;
add.s64 %rd1414, %rd1, %rd1413;
ld.local.u32 %r8168, [%rd1414];
ld.local.u32 %r8169, [%rd1414+-4];
and.b32 %r1002, %r994, 31;
setp.eq.s32 %p667, %r1002, 0;
@%p667 bra $L__BB0_770;
mov.u32 %r4682, 32;
sub.s32 %r4683, %r4682, %r1002;
shr.u32 %r4684, %r8169, %r4683;
shl.b32 %r4685, %r8168, %r1002;
add.s32 %r8168, %r4684, %r4685;
mul.wide.s32 %rd1415, %r999, 4;
add.s64 %rd1416, %rd1, %rd1415;
ld.local.u32 %r4686, [%rd1416];
shr.u32 %r4687, %r4686, %r4683;
shl.b32 %r4688, %r8169, %r1002;
add.s32 %r8169, %r4687, %r4688;
$L__BB0_770:
and.b32 %r4689, %r993, -2147483648;
shr.u32 %r4690, %r8169, 30;
shl.b32 %r4691, %r8168, 2;
or.b32 %r4692, %r4690, %r4691;
shr.u32 %r4693, %r4692, 31;
shr.u32 %r4694, %r8168, 30;
add.s32 %r4695, %r4693, %r4694;
neg.s32 %r4696, %r4695;
setp.eq.s32 %p668, %r4689, 0;
selp.b32 %r8170, %r4695, %r4696, %p668;
setp.ne.s32 %p669, %r4693, 0;
xor.b32 %r4697, %r4689, -2147483648;
selp.b32 %r4698, %r4697, %r4689, %p669;
selp.b32 %r4699, -1, 0, %p669;
xor.b32 %r4700, %r4692, %r4699;
shl.b32 %r4701, %r8169, 2;
xor.b32 %r4702, %r4701, %r4699;
cvt.u64.u32 %rd1417, %r4700;
cvt.u64.u32 %rd1418, %r4702;
bfi.b64 %rd1419, %rd1417, %rd1418, 32, 32;
cvt.rn.f64.s64 %fd101, %rd1419;
mul.f64 %fd102, %fd101, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3404, %fd102;
setp.eq.s32 %p670, %r4698, 0;
neg.f32 %f3405, %f3404;
selp.f32 %f5550, %f3404, %f3405, %p670;
$L__BB0_772:
and.b32 %r1009, %r8170, 1;
setp.eq.s32 %p671, %r1009, 0;
selp.f32 %f901, %f5550, 0f3F800000, %p671;
mul.rn.f32 %f902, %f5550, %f5550;
mov.f32 %f5551, 0fB94D4153;
@%p671 bra $L__BB0_774;
mov.f32 %f3408, 0fBAB607ED;
mov.f32 %f3409, 0f37CBAC00;
fma.rn.f32 %f5551, %f3409, %f902, %f3408;
$L__BB0_774:
selp.f32 %f3410, 0f3C0885E4, 0f3D2AAABB, %p671;
fma.rn.f32 %f3411, %f5551, %f902, %f3410;
selp.f32 %f3412, 0fBE2AAAA8, 0fBEFFFFFF, %p671;
fma.rn.f32 %f3413, %f3411, %f902, %f3412;
mov.f32 %f3414, 0f00000000;
fma.rn.f32 %f3415, %f902, %f901, %f3414;
fma.rn.f32 %f5552, %f3413, %f3415, %f901;
and.b32 %r4704, %r8170, 2;
setp.eq.s32 %p673, %r4704, 0;
@%p673 bra $L__BB0_776;
mov.f32 %f3417, 0fBF800000;
fma.rn.f32 %f5552, %f5552, %f3417, %f3414;
$L__BB0_776:
mul.f32 %f3418, %f856, 0f3F22F983;
cvt.rni.s32.f32 %r8174, %f3418;
cvt.rn.f32.s32 %f3419, %r8174;
mov.f32 %f3420, 0fBFC90FDA;
fma.rn.f32 %f3421, %f3419, %f3420, %f856;
mov.f32 %f3422, 0fB3A22168;
fma.rn.f32 %f3423, %f3419, %f3422, %f3421;
mov.f32 %f3424, 0fA7C234C5;
fma.rn.f32 %f5553, %f3419, %f3424, %f3423;
abs.f32 %f909, %f856;
setp.ltu.f32 %p674, %f909, 0f47CE4780;
@%p674 bra $L__BB0_784;
setp.eq.f32 %p675, %f909, 0f7F800000;
@%p675 bra $L__BB0_783;
bra.uni $L__BB0_778;
$L__BB0_783:
mov.f32 %f3427, 0f00000000;
mul.rn.f32 %f5553, %f856, %f3427;
mov.u32 %r8174, 0;
bra.uni $L__BB0_784;
$L__BB0_778:
mov.b32 %r1011, %f856;
shr.u32 %r4706, %r1011, 23;
and.b32 %r4707, %r4706, 255;
add.s32 %r1012, %r4707, -128;
shl.b32 %r4708, %r1011, 8;
or.b32 %r1013, %r4708, -2147483648;
shr.u32 %r1014, %r1012, 5;
mov.u64 %rd2607, 0;
mov.u32 %r8171, 0;
mov.u64 %rd1423, __cudart_i2opi_f;
mov.u64 %rd2608, %rd2607;
$L__BB0_779:
.pragma "nounroll";
shl.b64 %rd1422, %rd2607, 2;
add.s64 %rd1424, %rd1423, %rd1422;
ld.global.nc.u32 %r4709, [%rd1424];
mad.wide.u32 %rd1425, %r4709, %r1013, %rd2608;
shr.u64 %rd2608, %rd1425, 32;
add.s64 %rd1426, %rd1, %rd1422;
st.local.u32 [%rd1426], %rd1425;
add.s32 %r8171, %r8171, 1;
cvt.s64.s32 %rd2607, %r8171;
setp.ne.s32 %p676, %r8171, 6;
@%p676 bra $L__BB0_779;
st.local.u32 [%rd4], %rd2608;
mov.u32 %r4710, 4;
sub.s32 %r1017, %r4710, %r1014;
mov.u32 %r4711, 6;
sub.s32 %r4712, %r4711, %r1014;
mul.wide.s32 %rd1427, %r4712, 4;
add.s64 %rd1428, %rd1, %rd1427;
ld.local.u32 %r8172, [%rd1428];
ld.local.u32 %r8173, [%rd1428+-4];
and.b32 %r1020, %r1012, 31;
setp.eq.s32 %p677, %r1020, 0;
@%p677 bra $L__BB0_782;
mov.u32 %r4713, 32;
sub.s32 %r4714, %r4713, %r1020;
shr.u32 %r4715, %r8173, %r4714;
shl.b32 %r4716, %r8172, %r1020;
add.s32 %r8172, %r4715, %r4716;
mul.wide.s32 %rd1429, %r1017, 4;
add.s64 %rd1430, %rd1, %rd1429;
ld.local.u32 %r4717, [%rd1430];
shr.u32 %r4718, %r4717, %r4714;
shl.b32 %r4719, %r8173, %r1020;
add.s32 %r8173, %r4718, %r4719;
$L__BB0_782:
and.b32 %r4720, %r1011, -2147483648;
shr.u32 %r4721, %r8173, 30;
shl.b32 %r4722, %r8172, 2;
or.b32 %r4723, %r4721, %r4722;
shr.u32 %r4724, %r4723, 31;
shr.u32 %r4725, %r8172, 30;
add.s32 %r4726, %r4724, %r4725;
neg.s32 %r4727, %r4726;
setp.eq.s32 %p678, %r4720, 0;
selp.b32 %r8174, %r4726, %r4727, %p678;
setp.ne.s32 %p679, %r4724, 0;
xor.b32 %r4728, %r4720, -2147483648;
selp.b32 %r4729, %r4728, %r4720, %p679;
selp.b32 %r4730, -1, 0, %p679;
xor.b32 %r4731, %r4723, %r4730;
shl.b32 %r4732, %r8173, 2;
xor.b32 %r4733, %r4732, %r4730;
cvt.u64.u32 %rd1431, %r4731;
cvt.u64.u32 %rd1432, %r4733;
bfi.b64 %rd1433, %rd1431, %rd1432, 32, 32;
cvt.rn.f64.s64 %fd103, %rd1433;
mul.f64 %fd104, %fd103, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3425, %fd104;
setp.eq.s32 %p680, %r4729, 0;
neg.f32 %f3426, %f3425;
selp.f32 %f5553, %f3425, %f3426, %p680;
$L__BB0_784:
add.s32 %r1027, %r8174, 1;
and.b32 %r1028, %r1027, 1;
setp.eq.s32 %p681, %r1028, 0;
selp.f32 %f913, %f5553, 0f3F800000, %p681;
mul.rn.f32 %f914, %f5553, %f5553;
mov.f32 %f5554, 0fB94D4153;
@%p681 bra $L__BB0_786;
mov.f32 %f3429, 0fBAB607ED;
mov.f32 %f3430, 0f37CBAC00;
fma.rn.f32 %f5554, %f3430, %f914, %f3429;
$L__BB0_786:
selp.f32 %f3431, 0f3C0885E4, 0f3D2AAABB, %p681;
fma.rn.f32 %f3432, %f5554, %f914, %f3431;
selp.f32 %f3433, 0fBE2AAAA8, 0fBEFFFFFF, %p681;
fma.rn.f32 %f3434, %f3432, %f914, %f3433;
mov.f32 %f3435, 0f00000000;
fma.rn.f32 %f3436, %f914, %f913, %f3435;
fma.rn.f32 %f5555, %f3434, %f3436, %f913;
and.b32 %r4735, %r1027, 2;
setp.eq.s32 %p683, %r4735, 0;
@%p683 bra $L__BB0_788;
mov.f32 %f3438, 0fBF800000;
fma.rn.f32 %f5555, %f5555, %f3438, %f3435;
$L__BB0_788:
add.f32 %f5598, %f5552, %f5555;
mul.f32 %f3439, %f865, 0f3F22F983;
cvt.rni.s32.f32 %r8178, %f3439;
cvt.rn.f32.s32 %f3440, %r8178;
mov.f32 %f3441, 0fBFC90FDA;
fma.rn.f32 %f3442, %f3440, %f3441, %f865;
mov.f32 %f3443, 0fB3A22168;
fma.rn.f32 %f3444, %f3440, %f3443, %f3442;
mov.f32 %f3445, 0fA7C234C5;
fma.rn.f32 %f5556, %f3440, %f3445, %f3444;
abs.f32 %f922, %f865;
setp.ltu.f32 %p684, %f922, 0f47CE4780;
@%p684 bra $L__BB0_796;
setp.eq.f32 %p685, %f922, 0f7F800000;
@%p685 bra $L__BB0_795;
bra.uni $L__BB0_790;
$L__BB0_795:
mov.f32 %f3448, 0f00000000;
mul.rn.f32 %f5556, %f865, %f3448;
mov.u32 %r8178, 0;
bra.uni $L__BB0_796;
$L__BB0_790:
mov.b32 %r1030, %f865;
shr.u32 %r4737, %r1030, 23;
and.b32 %r4738, %r4737, 255;
add.s32 %r1031, %r4738, -128;
shl.b32 %r4739, %r1030, 8;
or.b32 %r1032, %r4739, -2147483648;
shr.u32 %r1033, %r1031, 5;
mov.u64 %rd2609, 0;
mov.u32 %r8175, 0;
mov.u64 %rd1437, __cudart_i2opi_f;
mov.u64 %rd2610, %rd2609;
$L__BB0_791:
.pragma "nounroll";
shl.b64 %rd1436, %rd2609, 2;
add.s64 %rd1438, %rd1437, %rd1436;
ld.global.nc.u32 %r4740, [%rd1438];
mad.wide.u32 %rd1439, %r4740, %r1032, %rd2610;
shr.u64 %rd2610, %rd1439, 32;
add.s64 %rd1440, %rd1, %rd1436;
st.local.u32 [%rd1440], %rd1439;
add.s32 %r8175, %r8175, 1;
cvt.s64.s32 %rd2609, %r8175;
setp.ne.s32 %p686, %r8175, 6;
@%p686 bra $L__BB0_791;
st.local.u32 [%rd4], %rd2610;
mov.u32 %r4741, 4;
sub.s32 %r1036, %r4741, %r1033;
mov.u32 %r4742, 6;
sub.s32 %r4743, %r4742, %r1033;
mul.wide.s32 %rd1441, %r4743, 4;
add.s64 %rd1442, %rd1, %rd1441;
ld.local.u32 %r8176, [%rd1442];
ld.local.u32 %r8177, [%rd1442+-4];
and.b32 %r1039, %r1031, 31;
setp.eq.s32 %p687, %r1039, 0;
@%p687 bra $L__BB0_794;
mov.u32 %r4744, 32;
sub.s32 %r4745, %r4744, %r1039;
shr.u32 %r4746, %r8177, %r4745;
shl.b32 %r4747, %r8176, %r1039;
add.s32 %r8176, %r4746, %r4747;
mul.wide.s32 %rd1443, %r1036, 4;
add.s64 %rd1444, %rd1, %rd1443;
ld.local.u32 %r4748, [%rd1444];
shr.u32 %r4749, %r4748, %r4745;
shl.b32 %r4750, %r8177, %r1039;
add.s32 %r8177, %r4749, %r4750;
$L__BB0_794:
and.b32 %r4751, %r1030, -2147483648;
shr.u32 %r4752, %r8177, 30;
shl.b32 %r4753, %r8176, 2;
or.b32 %r4754, %r4752, %r4753;
shr.u32 %r4755, %r4754, 31;
shr.u32 %r4756, %r8176, 30;
add.s32 %r4757, %r4755, %r4756;
neg.s32 %r4758, %r4757;
setp.eq.s32 %p688, %r4751, 0;
selp.b32 %r8178, %r4757, %r4758, %p688;
setp.ne.s32 %p689, %r4755, 0;
xor.b32 %r4759, %r4751, -2147483648;
selp.b32 %r4760, %r4759, %r4751, %p689;
selp.b32 %r4761, -1, 0, %p689;
xor.b32 %r4762, %r4754, %r4761;
shl.b32 %r4763, %r8177, 2;
xor.b32 %r4764, %r4763, %r4761;
cvt.u64.u32 %rd1445, %r4762;
cvt.u64.u32 %rd1446, %r4764;
bfi.b64 %rd1447, %rd1445, %rd1446, 32, 32;
cvt.rn.f64.s64 %fd105, %rd1447;
mul.f64 %fd106, %fd105, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3446, %fd106;
setp.eq.s32 %p690, %r4760, 0;
neg.f32 %f3447, %f3446;
selp.f32 %f5556, %f3446, %f3447, %p690;
$L__BB0_796:
and.b32 %r1046, %r8178, 1;
setp.eq.s32 %p691, %r1046, 0;
selp.f32 %f926, %f5556, 0f3F800000, %p691;
mul.rn.f32 %f927, %f5556, %f5556;
mov.f32 %f5557, 0fB94D4153;
@%p691 bra $L__BB0_798;
mov.f32 %f3450, 0fBAB607ED;
mov.f32 %f3451, 0f37CBAC00;
fma.rn.f32 %f5557, %f3451, %f927, %f3450;
$L__BB0_798:
selp.f32 %f3452, 0f3C0885E4, 0f3D2AAABB, %p691;
fma.rn.f32 %f3453, %f5557, %f927, %f3452;
selp.f32 %f3454, 0fBE2AAAA8, 0fBEFFFFFF, %p691;
fma.rn.f32 %f3455, %f3453, %f927, %f3454;
mov.f32 %f3456, 0f00000000;
fma.rn.f32 %f3457, %f927, %f926, %f3456;
fma.rn.f32 %f5558, %f3455, %f3457, %f926;
and.b32 %r4766, %r8178, 2;
setp.eq.s32 %p693, %r4766, 0;
@%p693 bra $L__BB0_800;
mov.f32 %f3459, 0fBF800000;
fma.rn.f32 %f5558, %f5558, %f3459, %f3456;
$L__BB0_800:
mul.f32 %f3460, %f857, 0f3F22F983;
cvt.rni.s32.f32 %r8182, %f3460;
cvt.rn.f32.s32 %f3461, %r8182;
mov.f32 %f3462, 0fBFC90FDA;
fma.rn.f32 %f3463, %f3461, %f3462, %f857;
mov.f32 %f3464, 0fB3A22168;
fma.rn.f32 %f3465, %f3461, %f3464, %f3463;
mov.f32 %f3466, 0fA7C234C5;
fma.rn.f32 %f5559, %f3461, %f3466, %f3465;
abs.f32 %f934, %f857;
setp.ltu.f32 %p694, %f934, 0f47CE4780;
@%p694 bra $L__BB0_808;
setp.eq.f32 %p695, %f934, 0f7F800000;
@%p695 bra $L__BB0_807;
bra.uni $L__BB0_802;
$L__BB0_807:
mov.f32 %f3469, 0f00000000;
mul.rn.f32 %f5559, %f857, %f3469;
mov.u32 %r8182, 0;
bra.uni $L__BB0_808;
$L__BB0_802:
mov.b32 %r1048, %f857;
shr.u32 %r4768, %r1048, 23;
and.b32 %r4769, %r4768, 255;
add.s32 %r1049, %r4769, -128;
shl.b32 %r4770, %r1048, 8;
or.b32 %r1050, %r4770, -2147483648;
shr.u32 %r1051, %r1049, 5;
mov.u64 %rd2611, 0;
mov.u32 %r8179, 0;
mov.u64 %rd1451, __cudart_i2opi_f;
mov.u64 %rd2612, %rd2611;
$L__BB0_803:
.pragma "nounroll";
shl.b64 %rd1450, %rd2611, 2;
add.s64 %rd1452, %rd1451, %rd1450;
ld.global.nc.u32 %r4771, [%rd1452];
mad.wide.u32 %rd1453, %r4771, %r1050, %rd2612;
shr.u64 %rd2612, %rd1453, 32;
add.s64 %rd1454, %rd1, %rd1450;
st.local.u32 [%rd1454], %rd1453;
add.s32 %r8179, %r8179, 1;
cvt.s64.s32 %rd2611, %r8179;
setp.ne.s32 %p696, %r8179, 6;
@%p696 bra $L__BB0_803;
st.local.u32 [%rd4], %rd2612;
mov.u32 %r4772, 4;
sub.s32 %r1054, %r4772, %r1051;
mov.u32 %r4773, 6;
sub.s32 %r4774, %r4773, %r1051;
mul.wide.s32 %rd1455, %r4774, 4;
add.s64 %rd1456, %rd1, %rd1455;
ld.local.u32 %r8180, [%rd1456];
ld.local.u32 %r8181, [%rd1456+-4];
and.b32 %r1057, %r1049, 31;
setp.eq.s32 %p697, %r1057, 0;
@%p697 bra $L__BB0_806;
mov.u32 %r4775, 32;
sub.s32 %r4776, %r4775, %r1057;
shr.u32 %r4777, %r8181, %r4776;
shl.b32 %r4778, %r8180, %r1057;
add.s32 %r8180, %r4777, %r4778;
mul.wide.s32 %rd1457, %r1054, 4;
add.s64 %rd1458, %rd1, %rd1457;
ld.local.u32 %r4779, [%rd1458];
shr.u32 %r4780, %r4779, %r4776;
shl.b32 %r4781, %r8181, %r1057;
add.s32 %r8181, %r4780, %r4781;
$L__BB0_806:
and.b32 %r4782, %r1048, -2147483648;
shr.u32 %r4783, %r8181, 30;
shl.b32 %r4784, %r8180, 2;
or.b32 %r4785, %r4783, %r4784;
shr.u32 %r4786, %r4785, 31;
shr.u32 %r4787, %r8180, 30;
add.s32 %r4788, %r4786, %r4787;
neg.s32 %r4789, %r4788;
setp.eq.s32 %p698, %r4782, 0;
selp.b32 %r8182, %r4788, %r4789, %p698;
setp.ne.s32 %p699, %r4786, 0;
xor.b32 %r4790, %r4782, -2147483648;
selp.b32 %r4791, %r4790, %r4782, %p699;
selp.b32 %r4792, -1, 0, %p699;
xor.b32 %r4793, %r4785, %r4792;
shl.b32 %r4794, %r8181, 2;
xor.b32 %r4795, %r4794, %r4792;
cvt.u64.u32 %rd1459, %r4793;
cvt.u64.u32 %rd1460, %r4795;
bfi.b64 %rd1461, %rd1459, %rd1460, 32, 32;
cvt.rn.f64.s64 %fd107, %rd1461;
mul.f64 %fd108, %fd107, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3467, %fd108;
setp.eq.s32 %p700, %r4791, 0;
neg.f32 %f3468, %f3467;
selp.f32 %f5559, %f3467, %f3468, %p700;
$L__BB0_808:
add.s32 %r1064, %r8182, 1;
and.b32 %r1065, %r1064, 1;
setp.eq.s32 %p701, %r1065, 0;
selp.f32 %f938, %f5559, 0f3F800000, %p701;
mul.rn.f32 %f939, %f5559, %f5559;
mov.f32 %f5560, 0fB94D4153;
@%p701 bra $L__BB0_810;
mov.f32 %f3471, 0fBAB607ED;
mov.f32 %f3472, 0f37CBAC00;
fma.rn.f32 %f5560, %f3472, %f939, %f3471;
$L__BB0_810:
selp.f32 %f3473, 0f3C0885E4, 0f3D2AAABB, %p701;
fma.rn.f32 %f3474, %f5560, %f939, %f3473;
selp.f32 %f3475, 0fBE2AAAA8, 0fBEFFFFFF, %p701;
fma.rn.f32 %f3476, %f3474, %f939, %f3475;
mov.f32 %f3477, 0f00000000;
fma.rn.f32 %f3478, %f939, %f938, %f3477;
fma.rn.f32 %f5561, %f3476, %f3478, %f938;
and.b32 %r4797, %r1064, 2;
setp.eq.s32 %p703, %r4797, 0;
@%p703 bra $L__BB0_812;
mov.f32 %f3480, 0fBF800000;
fma.rn.f32 %f5561, %f5561, %f3480, %f3477;
$L__BB0_812:
add.f32 %f5597, %f5558, %f5561;
mul.f32 %f3481, %f866, 0f3F22F983;
cvt.rni.s32.f32 %r8186, %f3481;
cvt.rn.f32.s32 %f3482, %r8186;
mov.f32 %f3483, 0fBFC90FDA;
fma.rn.f32 %f3484, %f3482, %f3483, %f866;
mov.f32 %f3485, 0fB3A22168;
fma.rn.f32 %f3486, %f3482, %f3485, %f3484;
mov.f32 %f3487, 0fA7C234C5;
fma.rn.f32 %f5562, %f3482, %f3487, %f3486;
abs.f32 %f947, %f866;
setp.ltu.f32 %p704, %f947, 0f47CE4780;
@%p704 bra $L__BB0_820;
setp.eq.f32 %p705, %f947, 0f7F800000;
@%p705 bra $L__BB0_819;
bra.uni $L__BB0_814;
$L__BB0_819:
mov.f32 %f3490, 0f00000000;
mul.rn.f32 %f5562, %f866, %f3490;
mov.u32 %r8186, 0;
bra.uni $L__BB0_820;
$L__BB0_814:
mov.b32 %r1067, %f866;
shr.u32 %r4799, %r1067, 23;
and.b32 %r4800, %r4799, 255;
add.s32 %r1068, %r4800, -128;
shl.b32 %r4801, %r1067, 8;
or.b32 %r1069, %r4801, -2147483648;
shr.u32 %r1070, %r1068, 5;
mov.u64 %rd2613, 0;
mov.u32 %r8183, 0;
mov.u64 %rd1465, __cudart_i2opi_f;
mov.u64 %rd2614, %rd2613;
$L__BB0_815:
.pragma "nounroll";
shl.b64 %rd1464, %rd2613, 2;
add.s64 %rd1466, %rd1465, %rd1464;
ld.global.nc.u32 %r4802, [%rd1466];
mad.wide.u32 %rd1467, %r4802, %r1069, %rd2614;
shr.u64 %rd2614, %rd1467, 32;
add.s64 %rd1468, %rd1, %rd1464;
st.local.u32 [%rd1468], %rd1467;
add.s32 %r8183, %r8183, 1;
cvt.s64.s32 %rd2613, %r8183;
setp.ne.s32 %p706, %r8183, 6;
@%p706 bra $L__BB0_815;
st.local.u32 [%rd4], %rd2614;
mov.u32 %r4803, 4;
sub.s32 %r1073, %r4803, %r1070;
mov.u32 %r4804, 6;
sub.s32 %r4805, %r4804, %r1070;
mul.wide.s32 %rd1469, %r4805, 4;
add.s64 %rd1470, %rd1, %rd1469;
ld.local.u32 %r8184, [%rd1470];
ld.local.u32 %r8185, [%rd1470+-4];
and.b32 %r1076, %r1068, 31;
setp.eq.s32 %p707, %r1076, 0;
@%p707 bra $L__BB0_818;
mov.u32 %r4806, 32;
sub.s32 %r4807, %r4806, %r1076;
shr.u32 %r4808, %r8185, %r4807;
shl.b32 %r4809, %r8184, %r1076;
add.s32 %r8184, %r4808, %r4809;
mul.wide.s32 %rd1471, %r1073, 4;
add.s64 %rd1472, %rd1, %rd1471;
ld.local.u32 %r4810, [%rd1472];
shr.u32 %r4811, %r4810, %r4807;
shl.b32 %r4812, %r8185, %r1076;
add.s32 %r8185, %r4811, %r4812;
$L__BB0_818:
and.b32 %r4813, %r1067, -2147483648;
shr.u32 %r4814, %r8185, 30;
shl.b32 %r4815, %r8184, 2;
or.b32 %r4816, %r4814, %r4815;
shr.u32 %r4817, %r4816, 31;
shr.u32 %r4818, %r8184, 30;
add.s32 %r4819, %r4817, %r4818;
neg.s32 %r4820, %r4819;
setp.eq.s32 %p708, %r4813, 0;
selp.b32 %r8186, %r4819, %r4820, %p708;
setp.ne.s32 %p709, %r4817, 0;
xor.b32 %r4821, %r4813, -2147483648;
selp.b32 %r4822, %r4821, %r4813, %p709;
selp.b32 %r4823, -1, 0, %p709;
xor.b32 %r4824, %r4816, %r4823;
shl.b32 %r4825, %r8185, 2;
xor.b32 %r4826, %r4825, %r4823;
cvt.u64.u32 %rd1473, %r4824;
cvt.u64.u32 %rd1474, %r4826;
bfi.b64 %rd1475, %rd1473, %rd1474, 32, 32;
cvt.rn.f64.s64 %fd109, %rd1475;
mul.f64 %fd110, %fd109, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3488, %fd110;
setp.eq.s32 %p710, %r4822, 0;
neg.f32 %f3489, %f3488;
selp.f32 %f5562, %f3488, %f3489, %p710;
$L__BB0_820:
and.b32 %r1083, %r8186, 1;
setp.eq.s32 %p711, %r1083, 0;
selp.f32 %f951, %f5562, 0f3F800000, %p711;
mul.rn.f32 %f952, %f5562, %f5562;
mov.f32 %f5563, 0fB94D4153;
@%p711 bra $L__BB0_822;
mov.f32 %f3492, 0fBAB607ED;
mov.f32 %f3493, 0f37CBAC00;
fma.rn.f32 %f5563, %f3493, %f952, %f3492;
$L__BB0_822:
selp.f32 %f3494, 0f3C0885E4, 0f3D2AAABB, %p711;
fma.rn.f32 %f3495, %f5563, %f952, %f3494;
selp.f32 %f3496, 0fBE2AAAA8, 0fBEFFFFFF, %p711;
fma.rn.f32 %f3497, %f3495, %f952, %f3496;
mov.f32 %f3498, 0f00000000;
fma.rn.f32 %f3499, %f952, %f951, %f3498;
fma.rn.f32 %f5564, %f3497, %f3499, %f951;
and.b32 %r4828, %r8186, 2;
setp.eq.s32 %p713, %r4828, 0;
@%p713 bra $L__BB0_824;
mov.f32 %f3501, 0fBF800000;
fma.rn.f32 %f5564, %f5564, %f3501, %f3498;
$L__BB0_824:
mul.f32 %f3502, %f858, 0f3F22F983;
cvt.rni.s32.f32 %r8190, %f3502;
cvt.rn.f32.s32 %f3503, %r8190;
mov.f32 %f3504, 0fBFC90FDA;
fma.rn.f32 %f3505, %f3503, %f3504, %f858;
mov.f32 %f3506, 0fB3A22168;
fma.rn.f32 %f3507, %f3503, %f3506, %f3505;
mov.f32 %f3508, 0fA7C234C5;
fma.rn.f32 %f5565, %f3503, %f3508, %f3507;
abs.f32 %f959, %f858;
setp.ltu.f32 %p714, %f959, 0f47CE4780;
@%p714 bra $L__BB0_832;
setp.eq.f32 %p715, %f959, 0f7F800000;
@%p715 bra $L__BB0_831;
bra.uni $L__BB0_826;
$L__BB0_831:
mov.f32 %f3511, 0f00000000;
mul.rn.f32 %f5565, %f858, %f3511;
mov.u32 %r8190, 0;
bra.uni $L__BB0_832;
$L__BB0_826:
mov.b32 %r1085, %f858;
shr.u32 %r4830, %r1085, 23;
and.b32 %r4831, %r4830, 255;
add.s32 %r1086, %r4831, -128;
shl.b32 %r4832, %r1085, 8;
or.b32 %r1087, %r4832, -2147483648;
shr.u32 %r1088, %r1086, 5;
mov.u64 %rd2615, 0;
mov.u32 %r8187, 0;
mov.u64 %rd1479, __cudart_i2opi_f;
mov.u64 %rd2616, %rd2615;
$L__BB0_827:
.pragma "nounroll";
shl.b64 %rd1478, %rd2615, 2;
add.s64 %rd1480, %rd1479, %rd1478;
ld.global.nc.u32 %r4833, [%rd1480];
mad.wide.u32 %rd1481, %r4833, %r1087, %rd2616;
shr.u64 %rd2616, %rd1481, 32;
add.s64 %rd1482, %rd1, %rd1478;
st.local.u32 [%rd1482], %rd1481;
add.s32 %r8187, %r8187, 1;
cvt.s64.s32 %rd2615, %r8187;
setp.ne.s32 %p716, %r8187, 6;
@%p716 bra $L__BB0_827;
st.local.u32 [%rd4], %rd2616;
mov.u32 %r4834, 4;
sub.s32 %r1091, %r4834, %r1088;
mov.u32 %r4835, 6;
sub.s32 %r4836, %r4835, %r1088;
mul.wide.s32 %rd1483, %r4836, 4;
add.s64 %rd1484, %rd1, %rd1483;
ld.local.u32 %r8188, [%rd1484];
ld.local.u32 %r8189, [%rd1484+-4];
and.b32 %r1094, %r1086, 31;
setp.eq.s32 %p717, %r1094, 0;
@%p717 bra $L__BB0_830;
mov.u32 %r4837, 32;
sub.s32 %r4838, %r4837, %r1094;
shr.u32 %r4839, %r8189, %r4838;
shl.b32 %r4840, %r8188, %r1094;
add.s32 %r8188, %r4839, %r4840;
mul.wide.s32 %rd1485, %r1091, 4;
add.s64 %rd1486, %rd1, %rd1485;
ld.local.u32 %r4841, [%rd1486];
shr.u32 %r4842, %r4841, %r4838;
shl.b32 %r4843, %r8189, %r1094;
add.s32 %r8189, %r4842, %r4843;
$L__BB0_830:
and.b32 %r4844, %r1085, -2147483648;
shr.u32 %r4845, %r8189, 30;
shl.b32 %r4846, %r8188, 2;
or.b32 %r4847, %r4845, %r4846;
shr.u32 %r4848, %r4847, 31;
shr.u32 %r4849, %r8188, 30;
add.s32 %r4850, %r4848, %r4849;
neg.s32 %r4851, %r4850;
setp.eq.s32 %p718, %r4844, 0;
selp.b32 %r8190, %r4850, %r4851, %p718;
setp.ne.s32 %p719, %r4848, 0;
xor.b32 %r4852, %r4844, -2147483648;
selp.b32 %r4853, %r4852, %r4844, %p719;
selp.b32 %r4854, -1, 0, %p719;
xor.b32 %r4855, %r4847, %r4854;
shl.b32 %r4856, %r8189, 2;
xor.b32 %r4857, %r4856, %r4854;
cvt.u64.u32 %rd1487, %r4855;
cvt.u64.u32 %rd1488, %r4857;
bfi.b64 %rd1489, %rd1487, %rd1488, 32, 32;
cvt.rn.f64.s64 %fd111, %rd1489;
mul.f64 %fd112, %fd111, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3509, %fd112;
setp.eq.s32 %p720, %r4853, 0;
neg.f32 %f3510, %f3509;
selp.f32 %f5565, %f3509, %f3510, %p720;
$L__BB0_832:
add.s32 %r1101, %r8190, 1;
and.b32 %r1102, %r1101, 1;
setp.eq.s32 %p721, %r1102, 0;
selp.f32 %f963, %f5565, 0f3F800000, %p721;
mul.rn.f32 %f964, %f5565, %f5565;
mov.f32 %f5566, 0fB94D4153;
@%p721 bra $L__BB0_834;
mov.f32 %f3513, 0fBAB607ED;
mov.f32 %f3514, 0f37CBAC00;
fma.rn.f32 %f5566, %f3514, %f964, %f3513;
$L__BB0_834:
selp.f32 %f3515, 0f3C0885E4, 0f3D2AAABB, %p721;
fma.rn.f32 %f3516, %f5566, %f964, %f3515;
selp.f32 %f3517, 0fBE2AAAA8, 0fBEFFFFFF, %p721;
fma.rn.f32 %f3518, %f3516, %f964, %f3517;
mov.f32 %f3519, 0f00000000;
fma.rn.f32 %f3520, %f964, %f963, %f3519;
fma.rn.f32 %f5567, %f3518, %f3520, %f963;
and.b32 %r4859, %r1101, 2;
setp.eq.s32 %p723, %r4859, 0;
@%p723 bra $L__BB0_836;
mov.f32 %f3522, 0fBF800000;
fma.rn.f32 %f5567, %f5567, %f3522, %f3519;
$L__BB0_836:
add.f32 %f5596, %f5564, %f5567;
mul.f32 %f3523, %f867, 0f3F22F983;
cvt.rni.s32.f32 %r8194, %f3523;
cvt.rn.f32.s32 %f3524, %r8194;
mov.f32 %f3525, 0fBFC90FDA;
fma.rn.f32 %f3526, %f3524, %f3525, %f867;
mov.f32 %f3527, 0fB3A22168;
fma.rn.f32 %f3528, %f3524, %f3527, %f3526;
mov.f32 %f3529, 0fA7C234C5;
fma.rn.f32 %f5568, %f3524, %f3529, %f3528;
abs.f32 %f972, %f867;
setp.ltu.f32 %p724, %f972, 0f47CE4780;
@%p724 bra $L__BB0_844;
setp.eq.f32 %p725, %f972, 0f7F800000;
@%p725 bra $L__BB0_843;
bra.uni $L__BB0_838;
$L__BB0_843:
mov.f32 %f3532, 0f00000000;
mul.rn.f32 %f5568, %f867, %f3532;
mov.u32 %r8194, 0;
bra.uni $L__BB0_844;
$L__BB0_838:
mov.b32 %r1104, %f867;
shr.u32 %r4861, %r1104, 23;
and.b32 %r4862, %r4861, 255;
add.s32 %r1105, %r4862, -128;
shl.b32 %r4863, %r1104, 8;
or.b32 %r1106, %r4863, -2147483648;
shr.u32 %r1107, %r1105, 5;
mov.u64 %rd2617, 0;
mov.u32 %r8191, 0;
mov.u64 %rd1493, __cudart_i2opi_f;
mov.u64 %rd2618, %rd2617;
$L__BB0_839:
.pragma "nounroll";
shl.b64 %rd1492, %rd2617, 2;
add.s64 %rd1494, %rd1493, %rd1492;
ld.global.nc.u32 %r4864, [%rd1494];
mad.wide.u32 %rd1495, %r4864, %r1106, %rd2618;
shr.u64 %rd2618, %rd1495, 32;
add.s64 %rd1496, %rd1, %rd1492;
st.local.u32 [%rd1496], %rd1495;
add.s32 %r8191, %r8191, 1;
cvt.s64.s32 %rd2617, %r8191;
setp.ne.s32 %p726, %r8191, 6;
@%p726 bra $L__BB0_839;
st.local.u32 [%rd4], %rd2618;
mov.u32 %r4865, 4;
sub.s32 %r1110, %r4865, %r1107;
mov.u32 %r4866, 6;
sub.s32 %r4867, %r4866, %r1107;
mul.wide.s32 %rd1497, %r4867, 4;
add.s64 %rd1498, %rd1, %rd1497;
ld.local.u32 %r8192, [%rd1498];
ld.local.u32 %r8193, [%rd1498+-4];
and.b32 %r1113, %r1105, 31;
setp.eq.s32 %p727, %r1113, 0;
@%p727 bra $L__BB0_842;
mov.u32 %r4868, 32;
sub.s32 %r4869, %r4868, %r1113;
shr.u32 %r4870, %r8193, %r4869;
shl.b32 %r4871, %r8192, %r1113;
add.s32 %r8192, %r4870, %r4871;
mul.wide.s32 %rd1499, %r1110, 4;
add.s64 %rd1500, %rd1, %rd1499;
ld.local.u32 %r4872, [%rd1500];
shr.u32 %r4873, %r4872, %r4869;
shl.b32 %r4874, %r8193, %r1113;
add.s32 %r8193, %r4873, %r4874;
$L__BB0_842:
and.b32 %r4875, %r1104, -2147483648;
shr.u32 %r4876, %r8193, 30;
shl.b32 %r4877, %r8192, 2;
or.b32 %r4878, %r4876, %r4877;
shr.u32 %r4879, %r4878, 31;
shr.u32 %r4880, %r8192, 30;
add.s32 %r4881, %r4879, %r4880;
neg.s32 %r4882, %r4881;
setp.eq.s32 %p728, %r4875, 0;
selp.b32 %r8194, %r4881, %r4882, %p728;
setp.ne.s32 %p729, %r4879, 0;
xor.b32 %r4883, %r4875, -2147483648;
selp.b32 %r4884, %r4883, %r4875, %p729;
selp.b32 %r4885, -1, 0, %p729;
xor.b32 %r4886, %r4878, %r4885;
shl.b32 %r4887, %r8193, 2;
xor.b32 %r4888, %r4887, %r4885;
cvt.u64.u32 %rd1501, %r4886;
cvt.u64.u32 %rd1502, %r4888;
bfi.b64 %rd1503, %rd1501, %rd1502, 32, 32;
cvt.rn.f64.s64 %fd113, %rd1503;
mul.f64 %fd114, %fd113, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3530, %fd114;
setp.eq.s32 %p730, %r4884, 0;
neg.f32 %f3531, %f3530;
selp.f32 %f5568, %f3530, %f3531, %p730;
$L__BB0_844:
and.b32 %r1120, %r8194, 1;
setp.eq.s32 %p731, %r1120, 0;
selp.f32 %f976, %f5568, 0f3F800000, %p731;
mul.rn.f32 %f977, %f5568, %f5568;
mov.f32 %f5569, 0fB94D4153;
@%p731 bra $L__BB0_846;
mov.f32 %f3534, 0fBAB607ED;
mov.f32 %f3535, 0f37CBAC00;
fma.rn.f32 %f5569, %f3535, %f977, %f3534;
$L__BB0_846:
selp.f32 %f3536, 0f3C0885E4, 0f3D2AAABB, %p731;
fma.rn.f32 %f3537, %f5569, %f977, %f3536;
selp.f32 %f3538, 0fBE2AAAA8, 0fBEFFFFFF, %p731;
fma.rn.f32 %f3539, %f3537, %f977, %f3538;
mov.f32 %f3540, 0f00000000;
fma.rn.f32 %f3541, %f977, %f976, %f3540;
fma.rn.f32 %f5570, %f3539, %f3541, %f976;
and.b32 %r4890, %r8194, 2;
setp.eq.s32 %p733, %r4890, 0;
@%p733 bra $L__BB0_848;
mov.f32 %f3543, 0fBF800000;
fma.rn.f32 %f5570, %f5570, %f3543, %f3540;
$L__BB0_848:
mul.f32 %f3544, %f859, 0f3F22F983;
cvt.rni.s32.f32 %r8198, %f3544;
cvt.rn.f32.s32 %f3545, %r8198;
mov.f32 %f3546, 0fBFC90FDA;
fma.rn.f32 %f3547, %f3545, %f3546, %f859;
mov.f32 %f3548, 0fB3A22168;
fma.rn.f32 %f3549, %f3545, %f3548, %f3547;
mov.f32 %f3550, 0fA7C234C5;
fma.rn.f32 %f5571, %f3545, %f3550, %f3549;
abs.f32 %f984, %f859;
setp.ltu.f32 %p734, %f984, 0f47CE4780;
@%p734 bra $L__BB0_856;
setp.eq.f32 %p735, %f984, 0f7F800000;
@%p735 bra $L__BB0_855;
bra.uni $L__BB0_850;
$L__BB0_855:
mov.f32 %f3553, 0f00000000;
mul.rn.f32 %f5571, %f859, %f3553;
mov.u32 %r8198, 0;
bra.uni $L__BB0_856;
$L__BB0_850:
mov.b32 %r1122, %f859;
shr.u32 %r4892, %r1122, 23;
and.b32 %r4893, %r4892, 255;
add.s32 %r1123, %r4893, -128;
shl.b32 %r4894, %r1122, 8;
or.b32 %r1124, %r4894, -2147483648;
shr.u32 %r1125, %r1123, 5;
mov.u64 %rd2619, 0;
mov.u32 %r8195, 0;
mov.u64 %rd1507, __cudart_i2opi_f;
mov.u64 %rd2620, %rd2619;
$L__BB0_851:
.pragma "nounroll";
shl.b64 %rd1506, %rd2619, 2;
add.s64 %rd1508, %rd1507, %rd1506;
ld.global.nc.u32 %r4895, [%rd1508];
mad.wide.u32 %rd1509, %r4895, %r1124, %rd2620;
shr.u64 %rd2620, %rd1509, 32;
add.s64 %rd1510, %rd1, %rd1506;
st.local.u32 [%rd1510], %rd1509;
add.s32 %r8195, %r8195, 1;
cvt.s64.s32 %rd2619, %r8195;
setp.ne.s32 %p736, %r8195, 6;
@%p736 bra $L__BB0_851;
st.local.u32 [%rd4], %rd2620;
mov.u32 %r4896, 4;
sub.s32 %r1128, %r4896, %r1125;
mov.u32 %r4897, 6;
sub.s32 %r4898, %r4897, %r1125;
mul.wide.s32 %rd1511, %r4898, 4;
add.s64 %rd1512, %rd1, %rd1511;
ld.local.u32 %r8196, [%rd1512];
ld.local.u32 %r8197, [%rd1512+-4];
and.b32 %r1131, %r1123, 31;
setp.eq.s32 %p737, %r1131, 0;
@%p737 bra $L__BB0_854;
mov.u32 %r4899, 32;
sub.s32 %r4900, %r4899, %r1131;
shr.u32 %r4901, %r8197, %r4900;
shl.b32 %r4902, %r8196, %r1131;
add.s32 %r8196, %r4901, %r4902;
mul.wide.s32 %rd1513, %r1128, 4;
add.s64 %rd1514, %rd1, %rd1513;
ld.local.u32 %r4903, [%rd1514];
shr.u32 %r4904, %r4903, %r4900;
shl.b32 %r4905, %r8197, %r1131;
add.s32 %r8197, %r4904, %r4905;
$L__BB0_854:
and.b32 %r4906, %r1122, -2147483648;
shr.u32 %r4907, %r8197, 30;
shl.b32 %r4908, %r8196, 2;
or.b32 %r4909, %r4907, %r4908;
shr.u32 %r4910, %r4909, 31;
shr.u32 %r4911, %r8196, 30;
add.s32 %r4912, %r4910, %r4911;
neg.s32 %r4913, %r4912;
setp.eq.s32 %p738, %r4906, 0;
selp.b32 %r8198, %r4912, %r4913, %p738;
setp.ne.s32 %p739, %r4910, 0;
xor.b32 %r4914, %r4906, -2147483648;
selp.b32 %r4915, %r4914, %r4906, %p739;
selp.b32 %r4916, -1, 0, %p739;
xor.b32 %r4917, %r4909, %r4916;
shl.b32 %r4918, %r8197, 2;
xor.b32 %r4919, %r4918, %r4916;
cvt.u64.u32 %rd1515, %r4917;
cvt.u64.u32 %rd1516, %r4919;
bfi.b64 %rd1517, %rd1515, %rd1516, 32, 32;
cvt.rn.f64.s64 %fd115, %rd1517;
mul.f64 %fd116, %fd115, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3551, %fd116;
setp.eq.s32 %p740, %r4915, 0;
neg.f32 %f3552, %f3551;
selp.f32 %f5571, %f3551, %f3552, %p740;
$L__BB0_856:
add.s32 %r1138, %r8198, 1;
and.b32 %r1139, %r1138, 1;
setp.eq.s32 %p741, %r1139, 0;
selp.f32 %f988, %f5571, 0f3F800000, %p741;
mul.rn.f32 %f989, %f5571, %f5571;
mov.f32 %f5572, 0fB94D4153;
@%p741 bra $L__BB0_858;
mov.f32 %f3555, 0fBAB607ED;
mov.f32 %f3556, 0f37CBAC00;
fma.rn.f32 %f5572, %f3556, %f989, %f3555;
$L__BB0_858:
selp.f32 %f3557, 0f3C0885E4, 0f3D2AAABB, %p741;
fma.rn.f32 %f3558, %f5572, %f989, %f3557;
selp.f32 %f3559, 0fBE2AAAA8, 0fBEFFFFFF, %p741;
fma.rn.f32 %f3560, %f3558, %f989, %f3559;
mov.f32 %f3561, 0f00000000;
fma.rn.f32 %f3562, %f989, %f988, %f3561;
fma.rn.f32 %f5573, %f3560, %f3562, %f988;
and.b32 %r4921, %r1138, 2;
setp.eq.s32 %p743, %r4921, 0;
@%p743 bra $L__BB0_860;
mov.f32 %f3564, 0fBF800000;
fma.rn.f32 %f5573, %f5573, %f3564, %f3561;
$L__BB0_860:
add.f32 %f5595, %f5570, %f5573;
mul.f32 %f3565, %f868, 0f3F22F983;
cvt.rni.s32.f32 %r8202, %f3565;
cvt.rn.f32.s32 %f3566, %r8202;
mov.f32 %f3567, 0fBFC90FDA;
fma.rn.f32 %f3568, %f3566, %f3567, %f868;
mov.f32 %f3569, 0fB3A22168;
fma.rn.f32 %f3570, %f3566, %f3569, %f3568;
mov.f32 %f3571, 0fA7C234C5;
fma.rn.f32 %f5574, %f3566, %f3571, %f3570;
abs.f32 %f997, %f868;
setp.ltu.f32 %p744, %f997, 0f47CE4780;
@%p744 bra $L__BB0_868;
setp.eq.f32 %p745, %f997, 0f7F800000;
@%p745 bra $L__BB0_867;
bra.uni $L__BB0_862;
$L__BB0_867:
mov.f32 %f3574, 0f00000000;
mul.rn.f32 %f5574, %f868, %f3574;
mov.u32 %r8202, 0;
bra.uni $L__BB0_868;
$L__BB0_862:
mov.b32 %r1141, %f868;
shr.u32 %r4923, %r1141, 23;
and.b32 %r4924, %r4923, 255;
add.s32 %r1142, %r4924, -128;
shl.b32 %r4925, %r1141, 8;
or.b32 %r1143, %r4925, -2147483648;
shr.u32 %r1144, %r1142, 5;
mov.u64 %rd2621, 0;
mov.u32 %r8199, 0;
mov.u64 %rd1521, __cudart_i2opi_f;
mov.u64 %rd2622, %rd2621;
$L__BB0_863:
.pragma "nounroll";
shl.b64 %rd1520, %rd2621, 2;
add.s64 %rd1522, %rd1521, %rd1520;
ld.global.nc.u32 %r4926, [%rd1522];
mad.wide.u32 %rd1523, %r4926, %r1143, %rd2622;
shr.u64 %rd2622, %rd1523, 32;
add.s64 %rd1524, %rd1, %rd1520;
st.local.u32 [%rd1524], %rd1523;
add.s32 %r8199, %r8199, 1;
cvt.s64.s32 %rd2621, %r8199;
setp.ne.s32 %p746, %r8199, 6;
@%p746 bra $L__BB0_863;
st.local.u32 [%rd4], %rd2622;
mov.u32 %r4927, 4;
sub.s32 %r1147, %r4927, %r1144;
mov.u32 %r4928, 6;
sub.s32 %r4929, %r4928, %r1144;
mul.wide.s32 %rd1525, %r4929, 4;
add.s64 %rd1526, %rd1, %rd1525;
ld.local.u32 %r8200, [%rd1526];
ld.local.u32 %r8201, [%rd1526+-4];
and.b32 %r1150, %r1142, 31;
setp.eq.s32 %p747, %r1150, 0;
@%p747 bra $L__BB0_866;
mov.u32 %r4930, 32;
sub.s32 %r4931, %r4930, %r1150;
shr.u32 %r4932, %r8201, %r4931;
shl.b32 %r4933, %r8200, %r1150;
add.s32 %r8200, %r4932, %r4933;
mul.wide.s32 %rd1527, %r1147, 4;
add.s64 %rd1528, %rd1, %rd1527;
ld.local.u32 %r4934, [%rd1528];
shr.u32 %r4935, %r4934, %r4931;
shl.b32 %r4936, %r8201, %r1150;
add.s32 %r8201, %r4935, %r4936;
$L__BB0_866:
and.b32 %r4937, %r1141, -2147483648;
shr.u32 %r4938, %r8201, 30;
shl.b32 %r4939, %r8200, 2;
or.b32 %r4940, %r4938, %r4939;
shr.u32 %r4941, %r4940, 31;
shr.u32 %r4942, %r8200, 30;
add.s32 %r4943, %r4941, %r4942;
neg.s32 %r4944, %r4943;
setp.eq.s32 %p748, %r4937, 0;
selp.b32 %r8202, %r4943, %r4944, %p748;
setp.ne.s32 %p749, %r4941, 0;
xor.b32 %r4945, %r4937, -2147483648;
selp.b32 %r4946, %r4945, %r4937, %p749;
selp.b32 %r4947, -1, 0, %p749;
xor.b32 %r4948, %r4940, %r4947;
shl.b32 %r4949, %r8201, 2;
xor.b32 %r4950, %r4949, %r4947;
cvt.u64.u32 %rd1529, %r4948;
cvt.u64.u32 %rd1530, %r4950;
bfi.b64 %rd1531, %rd1529, %rd1530, 32, 32;
cvt.rn.f64.s64 %fd117, %rd1531;
mul.f64 %fd118, %fd117, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3572, %fd118;
setp.eq.s32 %p750, %r4946, 0;
neg.f32 %f3573, %f3572;
selp.f32 %f5574, %f3572, %f3573, %p750;
$L__BB0_868:
and.b32 %r1157, %r8202, 1;
setp.eq.s32 %p751, %r1157, 0;
selp.f32 %f1001, %f5574, 0f3F800000, %p751;
mul.rn.f32 %f1002, %f5574, %f5574;
mov.f32 %f5575, 0fB94D4153;
@%p751 bra $L__BB0_870;
mov.f32 %f3576, 0fBAB607ED;
mov.f32 %f3577, 0f37CBAC00;
fma.rn.f32 %f5575, %f3577, %f1002, %f3576;
$L__BB0_870:
selp.f32 %f3578, 0f3C0885E4, 0f3D2AAABB, %p751;
fma.rn.f32 %f3579, %f5575, %f1002, %f3578;
selp.f32 %f3580, 0fBE2AAAA8, 0fBEFFFFFF, %p751;
fma.rn.f32 %f3581, %f3579, %f1002, %f3580;
mov.f32 %f3582, 0f00000000;
fma.rn.f32 %f3583, %f1002, %f1001, %f3582;
fma.rn.f32 %f5576, %f3581, %f3583, %f1001;
and.b32 %r4952, %r8202, 2;
setp.eq.s32 %p753, %r4952, 0;
@%p753 bra $L__BB0_872;
mov.f32 %f3585, 0fBF800000;
fma.rn.f32 %f5576, %f5576, %f3585, %f3582;
$L__BB0_872:
mul.f32 %f3586, %f860, 0f3F22F983;
cvt.rni.s32.f32 %r8206, %f3586;
cvt.rn.f32.s32 %f3587, %r8206;
mov.f32 %f3588, 0fBFC90FDA;
fma.rn.f32 %f3589, %f3587, %f3588, %f860;
mov.f32 %f3590, 0fB3A22168;
fma.rn.f32 %f3591, %f3587, %f3590, %f3589;
mov.f32 %f3592, 0fA7C234C5;
fma.rn.f32 %f5577, %f3587, %f3592, %f3591;
abs.f32 %f1009, %f860;
setp.ltu.f32 %p754, %f1009, 0f47CE4780;
@%p754 bra $L__BB0_880;
setp.eq.f32 %p755, %f1009, 0f7F800000;
@%p755 bra $L__BB0_879;
bra.uni $L__BB0_874;
$L__BB0_879:
mov.f32 %f3595, 0f00000000;
mul.rn.f32 %f5577, %f860, %f3595;
mov.u32 %r8206, 0;
bra.uni $L__BB0_880;
$L__BB0_874:
mov.b32 %r1159, %f860;
shr.u32 %r4954, %r1159, 23;
and.b32 %r4955, %r4954, 255;
add.s32 %r1160, %r4955, -128;
shl.b32 %r4956, %r1159, 8;
or.b32 %r1161, %r4956, -2147483648;
shr.u32 %r1162, %r1160, 5;
mov.u64 %rd2623, 0;
mov.u32 %r8203, 0;
mov.u64 %rd1535, __cudart_i2opi_f;
mov.u64 %rd2624, %rd2623;
$L__BB0_875:
.pragma "nounroll";
shl.b64 %rd1534, %rd2623, 2;
add.s64 %rd1536, %rd1535, %rd1534;
ld.global.nc.u32 %r4957, [%rd1536];
mad.wide.u32 %rd1537, %r4957, %r1161, %rd2624;
shr.u64 %rd2624, %rd1537, 32;
add.s64 %rd1538, %rd1, %rd1534;
st.local.u32 [%rd1538], %rd1537;
add.s32 %r8203, %r8203, 1;
cvt.s64.s32 %rd2623, %r8203;
setp.ne.s32 %p756, %r8203, 6;
@%p756 bra $L__BB0_875;
st.local.u32 [%rd4], %rd2624;
mov.u32 %r4958, 4;
sub.s32 %r1165, %r4958, %r1162;
mov.u32 %r4959, 6;
sub.s32 %r4960, %r4959, %r1162;
mul.wide.s32 %rd1539, %r4960, 4;
add.s64 %rd1540, %rd1, %rd1539;
ld.local.u32 %r8204, [%rd1540];
ld.local.u32 %r8205, [%rd1540+-4];
and.b32 %r1168, %r1160, 31;
setp.eq.s32 %p757, %r1168, 0;
@%p757 bra $L__BB0_878;
mov.u32 %r4961, 32;
sub.s32 %r4962, %r4961, %r1168;
shr.u32 %r4963, %r8205, %r4962;
shl.b32 %r4964, %r8204, %r1168;
add.s32 %r8204, %r4963, %r4964;
mul.wide.s32 %rd1541, %r1165, 4;
add.s64 %rd1542, %rd1, %rd1541;
ld.local.u32 %r4965, [%rd1542];
shr.u32 %r4966, %r4965, %r4962;
shl.b32 %r4967, %r8205, %r1168;
add.s32 %r8205, %r4966, %r4967;
$L__BB0_878:
and.b32 %r4968, %r1159, -2147483648;
shr.u32 %r4969, %r8205, 30;
shl.b32 %r4970, %r8204, 2;
or.b32 %r4971, %r4969, %r4970;
shr.u32 %r4972, %r4971, 31;
shr.u32 %r4973, %r8204, 30;
add.s32 %r4974, %r4972, %r4973;
neg.s32 %r4975, %r4974;
setp.eq.s32 %p758, %r4968, 0;
selp.b32 %r8206, %r4974, %r4975, %p758;
setp.ne.s32 %p759, %r4972, 0;
xor.b32 %r4976, %r4968, -2147483648;
selp.b32 %r4977, %r4976, %r4968, %p759;
selp.b32 %r4978, -1, 0, %p759;
xor.b32 %r4979, %r4971, %r4978;
shl.b32 %r4980, %r8205, 2;
xor.b32 %r4981, %r4980, %r4978;
cvt.u64.u32 %rd1543, %r4979;
cvt.u64.u32 %rd1544, %r4981;
bfi.b64 %rd1545, %rd1543, %rd1544, 32, 32;
cvt.rn.f64.s64 %fd119, %rd1545;
mul.f64 %fd120, %fd119, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3593, %fd120;
setp.eq.s32 %p760, %r4977, 0;
neg.f32 %f3594, %f3593;
selp.f32 %f5577, %f3593, %f3594, %p760;
$L__BB0_880:
add.s32 %r1175, %r8206, 1;
and.b32 %r1176, %r1175, 1;
setp.eq.s32 %p761, %r1176, 0;
selp.f32 %f1013, %f5577, 0f3F800000, %p761;
mul.rn.f32 %f1014, %f5577, %f5577;
mov.f32 %f5578, 0fB94D4153;
@%p761 bra $L__BB0_882;
mov.f32 %f3597, 0fBAB607ED;
mov.f32 %f3598, 0f37CBAC00;
fma.rn.f32 %f5578, %f3598, %f1014, %f3597;
$L__BB0_882:
selp.f32 %f3599, 0f3C0885E4, 0f3D2AAABB, %p761;
fma.rn.f32 %f3600, %f5578, %f1014, %f3599;
selp.f32 %f3601, 0fBE2AAAA8, 0fBEFFFFFF, %p761;
fma.rn.f32 %f3602, %f3600, %f1014, %f3601;
mov.f32 %f3603, 0f00000000;
fma.rn.f32 %f3604, %f1014, %f1013, %f3603;
fma.rn.f32 %f5579, %f3602, %f3604, %f1013;
and.b32 %r4983, %r1175, 2;
setp.eq.s32 %p763, %r4983, 0;
@%p763 bra $L__BB0_884;
mov.f32 %f3606, 0fBF800000;
fma.rn.f32 %f5579, %f5579, %f3606, %f3603;
$L__BB0_884:
add.f32 %f5594, %f5576, %f5579;
mul.f32 %f3607, %f869, 0f3F22F983;
cvt.rni.s32.f32 %r8210, %f3607;
cvt.rn.f32.s32 %f3608, %r8210;
mov.f32 %f3609, 0fBFC90FDA;
fma.rn.f32 %f3610, %f3608, %f3609, %f869;
mov.f32 %f3611, 0fB3A22168;
fma.rn.f32 %f3612, %f3608, %f3611, %f3610;
mov.f32 %f3613, 0fA7C234C5;
fma.rn.f32 %f5580, %f3608, %f3613, %f3612;
abs.f32 %f1022, %f869;
setp.ltu.f32 %p764, %f1022, 0f47CE4780;
@%p764 bra $L__BB0_892;
setp.eq.f32 %p765, %f1022, 0f7F800000;
@%p765 bra $L__BB0_891;
bra.uni $L__BB0_886;
$L__BB0_891:
mov.f32 %f3616, 0f00000000;
mul.rn.f32 %f5580, %f869, %f3616;
mov.u32 %r8210, 0;
bra.uni $L__BB0_892;
$L__BB0_886:
mov.b32 %r1178, %f869;
shr.u32 %r4985, %r1178, 23;
and.b32 %r4986, %r4985, 255;
add.s32 %r1179, %r4986, -128;
shl.b32 %r4987, %r1178, 8;
or.b32 %r1180, %r4987, -2147483648;
shr.u32 %r1181, %r1179, 5;
mov.u64 %rd2625, 0;
mov.u32 %r8207, 0;
mov.u64 %rd1549, __cudart_i2opi_f;
mov.u64 %rd2626, %rd2625;
$L__BB0_887:
.pragma "nounroll";
shl.b64 %rd1548, %rd2625, 2;
add.s64 %rd1550, %rd1549, %rd1548;
ld.global.nc.u32 %r4988, [%rd1550];
mad.wide.u32 %rd1551, %r4988, %r1180, %rd2626;
shr.u64 %rd2626, %rd1551, 32;
add.s64 %rd1552, %rd1, %rd1548;
st.local.u32 [%rd1552], %rd1551;
add.s32 %r8207, %r8207, 1;
cvt.s64.s32 %rd2625, %r8207;
setp.ne.s32 %p766, %r8207, 6;
@%p766 bra $L__BB0_887;
st.local.u32 [%rd4], %rd2626;
mov.u32 %r4989, 4;
sub.s32 %r1184, %r4989, %r1181;
mov.u32 %r4990, 6;
sub.s32 %r4991, %r4990, %r1181;
mul.wide.s32 %rd1553, %r4991, 4;
add.s64 %rd1554, %rd1, %rd1553;
ld.local.u32 %r8208, [%rd1554];
ld.local.u32 %r8209, [%rd1554+-4];
and.b32 %r1187, %r1179, 31;
setp.eq.s32 %p767, %r1187, 0;
@%p767 bra $L__BB0_890;
mov.u32 %r4992, 32;
sub.s32 %r4993, %r4992, %r1187;
shr.u32 %r4994, %r8209, %r4993;
shl.b32 %r4995, %r8208, %r1187;
add.s32 %r8208, %r4994, %r4995;
mul.wide.s32 %rd1555, %r1184, 4;
add.s64 %rd1556, %rd1, %rd1555;
ld.local.u32 %r4996, [%rd1556];
shr.u32 %r4997, %r4996, %r4993;
shl.b32 %r4998, %r8209, %r1187;
add.s32 %r8209, %r4997, %r4998;
$L__BB0_890:
and.b32 %r4999, %r1178, -2147483648;
shr.u32 %r5000, %r8209, 30;
shl.b32 %r5001, %r8208, 2;
or.b32 %r5002, %r5000, %r5001;
shr.u32 %r5003, %r5002, 31;
shr.u32 %r5004, %r8208, 30;
add.s32 %r5005, %r5003, %r5004;
neg.s32 %r5006, %r5005;
setp.eq.s32 %p768, %r4999, 0;
selp.b32 %r8210, %r5005, %r5006, %p768;
setp.ne.s32 %p769, %r5003, 0;
xor.b32 %r5007, %r4999, -2147483648;
selp.b32 %r5008, %r5007, %r4999, %p769;
selp.b32 %r5009, -1, 0, %p769;
xor.b32 %r5010, %r5002, %r5009;
shl.b32 %r5011, %r8209, 2;
xor.b32 %r5012, %r5011, %r5009;
cvt.u64.u32 %rd1557, %r5010;
cvt.u64.u32 %rd1558, %r5012;
bfi.b64 %rd1559, %rd1557, %rd1558, 32, 32;
cvt.rn.f64.s64 %fd121, %rd1559;
mul.f64 %fd122, %fd121, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3614, %fd122;
setp.eq.s32 %p770, %r5008, 0;
neg.f32 %f3615, %f3614;
selp.f32 %f5580, %f3614, %f3615, %p770;
$L__BB0_892:
and.b32 %r1194, %r8210, 1;
setp.eq.s32 %p771, %r1194, 0;
selp.f32 %f1026, %f5580, 0f3F800000, %p771;
mul.rn.f32 %f1027, %f5580, %f5580;
mov.f32 %f5581, 0fB94D4153;
@%p771 bra $L__BB0_894;
mov.f32 %f3618, 0fBAB607ED;
mov.f32 %f3619, 0f37CBAC00;
fma.rn.f32 %f5581, %f3619, %f1027, %f3618;
$L__BB0_894:
selp.f32 %f3620, 0f3C0885E4, 0f3D2AAABB, %p771;
fma.rn.f32 %f3621, %f5581, %f1027, %f3620;
selp.f32 %f3622, 0fBE2AAAA8, 0fBEFFFFFF, %p771;
fma.rn.f32 %f3623, %f3621, %f1027, %f3622;
mov.f32 %f3624, 0f00000000;
fma.rn.f32 %f3625, %f1027, %f1026, %f3624;
fma.rn.f32 %f5582, %f3623, %f3625, %f1026;
and.b32 %r5014, %r8210, 2;
setp.eq.s32 %p773, %r5014, 0;
@%p773 bra $L__BB0_896;
mov.f32 %f3627, 0fBF800000;
fma.rn.f32 %f5582, %f5582, %f3627, %f3624;
$L__BB0_896:
mul.f32 %f3628, %f861, 0f3F22F983;
cvt.rni.s32.f32 %r8214, %f3628;
cvt.rn.f32.s32 %f3629, %r8214;
mov.f32 %f3630, 0fBFC90FDA;
fma.rn.f32 %f3631, %f3629, %f3630, %f861;
mov.f32 %f3632, 0fB3A22168;
fma.rn.f32 %f3633, %f3629, %f3632, %f3631;
mov.f32 %f3634, 0fA7C234C5;
fma.rn.f32 %f5583, %f3629, %f3634, %f3633;
abs.f32 %f1034, %f861;
setp.ltu.f32 %p774, %f1034, 0f47CE4780;
@%p774 bra $L__BB0_904;
setp.eq.f32 %p775, %f1034, 0f7F800000;
@%p775 bra $L__BB0_903;
bra.uni $L__BB0_898;
$L__BB0_903:
mov.f32 %f3637, 0f00000000;
mul.rn.f32 %f5583, %f861, %f3637;
mov.u32 %r8214, 0;
bra.uni $L__BB0_904;
$L__BB0_898:
mov.b32 %r1196, %f861;
shr.u32 %r5016, %r1196, 23;
and.b32 %r5017, %r5016, 255;
add.s32 %r1197, %r5017, -128;
shl.b32 %r5018, %r1196, 8;
or.b32 %r1198, %r5018, -2147483648;
shr.u32 %r1199, %r1197, 5;
mov.u64 %rd2627, 0;
mov.u32 %r8211, 0;
mov.u64 %rd1563, __cudart_i2opi_f;
mov.u64 %rd2628, %rd2627;
$L__BB0_899:
.pragma "nounroll";
shl.b64 %rd1562, %rd2627, 2;
add.s64 %rd1564, %rd1563, %rd1562;
ld.global.nc.u32 %r5019, [%rd1564];
mad.wide.u32 %rd1565, %r5019, %r1198, %rd2628;
shr.u64 %rd2628, %rd1565, 32;
add.s64 %rd1566, %rd1, %rd1562;
st.local.u32 [%rd1566], %rd1565;
add.s32 %r8211, %r8211, 1;
cvt.s64.s32 %rd2627, %r8211;
setp.ne.s32 %p776, %r8211, 6;
@%p776 bra $L__BB0_899;
st.local.u32 [%rd4], %rd2628;
mov.u32 %r5020, 4;
sub.s32 %r1202, %r5020, %r1199;
mov.u32 %r5021, 6;
sub.s32 %r5022, %r5021, %r1199;
mul.wide.s32 %rd1567, %r5022, 4;
add.s64 %rd1568, %rd1, %rd1567;
ld.local.u32 %r8212, [%rd1568];
ld.local.u32 %r8213, [%rd1568+-4];
and.b32 %r1205, %r1197, 31;
setp.eq.s32 %p777, %r1205, 0;
@%p777 bra $L__BB0_902;
mov.u32 %r5023, 32;
sub.s32 %r5024, %r5023, %r1205;
shr.u32 %r5025, %r8213, %r5024;
shl.b32 %r5026, %r8212, %r1205;
add.s32 %r8212, %r5025, %r5026;
mul.wide.s32 %rd1569, %r1202, 4;
add.s64 %rd1570, %rd1, %rd1569;
ld.local.u32 %r5027, [%rd1570];
shr.u32 %r5028, %r5027, %r5024;
shl.b32 %r5029, %r8213, %r1205;
add.s32 %r8213, %r5028, %r5029;
$L__BB0_902:
and.b32 %r5030, %r1196, -2147483648;
shr.u32 %r5031, %r8213, 30;
shl.b32 %r5032, %r8212, 2;
or.b32 %r5033, %r5031, %r5032;
shr.u32 %r5034, %r5033, 31;
shr.u32 %r5035, %r8212, 30;
add.s32 %r5036, %r5034, %r5035;
neg.s32 %r5037, %r5036;
setp.eq.s32 %p778, %r5030, 0;
selp.b32 %r8214, %r5036, %r5037, %p778;
setp.ne.s32 %p779, %r5034, 0;
xor.b32 %r5038, %r5030, -2147483648;
selp.b32 %r5039, %r5038, %r5030, %p779;
selp.b32 %r5040, -1, 0, %p779;
xor.b32 %r5041, %r5033, %r5040;
shl.b32 %r5042, %r8213, 2;
xor.b32 %r5043, %r5042, %r5040;
cvt.u64.u32 %rd1571, %r5041;
cvt.u64.u32 %rd1572, %r5043;
bfi.b64 %rd1573, %rd1571, %rd1572, 32, 32;
cvt.rn.f64.s64 %fd123, %rd1573;
mul.f64 %fd124, %fd123, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3635, %fd124;
setp.eq.s32 %p780, %r5039, 0;
neg.f32 %f3636, %f3635;
selp.f32 %f5583, %f3635, %f3636, %p780;
$L__BB0_904:
add.s32 %r1212, %r8214, 1;
and.b32 %r1213, %r1212, 1;
setp.eq.s32 %p781, %r1213, 0;
selp.f32 %f1038, %f5583, 0f3F800000, %p781;
mul.rn.f32 %f1039, %f5583, %f5583;
mov.f32 %f5584, 0fB94D4153;
@%p781 bra $L__BB0_906;
mov.f32 %f3639, 0fBAB607ED;
mov.f32 %f3640, 0f37CBAC00;
fma.rn.f32 %f5584, %f3640, %f1039, %f3639;
$L__BB0_906:
selp.f32 %f3641, 0f3C0885E4, 0f3D2AAABB, %p781;
fma.rn.f32 %f3642, %f5584, %f1039, %f3641;
selp.f32 %f3643, 0fBE2AAAA8, 0fBEFFFFFF, %p781;
fma.rn.f32 %f3644, %f3642, %f1039, %f3643;
mov.f32 %f3645, 0f00000000;
fma.rn.f32 %f3646, %f1039, %f1038, %f3645;
fma.rn.f32 %f5585, %f3644, %f3646, %f1038;
and.b32 %r5045, %r1212, 2;
setp.eq.s32 %p783, %r5045, 0;
@%p783 bra $L__BB0_908;
mov.f32 %f3648, 0fBF800000;
fma.rn.f32 %f5585, %f5585, %f3648, %f3645;
$L__BB0_908:
add.f32 %f5593, %f5582, %f5585;
mul.f32 %f3649, %f870, 0f3F22F983;
cvt.rni.s32.f32 %r8218, %f3649;
cvt.rn.f32.s32 %f3650, %r8218;
mov.f32 %f3651, 0fBFC90FDA;
fma.rn.f32 %f3652, %f3650, %f3651, %f870;
mov.f32 %f3653, 0fB3A22168;
fma.rn.f32 %f3654, %f3650, %f3653, %f3652;
mov.f32 %f3655, 0fA7C234C5;
fma.rn.f32 %f5586, %f3650, %f3655, %f3654;
abs.f32 %f1047, %f870;
setp.ltu.f32 %p784, %f1047, 0f47CE4780;
@%p784 bra $L__BB0_916;
setp.eq.f32 %p785, %f1047, 0f7F800000;
@%p785 bra $L__BB0_915;
bra.uni $L__BB0_910;
$L__BB0_915:
mov.f32 %f3658, 0f00000000;
mul.rn.f32 %f5586, %f870, %f3658;
mov.u32 %r8218, 0;
bra.uni $L__BB0_916;
$L__BB0_910:
mov.b32 %r1215, %f870;
shr.u32 %r5047, %r1215, 23;
and.b32 %r5048, %r5047, 255;
add.s32 %r1216, %r5048, -128;
shl.b32 %r5049, %r1215, 8;
or.b32 %r1217, %r5049, -2147483648;
shr.u32 %r1218, %r1216, 5;
mov.u64 %rd2629, 0;
mov.u32 %r8215, 0;
mov.u64 %rd1577, __cudart_i2opi_f;
mov.u64 %rd2630, %rd2629;
$L__BB0_911:
.pragma "nounroll";
shl.b64 %rd1576, %rd2629, 2;
add.s64 %rd1578, %rd1577, %rd1576;
ld.global.nc.u32 %r5050, [%rd1578];
mad.wide.u32 %rd1579, %r5050, %r1217, %rd2630;
shr.u64 %rd2630, %rd1579, 32;
add.s64 %rd1580, %rd1, %rd1576;
st.local.u32 [%rd1580], %rd1579;
add.s32 %r8215, %r8215, 1;
cvt.s64.s32 %rd2629, %r8215;
setp.ne.s32 %p786, %r8215, 6;
@%p786 bra $L__BB0_911;
st.local.u32 [%rd4], %rd2630;
mov.u32 %r5051, 4;
sub.s32 %r1221, %r5051, %r1218;
mov.u32 %r5052, 6;
sub.s32 %r5053, %r5052, %r1218;
mul.wide.s32 %rd1581, %r5053, 4;
add.s64 %rd1582, %rd1, %rd1581;
ld.local.u32 %r8216, [%rd1582];
ld.local.u32 %r8217, [%rd1582+-4];
and.b32 %r1224, %r1216, 31;
setp.eq.s32 %p787, %r1224, 0;
@%p787 bra $L__BB0_914;
mov.u32 %r5054, 32;
sub.s32 %r5055, %r5054, %r1224;
shr.u32 %r5056, %r8217, %r5055;
shl.b32 %r5057, %r8216, %r1224;
add.s32 %r8216, %r5056, %r5057;
mul.wide.s32 %rd1583, %r1221, 4;
add.s64 %rd1584, %rd1, %rd1583;
ld.local.u32 %r5058, [%rd1584];
shr.u32 %r5059, %r5058, %r5055;
shl.b32 %r5060, %r8217, %r1224;
add.s32 %r8217, %r5059, %r5060;
$L__BB0_914:
and.b32 %r5061, %r1215, -2147483648;
shr.u32 %r5062, %r8217, 30;
shl.b32 %r5063, %r8216, 2;
or.b32 %r5064, %r5062, %r5063;
shr.u32 %r5065, %r5064, 31;
shr.u32 %r5066, %r8216, 30;
add.s32 %r5067, %r5065, %r5066;
neg.s32 %r5068, %r5067;
setp.eq.s32 %p788, %r5061, 0;
selp.b32 %r8218, %r5067, %r5068, %p788;
setp.ne.s32 %p789, %r5065, 0;
xor.b32 %r5069, %r5061, -2147483648;
selp.b32 %r5070, %r5069, %r5061, %p789;
selp.b32 %r5071, -1, 0, %p789;
xor.b32 %r5072, %r5064, %r5071;
shl.b32 %r5073, %r8217, 2;
xor.b32 %r5074, %r5073, %r5071;
cvt.u64.u32 %rd1585, %r5072;
cvt.u64.u32 %rd1586, %r5074;
bfi.b64 %rd1587, %rd1585, %rd1586, 32, 32;
cvt.rn.f64.s64 %fd125, %rd1587;
mul.f64 %fd126, %fd125, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3656, %fd126;
setp.eq.s32 %p790, %r5070, 0;
neg.f32 %f3657, %f3656;
selp.f32 %f5586, %f3656, %f3657, %p790;
$L__BB0_916:
and.b32 %r1231, %r8218, 1;
setp.eq.s32 %p791, %r1231, 0;
selp.f32 %f1051, %f5586, 0f3F800000, %p791;
mul.rn.f32 %f1052, %f5586, %f5586;
mov.f32 %f5587, 0fB94D4153;
@%p791 bra $L__BB0_918;
mov.f32 %f3660, 0fBAB607ED;
mov.f32 %f3661, 0f37CBAC00;
fma.rn.f32 %f5587, %f3661, %f1052, %f3660;
$L__BB0_918:
selp.f32 %f3662, 0f3C0885E4, 0f3D2AAABB, %p791;
fma.rn.f32 %f3663, %f5587, %f1052, %f3662;
selp.f32 %f3664, 0fBE2AAAA8, 0fBEFFFFFF, %p791;
fma.rn.f32 %f3665, %f3663, %f1052, %f3664;
mov.f32 %f3666, 0f00000000;
fma.rn.f32 %f3667, %f1052, %f1051, %f3666;
fma.rn.f32 %f5588, %f3665, %f3667, %f1051;
and.b32 %r5076, %r8218, 2;
setp.eq.s32 %p793, %r5076, 0;
@%p793 bra $L__BB0_920;
mov.f32 %f3669, 0fBF800000;
fma.rn.f32 %f5588, %f5588, %f3669, %f3666;
$L__BB0_920:
mul.f32 %f3670, %f862, 0f3F22F983;
cvt.rni.s32.f32 %r8222, %f3670;
cvt.rn.f32.s32 %f3671, %r8222;
mov.f32 %f3672, 0fBFC90FDA;
fma.rn.f32 %f3673, %f3671, %f3672, %f862;
mov.f32 %f3674, 0fB3A22168;
fma.rn.f32 %f3675, %f3671, %f3674, %f3673;
mov.f32 %f3676, 0fA7C234C5;
fma.rn.f32 %f5589, %f3671, %f3676, %f3675;
abs.f32 %f1059, %f862;
setp.ltu.f32 %p794, %f1059, 0f47CE4780;
@%p794 bra $L__BB0_928;
setp.eq.f32 %p795, %f1059, 0f7F800000;
@%p795 bra $L__BB0_927;
bra.uni $L__BB0_922;
$L__BB0_927:
mov.f32 %f3679, 0f00000000;
mul.rn.f32 %f5589, %f862, %f3679;
mov.u32 %r8222, 0;
bra.uni $L__BB0_928;
$L__BB0_922:
mov.b32 %r1233, %f862;
shr.u32 %r5078, %r1233, 23;
and.b32 %r5079, %r5078, 255;
add.s32 %r1234, %r5079, -128;
shl.b32 %r5080, %r1233, 8;
or.b32 %r1235, %r5080, -2147483648;
shr.u32 %r1236, %r1234, 5;
mov.u64 %rd2631, 0;
mov.u32 %r8219, 0;
mov.u64 %rd1591, __cudart_i2opi_f;
mov.u64 %rd2632, %rd2631;
$L__BB0_923:
.pragma "nounroll";
shl.b64 %rd1590, %rd2631, 2;
add.s64 %rd1592, %rd1591, %rd1590;
ld.global.nc.u32 %r5081, [%rd1592];
mad.wide.u32 %rd1593, %r5081, %r1235, %rd2632;
shr.u64 %rd2632, %rd1593, 32;
add.s64 %rd1594, %rd1, %rd1590;
st.local.u32 [%rd1594], %rd1593;
add.s32 %r8219, %r8219, 1;
cvt.s64.s32 %rd2631, %r8219;
setp.ne.s32 %p796, %r8219, 6;
@%p796 bra $L__BB0_923;
st.local.u32 [%rd4], %rd2632;
mov.u32 %r5082, 4;
sub.s32 %r1239, %r5082, %r1236;
mov.u32 %r5083, 6;
sub.s32 %r5084, %r5083, %r1236;
mul.wide.s32 %rd1595, %r5084, 4;
add.s64 %rd1596, %rd1, %rd1595;
ld.local.u32 %r8220, [%rd1596];
ld.local.u32 %r8221, [%rd1596+-4];
and.b32 %r1242, %r1234, 31;
setp.eq.s32 %p797, %r1242, 0;
@%p797 bra $L__BB0_926;
mov.u32 %r5085, 32;
sub.s32 %r5086, %r5085, %r1242;
shr.u32 %r5087, %r8221, %r5086;
shl.b32 %r5088, %r8220, %r1242;
add.s32 %r8220, %r5087, %r5088;
mul.wide.s32 %rd1597, %r1239, 4;
add.s64 %rd1598, %rd1, %rd1597;
ld.local.u32 %r5089, [%rd1598];
shr.u32 %r5090, %r5089, %r5086;
shl.b32 %r5091, %r8221, %r1242;
add.s32 %r8221, %r5090, %r5091;
$L__BB0_926:
and.b32 %r5092, %r1233, -2147483648;
shr.u32 %r5093, %r8221, 30;
shl.b32 %r5094, %r8220, 2;
or.b32 %r5095, %r5093, %r5094;
shr.u32 %r5096, %r5095, 31;
shr.u32 %r5097, %r8220, 30;
add.s32 %r5098, %r5096, %r5097;
neg.s32 %r5099, %r5098;
setp.eq.s32 %p798, %r5092, 0;
selp.b32 %r8222, %r5098, %r5099, %p798;
setp.ne.s32 %p799, %r5096, 0;
xor.b32 %r5100, %r5092, -2147483648;
selp.b32 %r5101, %r5100, %r5092, %p799;
selp.b32 %r5102, -1, 0, %p799;
xor.b32 %r5103, %r5095, %r5102;
shl.b32 %r5104, %r8221, 2;
xor.b32 %r5105, %r5104, %r5102;
cvt.u64.u32 %rd1599, %r5103;
cvt.u64.u32 %rd1600, %r5105;
bfi.b64 %rd1601, %rd1599, %rd1600, 32, 32;
cvt.rn.f64.s64 %fd127, %rd1601;
mul.f64 %fd128, %fd127, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3677, %fd128;
setp.eq.s32 %p800, %r5101, 0;
neg.f32 %f3678, %f3677;
selp.f32 %f5589, %f3677, %f3678, %p800;
$L__BB0_928:
add.s32 %r1249, %r8222, 1;
and.b32 %r1250, %r1249, 1;
setp.eq.s32 %p801, %r1250, 0;
selp.f32 %f1063, %f5589, 0f3F800000, %p801;
mul.rn.f32 %f1064, %f5589, %f5589;
mov.f32 %f5590, 0fB94D4153;
@%p801 bra $L__BB0_930;
mov.f32 %f3681, 0fBAB607ED;
mov.f32 %f3682, 0f37CBAC00;
fma.rn.f32 %f5590, %f3682, %f1064, %f3681;
$L__BB0_930:
selp.f32 %f3683, 0f3C0885E4, 0f3D2AAABB, %p801;
fma.rn.f32 %f3684, %f5590, %f1064, %f3683;
selp.f32 %f3685, 0fBE2AAAA8, 0fBEFFFFFF, %p801;
fma.rn.f32 %f3686, %f3684, %f1064, %f3685;
mov.f32 %f3687, 0f00000000;
fma.rn.f32 %f3688, %f1064, %f1063, %f3687;
fma.rn.f32 %f5591, %f3686, %f3688, %f1063;
and.b32 %r5107, %r1249, 2;
setp.eq.s32 %p803, %r5107, 0;
@%p803 bra $L__BB0_932;
mov.f32 %f3690, 0fBF800000;
fma.rn.f32 %f5591, %f5591, %f3690, %f3687;
$L__BB0_932:
add.f32 %f5592, %f5588, %f5591;
bra.uni $L__BB0_933;
$L__BB0_512:
mov.b32 %r653, %f5416;
shr.u32 %r4036, %r653, 23;
and.b32 %r4037, %r4036, 255;
add.s32 %r654, %r4037, -128;
shl.b32 %r4038, %r653, 8;
or.b32 %r655, %r4038, -2147483648;
shr.u32 %r656, %r654, 5;
mov.u64 %rd2569, 0;
mov.u32 %r8095, 0;
mov.u64 %rd1128, __cudart_i2opi_f;
mov.u64 %rd2570, %rd2569;
$L__BB0_513:
.pragma "nounroll";
shl.b64 %rd1127, %rd2569, 2;
add.s64 %rd1129, %rd1128, %rd1127;
ld.global.nc.u32 %r4039, [%rd1129];
mad.wide.u32 %rd1130, %r4039, %r655, %rd2570;
shr.u64 %rd2570, %rd1130, 32;
add.s64 %rd1131, %rd1, %rd1127;
st.local.u32 [%rd1131], %rd1130;
add.s32 %r8095, %r8095, 1;
cvt.s64.s32 %rd2569, %r8095;
setp.ne.s32 %p448, %r8095, 6;
@%p448 bra $L__BB0_513;
st.local.u32 [%rd4], %rd2570;
mov.u32 %r4040, 4;
sub.s32 %r659, %r4040, %r656;
mov.u32 %r4041, 6;
sub.s32 %r4042, %r4041, %r656;
mul.wide.s32 %rd1132, %r4042, 4;
add.s64 %rd1133, %rd1, %rd1132;
ld.local.u32 %r8096, [%rd1133];
ld.local.u32 %r8097, [%rd1133+-4];
and.b32 %r662, %r654, 31;
setp.eq.s32 %p449, %r662, 0;
@%p449 bra $L__BB0_516;
mov.u32 %r4043, 32;
sub.s32 %r4044, %r4043, %r662;
shr.u32 %r4045, %r8097, %r4044;
shl.b32 %r4046, %r8096, %r662;
add.s32 %r8096, %r4045, %r4046;
mul.wide.s32 %rd1134, %r659, 4;
add.s64 %rd1135, %rd1, %rd1134;
ld.local.u32 %r4047, [%rd1135];
shr.u32 %r4048, %r4047, %r4044;
shl.b32 %r4049, %r8097, %r662;
add.s32 %r8097, %r4048, %r4049;
$L__BB0_516:
and.b32 %r4050, %r653, -2147483648;
shr.u32 %r4051, %r8097, 30;
shl.b32 %r4052, %r8096, 2;
or.b32 %r4053, %r4051, %r4052;
shr.u32 %r4054, %r4053, 31;
shr.u32 %r4055, %r8096, 30;
add.s32 %r4056, %r4054, %r4055;
neg.s32 %r4057, %r4056;
setp.eq.s32 %p450, %r4050, 0;
selp.b32 %r8098, %r4056, %r4057, %p450;
setp.ne.s32 %p451, %r4054, 0;
xor.b32 %r4058, %r4050, -2147483648;
selp.b32 %r4059, %r4058, %r4050, %p451;
selp.b32 %r4060, -1, 0, %p451;
xor.b32 %r4061, %r4053, %r4060;
shl.b32 %r4062, %r8097, 2;
xor.b32 %r4063, %r4062, %r4060;
cvt.u64.u32 %rd1136, %r4061;
cvt.u64.u32 %rd1137, %r4063;
bfi.b64 %rd1138, %rd1136, %rd1137, 32, 32;
cvt.rn.f64.s64 %fd65, %rd1138;
mul.f64 %fd66, %fd65, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3011, %fd66;
setp.eq.s32 %p452, %r4059, 0;
neg.f32 %f3012, %f3011;
selp.f32 %f5445, %f3011, %f3012, %p452;
$L__BB0_518:
and.b32 %r669, %r8098, 1;
setp.eq.s32 %p453, %r669, 0;
selp.f32 %f593, %f5445, 0f3F800000, %p453;
mul.rn.f32 %f594, %f5445, %f5445;
mov.f32 %f5446, 0fB94D4153;
@%p453 bra $L__BB0_520;
mov.f32 %f3015, 0fBAB607ED;
mov.f32 %f3016, 0f37CBAC00;
fma.rn.f32 %f5446, %f3016, %f594, %f3015;
$L__BB0_520:
selp.f32 %f3017, 0f3C0885E4, 0f3D2AAABB, %p453;
fma.rn.f32 %f3018, %f5446, %f594, %f3017;
selp.f32 %f3019, 0fBE2AAAA8, 0fBEFFFFFF, %p453;
fma.rn.f32 %f3020, %f3018, %f594, %f3019;
mov.f32 %f3021, 0f00000000;
fma.rn.f32 %f3022, %f594, %f593, %f3021;
fma.rn.f32 %f5281, %f3020, %f3022, %f593;
and.b32 %r4065, %r8098, 2;
setp.eq.s32 %p455, %r4065, 0;
@%p455 bra $L__BB0_522;
mov.f32 %f3024, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3024, %f3021;
$L__BB0_522:
setp.lt.s32 %p8, %r11, %r651;
@%p445 bra $L__BB0_535;
mul.f32 %f3025, %f5607, 0f3F22F983;
cvt.rni.s32.f32 %r8102, %f3025;
cvt.rn.f32.s32 %f3026, %r8102;
mov.f32 %f3027, 0fBFC90FDA;
fma.rn.f32 %f3028, %f3026, %f3027, %f5607;
mov.f32 %f3029, 0fB3A22168;
fma.rn.f32 %f3030, %f3026, %f3029, %f3028;
mov.f32 %f3031, 0fA7C234C5;
fma.rn.f32 %f5449, %f3026, %f3031, %f3030;
abs.f32 %f602, %f5607;
setp.ltu.f32 %p457, %f602, 0f47CE4780;
@%p457 bra $L__BB0_531;
setp.eq.f32 %p458, %f602, 0f7F800000;
@%p458 bra $L__BB0_530;
bra.uni $L__BB0_525;
$L__BB0_530:
mov.f32 %f3034, 0f00000000;
mul.rn.f32 %f5449, %f5607, %f3034;
mov.u32 %r8102, 0;
bra.uni $L__BB0_531;
$L__BB0_525:
mov.b32 %r671, %f5607;
shr.u32 %r4067, %r671, 23;
and.b32 %r4068, %r4067, 255;
add.s32 %r672, %r4068, -128;
shl.b32 %r4069, %r671, 8;
or.b32 %r673, %r4069, -2147483648;
shr.u32 %r674, %r672, 5;
mov.u64 %rd2571, 0;
mov.u32 %r8099, 0;
mov.u64 %rd1142, __cudart_i2opi_f;
mov.u64 %rd2572, %rd2571;
$L__BB0_526:
.pragma "nounroll";
shl.b64 %rd1141, %rd2571, 2;
add.s64 %rd1143, %rd1142, %rd1141;
ld.global.nc.u32 %r4070, [%rd1143];
mad.wide.u32 %rd1144, %r4070, %r673, %rd2572;
shr.u64 %rd2572, %rd1144, 32;
add.s64 %rd1145, %rd1, %rd1141;
st.local.u32 [%rd1145], %rd1144;
add.s32 %r8099, %r8099, 1;
cvt.s64.s32 %rd2571, %r8099;
setp.ne.s32 %p459, %r8099, 6;
@%p459 bra $L__BB0_526;
st.local.u32 [%rd4], %rd2572;
mov.u32 %r4071, 4;
sub.s32 %r677, %r4071, %r674;
mov.u32 %r4072, 6;
sub.s32 %r4073, %r4072, %r674;
mul.wide.s32 %rd1146, %r4073, 4;
add.s64 %rd1147, %rd1, %rd1146;
ld.local.u32 %r8100, [%rd1147];
ld.local.u32 %r8101, [%rd1147+-4];
and.b32 %r680, %r672, 31;
setp.eq.s32 %p460, %r680, 0;
@%p460 bra $L__BB0_529;
mov.u32 %r4074, 32;
sub.s32 %r4075, %r4074, %r680;
shr.u32 %r4076, %r8101, %r4075;
shl.b32 %r4077, %r8100, %r680;
add.s32 %r8100, %r4076, %r4077;
mul.wide.s32 %rd1148, %r677, 4;
add.s64 %rd1149, %rd1, %rd1148;
ld.local.u32 %r4078, [%rd1149];
shr.u32 %r4079, %r4078, %r4075;
shl.b32 %r4080, %r8101, %r680;
add.s32 %r8101, %r4079, %r4080;
$L__BB0_529:
and.b32 %r4081, %r671, -2147483648;
shr.u32 %r4082, %r8101, 30;
shl.b32 %r4083, %r8100, 2;
or.b32 %r4084, %r4082, %r4083;
shr.u32 %r4085, %r4084, 31;
shr.u32 %r4086, %r8100, 30;
add.s32 %r4087, %r4085, %r4086;
neg.s32 %r4088, %r4087;
setp.eq.s32 %p461, %r4081, 0;
selp.b32 %r8102, %r4087, %r4088, %p461;
setp.ne.s32 %p462, %r4085, 0;
xor.b32 %r4089, %r4081, -2147483648;
selp.b32 %r4090, %r4089, %r4081, %p462;
selp.b32 %r4091, -1, 0, %p462;
xor.b32 %r4092, %r4084, %r4091;
shl.b32 %r4093, %r8101, 2;
xor.b32 %r4094, %r4093, %r4091;
cvt.u64.u32 %rd1150, %r4092;
cvt.u64.u32 %rd1151, %r4094;
bfi.b64 %rd1152, %rd1150, %rd1151, 32, 32;
cvt.rn.f64.s64 %fd67, %rd1152;
mul.f64 %fd68, %fd67, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3032, %fd68;
setp.eq.s32 %p463, %r4090, 0;
neg.f32 %f3033, %f3032;
selp.f32 %f5449, %f3032, %f3033, %p463;
$L__BB0_531:
add.s32 %r687, %r8102, 1;
and.b32 %r688, %r687, 1;
setp.eq.s32 %p464, %r688, 0;
selp.f32 %f606, %f5449, 0f3F800000, %p464;
mul.rn.f32 %f607, %f5449, %f5449;
mov.f32 %f5450, 0fB94D4153;
@%p464 bra $L__BB0_533;
mov.f32 %f3036, 0fBAB607ED;
mov.f32 %f3037, 0f37CBAC00;
fma.rn.f32 %f5450, %f3037, %f607, %f3036;
$L__BB0_533:
selp.f32 %f3038, 0f3C0885E4, 0f3D2AAABB, %p464;
fma.rn.f32 %f3039, %f5450, %f607, %f3038;
selp.f32 %f3040, 0fBE2AAAA8, 0fBEFFFFFF, %p464;
fma.rn.f32 %f3041, %f3039, %f607, %f3040;
mov.f32 %f3042, 0f00000000;
fma.rn.f32 %f3043, %f607, %f606, %f3042;
fma.rn.f32 %f5283, %f3041, %f3043, %f606;
and.b32 %r4096, %r687, 2;
setp.eq.s32 %p466, %r4096, 0;
@%p466 bra $L__BB0_535;
mov.f32 %f3045, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3045, %f3042;
$L__BB0_535:
selp.f32 %f614, %f5283, %f5284, %p8;
selp.f32 %f615, %f5281, %f5282, %p8;
@%p445 bra $L__BB0_537;
add.f32 %f5599, %f615, %f614;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_537:
@%p420 bra $L__BB0_566;
shl.b32 %r4097, %r12, 5;
mov.u32 %r4098, -32;
sub.s32 %r689, %r4098, %r4097;
setp.ge.s32 %p470, %r11, %r689;
@%p470 bra $L__BB0_551;
mul.f32 %f3048, %f5415, 0f3F22F983;
cvt.rni.s32.f32 %r8106, %f3048;
cvt.rn.f32.s32 %f3049, %r8106;
mov.f32 %f3050, 0fBFC90FDA;
fma.rn.f32 %f3051, %f3049, %f3050, %f5415;
mov.f32 %f3052, 0fB3A22168;
fma.rn.f32 %f3053, %f3049, %f3052, %f3051;
mov.f32 %f3054, 0fA7C234C5;
fma.rn.f32 %f5458, %f3049, %f3054, %f3053;
abs.f32 %f623, %f5415;
setp.ltu.f32 %p471, %f623, 0f47CE4780;
@%p471 bra $L__BB0_547;
setp.eq.f32 %p472, %f623, 0f7F800000;
@%p472 bra $L__BB0_546;
bra.uni $L__BB0_541;
$L__BB0_546:
mov.f32 %f3057, 0f00000000;
mul.rn.f32 %f5458, %f5415, %f3057;
mov.u32 %r8106, 0;
bra.uni $L__BB0_547;
$L__BB0_541:
mov.b32 %r691, %f5415;
shr.u32 %r4100, %r691, 23;
and.b32 %r4101, %r4100, 255;
add.s32 %r692, %r4101, -128;
shl.b32 %r4102, %r691, 8;
or.b32 %r693, %r4102, -2147483648;
shr.u32 %r694, %r692, 5;
mov.u64 %rd2573, 0;
mov.u32 %r8103, 0;
mov.u64 %rd1156, __cudart_i2opi_f;
mov.u64 %rd2574, %rd2573;
$L__BB0_542:
.pragma "nounroll";
shl.b64 %rd1155, %rd2573, 2;
add.s64 %rd1157, %rd1156, %rd1155;
ld.global.nc.u32 %r4103, [%rd1157];
mad.wide.u32 %rd1158, %r4103, %r693, %rd2574;
shr.u64 %rd2574, %rd1158, 32;
add.s64 %rd1159, %rd1, %rd1155;
st.local.u32 [%rd1159], %rd1158;
add.s32 %r8103, %r8103, 1;
cvt.s64.s32 %rd2573, %r8103;
setp.ne.s32 %p473, %r8103, 6;
@%p473 bra $L__BB0_542;
st.local.u32 [%rd4], %rd2574;
mov.u32 %r4104, 4;
sub.s32 %r697, %r4104, %r694;
mov.u32 %r4105, 6;
sub.s32 %r4106, %r4105, %r694;
mul.wide.s32 %rd1160, %r4106, 4;
add.s64 %rd1161, %rd1, %rd1160;
ld.local.u32 %r8104, [%rd1161];
ld.local.u32 %r8105, [%rd1161+-4];
and.b32 %r700, %r692, 31;
setp.eq.s32 %p474, %r700, 0;
@%p474 bra $L__BB0_545;
mov.u32 %r4107, 32;
sub.s32 %r4108, %r4107, %r700;
shr.u32 %r4109, %r8105, %r4108;
shl.b32 %r4110, %r8104, %r700;
add.s32 %r8104, %r4109, %r4110;
mul.wide.s32 %rd1162, %r697, 4;
add.s64 %rd1163, %rd1, %rd1162;
ld.local.u32 %r4111, [%rd1163];
shr.u32 %r4112, %r4111, %r4108;
shl.b32 %r4113, %r8105, %r700;
add.s32 %r8105, %r4112, %r4113;
$L__BB0_545:
and.b32 %r4114, %r691, -2147483648;
shr.u32 %r4115, %r8105, 30;
shl.b32 %r4116, %r8104, 2;
or.b32 %r4117, %r4115, %r4116;
shr.u32 %r4118, %r4117, 31;
shr.u32 %r4119, %r8104, 30;
add.s32 %r4120, %r4118, %r4119;
neg.s32 %r4121, %r4120;
setp.eq.s32 %p475, %r4114, 0;
selp.b32 %r8106, %r4120, %r4121, %p475;
setp.ne.s32 %p476, %r4118, 0;
xor.b32 %r4122, %r4114, -2147483648;
selp.b32 %r4123, %r4122, %r4114, %p476;
selp.b32 %r4124, -1, 0, %p476;
xor.b32 %r4125, %r4117, %r4124;
shl.b32 %r4126, %r8105, 2;
xor.b32 %r4127, %r4126, %r4124;
cvt.u64.u32 %rd1164, %r4125;
cvt.u64.u32 %rd1165, %r4127;
bfi.b64 %rd1166, %rd1164, %rd1165, 32, 32;
cvt.rn.f64.s64 %fd69, %rd1166;
mul.f64 %fd70, %fd69, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3055, %fd70;
setp.eq.s32 %p477, %r4123, 0;
neg.f32 %f3056, %f3055;
selp.f32 %f5458, %f3055, %f3056, %p477;
$L__BB0_547:
and.b32 %r707, %r8106, 1;
setp.eq.s32 %p478, %r707, 0;
selp.f32 %f627, %f5458, 0f3F800000, %p478;
mul.rn.f32 %f628, %f5458, %f5458;
mov.f32 %f5459, 0fB94D4153;
@%p478 bra $L__BB0_549;
mov.f32 %f3059, 0fBAB607ED;
mov.f32 %f3060, 0f37CBAC00;
fma.rn.f32 %f5459, %f3060, %f628, %f3059;
$L__BB0_549:
selp.f32 %f3061, 0f3C0885E4, 0f3D2AAABB, %p478;
fma.rn.f32 %f3062, %f5459, %f628, %f3061;
selp.f32 %f3063, 0fBE2AAAA8, 0fBEFFFFFF, %p478;
fma.rn.f32 %f3064, %f3062, %f628, %f3063;
mov.f32 %f3065, 0f00000000;
fma.rn.f32 %f3066, %f628, %f627, %f3065;
fma.rn.f32 %f5281, %f3064, %f3066, %f627;
and.b32 %r4129, %r8106, 2;
setp.eq.s32 %p480, %r4129, 0;
@%p480 bra $L__BB0_551;
mov.f32 %f3068, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3068, %f3065;
$L__BB0_551:
setp.lt.s32 %p9, %r11, %r689;
@%p470 bra $L__BB0_564;
mul.f32 %f3069, %f5606, 0f3F22F983;
cvt.rni.s32.f32 %r8110, %f3069;
cvt.rn.f32.s32 %f3070, %r8110;
mov.f32 %f3071, 0fBFC90FDA;
fma.rn.f32 %f3072, %f3070, %f3071, %f5606;
mov.f32 %f3073, 0fB3A22168;
fma.rn.f32 %f3074, %f3070, %f3073, %f3072;
mov.f32 %f3075, 0fA7C234C5;
fma.rn.f32 %f5462, %f3070, %f3075, %f3074;
abs.f32 %f636, %f5606;
setp.ltu.f32 %p482, %f636, 0f47CE4780;
@%p482 bra $L__BB0_560;
setp.eq.f32 %p483, %f636, 0f7F800000;
@%p483 bra $L__BB0_559;
bra.uni $L__BB0_554;
$L__BB0_559:
mov.f32 %f3078, 0f00000000;
mul.rn.f32 %f5462, %f5606, %f3078;
mov.u32 %r8110, 0;
bra.uni $L__BB0_560;
$L__BB0_554:
mov.b32 %r709, %f5606;
shr.u32 %r4131, %r709, 23;
and.b32 %r4132, %r4131, 255;
add.s32 %r710, %r4132, -128;
shl.b32 %r4133, %r709, 8;
or.b32 %r711, %r4133, -2147483648;
shr.u32 %r712, %r710, 5;
mov.u64 %rd2575, 0;
mov.u32 %r8107, 0;
mov.u64 %rd1170, __cudart_i2opi_f;
mov.u64 %rd2576, %rd2575;
$L__BB0_555:
.pragma "nounroll";
shl.b64 %rd1169, %rd2575, 2;
add.s64 %rd1171, %rd1170, %rd1169;
ld.global.nc.u32 %r4134, [%rd1171];
mad.wide.u32 %rd1172, %r4134, %r711, %rd2576;
shr.u64 %rd2576, %rd1172, 32;
add.s64 %rd1173, %rd1, %rd1169;
st.local.u32 [%rd1173], %rd1172;
add.s32 %r8107, %r8107, 1;
cvt.s64.s32 %rd2575, %r8107;
setp.ne.s32 %p484, %r8107, 6;
@%p484 bra $L__BB0_555;
st.local.u32 [%rd4], %rd2576;
mov.u32 %r4135, 4;
sub.s32 %r715, %r4135, %r712;
mov.u32 %r4136, 6;
sub.s32 %r4137, %r4136, %r712;
mul.wide.s32 %rd1174, %r4137, 4;
add.s64 %rd1175, %rd1, %rd1174;
ld.local.u32 %r8108, [%rd1175];
ld.local.u32 %r8109, [%rd1175+-4];
and.b32 %r718, %r710, 31;
setp.eq.s32 %p485, %r718, 0;
@%p485 bra $L__BB0_558;
mov.u32 %r4138, 32;
sub.s32 %r4139, %r4138, %r718;
shr.u32 %r4140, %r8109, %r4139;
shl.b32 %r4141, %r8108, %r718;
add.s32 %r8108, %r4140, %r4141;
mul.wide.s32 %rd1176, %r715, 4;
add.s64 %rd1177, %rd1, %rd1176;
ld.local.u32 %r4142, [%rd1177];
shr.u32 %r4143, %r4142, %r4139;
shl.b32 %r4144, %r8109, %r718;
add.s32 %r8109, %r4143, %r4144;
$L__BB0_558:
and.b32 %r4145, %r709, -2147483648;
shr.u32 %r4146, %r8109, 30;
shl.b32 %r4147, %r8108, 2;
or.b32 %r4148, %r4146, %r4147;
shr.u32 %r4149, %r4148, 31;
shr.u32 %r4150, %r8108, 30;
add.s32 %r4151, %r4149, %r4150;
neg.s32 %r4152, %r4151;
setp.eq.s32 %p486, %r4145, 0;
selp.b32 %r8110, %r4151, %r4152, %p486;
setp.ne.s32 %p487, %r4149, 0;
xor.b32 %r4153, %r4145, -2147483648;
selp.b32 %r4154, %r4153, %r4145, %p487;
selp.b32 %r4155, -1, 0, %p487;
xor.b32 %r4156, %r4148, %r4155;
shl.b32 %r4157, %r8109, 2;
xor.b32 %r4158, %r4157, %r4155;
cvt.u64.u32 %rd1178, %r4156;
cvt.u64.u32 %rd1179, %r4158;
bfi.b64 %rd1180, %rd1178, %rd1179, 32, 32;
cvt.rn.f64.s64 %fd71, %rd1180;
mul.f64 %fd72, %fd71, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3076, %fd72;
setp.eq.s32 %p488, %r4154, 0;
neg.f32 %f3077, %f3076;
selp.f32 %f5462, %f3076, %f3077, %p488;
$L__BB0_560:
add.s32 %r725, %r8110, 1;
and.b32 %r726, %r725, 1;
setp.eq.s32 %p489, %r726, 0;
selp.f32 %f640, %f5462, 0f3F800000, %p489;
mul.rn.f32 %f641, %f5462, %f5462;
mov.f32 %f5463, 0fB94D4153;
@%p489 bra $L__BB0_562;
mov.f32 %f3080, 0fBAB607ED;
mov.f32 %f3081, 0f37CBAC00;
fma.rn.f32 %f5463, %f3081, %f641, %f3080;
$L__BB0_562:
selp.f32 %f3082, 0f3C0885E4, 0f3D2AAABB, %p489;
fma.rn.f32 %f3083, %f5463, %f641, %f3082;
selp.f32 %f3084, 0fBE2AAAA8, 0fBEFFFFFF, %p489;
fma.rn.f32 %f3085, %f3083, %f641, %f3084;
mov.f32 %f3086, 0f00000000;
fma.rn.f32 %f3087, %f641, %f640, %f3086;
fma.rn.f32 %f5283, %f3085, %f3087, %f640;
and.b32 %r4160, %r725, 2;
setp.eq.s32 %p491, %r4160, 0;
@%p491 bra $L__BB0_564;
mov.f32 %f3089, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3089, %f3086;
$L__BB0_564:
selp.f32 %f648, %f5283, %f5284, %p9;
selp.f32 %f649, %f5281, %f5282, %p9;
@%p470 bra $L__BB0_566;
add.f32 %f5598, %f649, %f648;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_566:
@%p423 bra $L__BB0_595;
shl.b32 %r4161, %r12, 5;
neg.s32 %r727, %r4161;
setp.ge.s32 %p495, %r11, %r727;
@%p495 bra $L__BB0_580;
mul.f32 %f3092, %f5414, 0f3F22F983;
cvt.rni.s32.f32 %r8114, %f3092;
cvt.rn.f32.s32 %f3093, %r8114;
mov.f32 %f3094, 0fBFC90FDA;
fma.rn.f32 %f3095, %f3093, %f3094, %f5414;
mov.f32 %f3096, 0fB3A22168;
fma.rn.f32 %f3097, %f3093, %f3096, %f3095;
mov.f32 %f3098, 0fA7C234C5;
fma.rn.f32 %f5471, %f3093, %f3098, %f3097;
abs.f32 %f657, %f5414;
setp.ltu.f32 %p496, %f657, 0f47CE4780;
@%p496 bra $L__BB0_576;
setp.eq.f32 %p497, %f657, 0f7F800000;
@%p497 bra $L__BB0_575;
bra.uni $L__BB0_570;
$L__BB0_575:
mov.f32 %f3101, 0f00000000;
mul.rn.f32 %f5471, %f5414, %f3101;
mov.u32 %r8114, 0;
bra.uni $L__BB0_576;
$L__BB0_570:
mov.b32 %r729, %f5414;
shr.u32 %r4163, %r729, 23;
and.b32 %r4164, %r4163, 255;
add.s32 %r730, %r4164, -128;
shl.b32 %r4165, %r729, 8;
or.b32 %r731, %r4165, -2147483648;
shr.u32 %r732, %r730, 5;
mov.u64 %rd2577, 0;
mov.u32 %r8111, 0;
mov.u64 %rd1184, __cudart_i2opi_f;
mov.u64 %rd2578, %rd2577;
$L__BB0_571:
.pragma "nounroll";
shl.b64 %rd1183, %rd2577, 2;
add.s64 %rd1185, %rd1184, %rd1183;
ld.global.nc.u32 %r4166, [%rd1185];
mad.wide.u32 %rd1186, %r4166, %r731, %rd2578;
shr.u64 %rd2578, %rd1186, 32;
add.s64 %rd1187, %rd1, %rd1183;
st.local.u32 [%rd1187], %rd1186;
add.s32 %r8111, %r8111, 1;
cvt.s64.s32 %rd2577, %r8111;
setp.ne.s32 %p498, %r8111, 6;
@%p498 bra $L__BB0_571;
st.local.u32 [%rd4], %rd2578;
mov.u32 %r4167, 4;
sub.s32 %r735, %r4167, %r732;
mov.u32 %r4168, 6;
sub.s32 %r4169, %r4168, %r732;
mul.wide.s32 %rd1188, %r4169, 4;
add.s64 %rd1189, %rd1, %rd1188;
ld.local.u32 %r8112, [%rd1189];
ld.local.u32 %r8113, [%rd1189+-4];
and.b32 %r738, %r730, 31;
setp.eq.s32 %p499, %r738, 0;
@%p499 bra $L__BB0_574;
mov.u32 %r4170, 32;
sub.s32 %r4171, %r4170, %r738;
shr.u32 %r4172, %r8113, %r4171;
shl.b32 %r4173, %r8112, %r738;
add.s32 %r8112, %r4172, %r4173;
mul.wide.s32 %rd1190, %r735, 4;
add.s64 %rd1191, %rd1, %rd1190;
ld.local.u32 %r4174, [%rd1191];
shr.u32 %r4175, %r4174, %r4171;
shl.b32 %r4176, %r8113, %r738;
add.s32 %r8113, %r4175, %r4176;
$L__BB0_574:
and.b32 %r4177, %r729, -2147483648;
shr.u32 %r4178, %r8113, 30;
shl.b32 %r4179, %r8112, 2;
or.b32 %r4180, %r4178, %r4179;
shr.u32 %r4181, %r4180, 31;
shr.u32 %r4182, %r8112, 30;
add.s32 %r4183, %r4181, %r4182;
neg.s32 %r4184, %r4183;
setp.eq.s32 %p500, %r4177, 0;
selp.b32 %r8114, %r4183, %r4184, %p500;
setp.ne.s32 %p501, %r4181, 0;
xor.b32 %r4185, %r4177, -2147483648;
selp.b32 %r4186, %r4185, %r4177, %p501;
selp.b32 %r4187, -1, 0, %p501;
xor.b32 %r4188, %r4180, %r4187;
shl.b32 %r4189, %r8113, 2;
xor.b32 %r4190, %r4189, %r4187;
cvt.u64.u32 %rd1192, %r4188;
cvt.u64.u32 %rd1193, %r4190;
bfi.b64 %rd1194, %rd1192, %rd1193, 32, 32;
cvt.rn.f64.s64 %fd73, %rd1194;
mul.f64 %fd74, %fd73, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3099, %fd74;
setp.eq.s32 %p502, %r4186, 0;
neg.f32 %f3100, %f3099;
selp.f32 %f5471, %f3099, %f3100, %p502;
$L__BB0_576:
and.b32 %r745, %r8114, 1;
setp.eq.s32 %p503, %r745, 0;
selp.f32 %f661, %f5471, 0f3F800000, %p503;
mul.rn.f32 %f662, %f5471, %f5471;
mov.f32 %f5472, 0fB94D4153;
@%p503 bra $L__BB0_578;
mov.f32 %f3103, 0fBAB607ED;
mov.f32 %f3104, 0f37CBAC00;
fma.rn.f32 %f5472, %f3104, %f662, %f3103;
$L__BB0_578:
selp.f32 %f3105, 0f3C0885E4, 0f3D2AAABB, %p503;
fma.rn.f32 %f3106, %f5472, %f662, %f3105;
selp.f32 %f3107, 0fBE2AAAA8, 0fBEFFFFFF, %p503;
fma.rn.f32 %f3108, %f3106, %f662, %f3107;
mov.f32 %f3109, 0f00000000;
fma.rn.f32 %f3110, %f662, %f661, %f3109;
fma.rn.f32 %f5281, %f3108, %f3110, %f661;
and.b32 %r4192, %r8114, 2;
setp.eq.s32 %p505, %r4192, 0;
@%p505 bra $L__BB0_580;
mov.f32 %f3112, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3112, %f3109;
$L__BB0_580:
setp.lt.s32 %p10, %r11, %r727;
@%p495 bra $L__BB0_593;
mul.f32 %f3113, %f5406, 0f3F22F983;
cvt.rni.s32.f32 %r8118, %f3113;
cvt.rn.f32.s32 %f3114, %r8118;
mov.f32 %f3115, 0fBFC90FDA;
fma.rn.f32 %f3116, %f3114, %f3115, %f5406;
mov.f32 %f3117, 0fB3A22168;
fma.rn.f32 %f3118, %f3114, %f3117, %f3116;
mov.f32 %f3119, 0fA7C234C5;
fma.rn.f32 %f5475, %f3114, %f3119, %f3118;
abs.f32 %f670, %f5406;
setp.ltu.f32 %p507, %f670, 0f47CE4780;
@%p507 bra $L__BB0_589;
setp.eq.f32 %p508, %f670, 0f7F800000;
@%p508 bra $L__BB0_588;
bra.uni $L__BB0_583;
$L__BB0_588:
mov.f32 %f3122, 0f00000000;
mul.rn.f32 %f5475, %f5406, %f3122;
mov.u32 %r8118, 0;
bra.uni $L__BB0_589;
$L__BB0_583:
mov.b32 %r747, %f5406;
shr.u32 %r4194, %r747, 23;
and.b32 %r4195, %r4194, 255;
add.s32 %r748, %r4195, -128;
shl.b32 %r4196, %r747, 8;
or.b32 %r749, %r4196, -2147483648;
shr.u32 %r750, %r748, 5;
mov.u64 %rd2579, 0;
mov.u32 %r8115, 0;
mov.u64 %rd1198, __cudart_i2opi_f;
mov.u64 %rd2580, %rd2579;
$L__BB0_584:
.pragma "nounroll";
shl.b64 %rd1197, %rd2579, 2;
add.s64 %rd1199, %rd1198, %rd1197;
ld.global.nc.u32 %r4197, [%rd1199];
mad.wide.u32 %rd1200, %r4197, %r749, %rd2580;
shr.u64 %rd2580, %rd1200, 32;
add.s64 %rd1201, %rd1, %rd1197;
st.local.u32 [%rd1201], %rd1200;
add.s32 %r8115, %r8115, 1;
cvt.s64.s32 %rd2579, %r8115;
setp.ne.s32 %p509, %r8115, 6;
@%p509 bra $L__BB0_584;
st.local.u32 [%rd4], %rd2580;
mov.u32 %r4198, 4;
sub.s32 %r753, %r4198, %r750;
mov.u32 %r4199, 6;
sub.s32 %r4200, %r4199, %r750;
mul.wide.s32 %rd1202, %r4200, 4;
add.s64 %rd1203, %rd1, %rd1202;
ld.local.u32 %r8116, [%rd1203];
ld.local.u32 %r8117, [%rd1203+-4];
and.b32 %r756, %r748, 31;
setp.eq.s32 %p510, %r756, 0;
@%p510 bra $L__BB0_587;
mov.u32 %r4201, 32;
sub.s32 %r4202, %r4201, %r756;
shr.u32 %r4203, %r8117, %r4202;
shl.b32 %r4204, %r8116, %r756;
add.s32 %r8116, %r4203, %r4204;
mul.wide.s32 %rd1204, %r753, 4;
add.s64 %rd1205, %rd1, %rd1204;
ld.local.u32 %r4205, [%rd1205];
shr.u32 %r4206, %r4205, %r4202;
shl.b32 %r4207, %r8117, %r756;
add.s32 %r8117, %r4206, %r4207;
$L__BB0_587:
and.b32 %r4208, %r747, -2147483648;
shr.u32 %r4209, %r8117, 30;
shl.b32 %r4210, %r8116, 2;
or.b32 %r4211, %r4209, %r4210;
shr.u32 %r4212, %r4211, 31;
shr.u32 %r4213, %r8116, 30;
add.s32 %r4214, %r4212, %r4213;
neg.s32 %r4215, %r4214;
setp.eq.s32 %p511, %r4208, 0;
selp.b32 %r8118, %r4214, %r4215, %p511;
setp.ne.s32 %p512, %r4212, 0;
xor.b32 %r4216, %r4208, -2147483648;
selp.b32 %r4217, %r4216, %r4208, %p512;
selp.b32 %r4218, -1, 0, %p512;
xor.b32 %r4219, %r4211, %r4218;
shl.b32 %r4220, %r8117, 2;
xor.b32 %r4221, %r4220, %r4218;
cvt.u64.u32 %rd1206, %r4219;
cvt.u64.u32 %rd1207, %r4221;
bfi.b64 %rd1208, %rd1206, %rd1207, 32, 32;
cvt.rn.f64.s64 %fd75, %rd1208;
mul.f64 %fd76, %fd75, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3120, %fd76;
setp.eq.s32 %p513, %r4217, 0;
neg.f32 %f3121, %f3120;
selp.f32 %f5475, %f3120, %f3121, %p513;
$L__BB0_589:
add.s32 %r763, %r8118, 1;
and.b32 %r764, %r763, 1;
setp.eq.s32 %p514, %r764, 0;
selp.f32 %f674, %f5475, 0f3F800000, %p514;
mul.rn.f32 %f675, %f5475, %f5475;
mov.f32 %f5476, 0fB94D4153;
@%p514 bra $L__BB0_591;
mov.f32 %f3124, 0fBAB607ED;
mov.f32 %f3125, 0f37CBAC00;
fma.rn.f32 %f5476, %f3125, %f675, %f3124;
$L__BB0_591:
selp.f32 %f3126, 0f3C0885E4, 0f3D2AAABB, %p514;
fma.rn.f32 %f3127, %f5476, %f675, %f3126;
selp.f32 %f3128, 0fBE2AAAA8, 0fBEFFFFFF, %p514;
fma.rn.f32 %f3129, %f3127, %f675, %f3128;
mov.f32 %f3130, 0f00000000;
fma.rn.f32 %f3131, %f675, %f674, %f3130;
fma.rn.f32 %f5283, %f3129, %f3131, %f674;
and.b32 %r4223, %r763, 2;
setp.eq.s32 %p516, %r4223, 0;
@%p516 bra $L__BB0_593;
mov.f32 %f3133, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3133, %f3130;
$L__BB0_593:
selp.f32 %f682, %f5283, %f5284, %p10;
selp.f32 %f683, %f5281, %f5282, %p10;
@%p495 bra $L__BB0_595;
add.f32 %f5597, %f683, %f682;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_595:
@%p423 bra $L__BB0_624;
shl.b32 %r4224, %r12, 5;
mov.u32 %r4225, -32;
sub.s32 %r765, %r4225, %r4224;
setp.ge.s32 %p520, %r11, %r765;
@%p520 bra $L__BB0_609;
mul.f32 %f3136, %f5413, 0f3F22F983;
cvt.rni.s32.f32 %r8122, %f3136;
cvt.rn.f32.s32 %f3137, %r8122;
mov.f32 %f3138, 0fBFC90FDA;
fma.rn.f32 %f3139, %f3137, %f3138, %f5413;
mov.f32 %f3140, 0fB3A22168;
fma.rn.f32 %f3141, %f3137, %f3140, %f3139;
mov.f32 %f3142, 0fA7C234C5;
fma.rn.f32 %f5484, %f3137, %f3142, %f3141;
abs.f32 %f691, %f5413;
setp.ltu.f32 %p521, %f691, 0f47CE4780;
@%p521 bra $L__BB0_605;
setp.eq.f32 %p522, %f691, 0f7F800000;
@%p522 bra $L__BB0_604;
bra.uni $L__BB0_599;
$L__BB0_604:
mov.f32 %f3145, 0f00000000;
mul.rn.f32 %f5484, %f5413, %f3145;
mov.u32 %r8122, 0;
bra.uni $L__BB0_605;
$L__BB0_599:
mov.b32 %r767, %f5413;
shr.u32 %r4227, %r767, 23;
and.b32 %r4228, %r4227, 255;
add.s32 %r768, %r4228, -128;
shl.b32 %r4229, %r767, 8;
or.b32 %r769, %r4229, -2147483648;
shr.u32 %r770, %r768, 5;
mov.u64 %rd2581, 0;
mov.u32 %r8119, 0;
mov.u64 %rd1212, __cudart_i2opi_f;
mov.u64 %rd2582, %rd2581;
$L__BB0_600:
.pragma "nounroll";
shl.b64 %rd1211, %rd2581, 2;
add.s64 %rd1213, %rd1212, %rd1211;
ld.global.nc.u32 %r4230, [%rd1213];
mad.wide.u32 %rd1214, %r4230, %r769, %rd2582;
shr.u64 %rd2582, %rd1214, 32;
add.s64 %rd1215, %rd1, %rd1211;
st.local.u32 [%rd1215], %rd1214;
add.s32 %r8119, %r8119, 1;
cvt.s64.s32 %rd2581, %r8119;
setp.ne.s32 %p523, %r8119, 6;
@%p523 bra $L__BB0_600;
st.local.u32 [%rd4], %rd2582;
mov.u32 %r4231, 4;
sub.s32 %r773, %r4231, %r770;
mov.u32 %r4232, 6;
sub.s32 %r4233, %r4232, %r770;
mul.wide.s32 %rd1216, %r4233, 4;
add.s64 %rd1217, %rd1, %rd1216;
ld.local.u32 %r8120, [%rd1217];
ld.local.u32 %r8121, [%rd1217+-4];
and.b32 %r776, %r768, 31;
setp.eq.s32 %p524, %r776, 0;
@%p524 bra $L__BB0_603;
mov.u32 %r4234, 32;
sub.s32 %r4235, %r4234, %r776;
shr.u32 %r4236, %r8121, %r4235;
shl.b32 %r4237, %r8120, %r776;
add.s32 %r8120, %r4236, %r4237;
mul.wide.s32 %rd1218, %r773, 4;
add.s64 %rd1219, %rd1, %rd1218;
ld.local.u32 %r4238, [%rd1219];
shr.u32 %r4239, %r4238, %r4235;
shl.b32 %r4240, %r8121, %r776;
add.s32 %r8121, %r4239, %r4240;
$L__BB0_603:
and.b32 %r4241, %r767, -2147483648;
shr.u32 %r4242, %r8121, 30;
shl.b32 %r4243, %r8120, 2;
or.b32 %r4244, %r4242, %r4243;
shr.u32 %r4245, %r4244, 31;
shr.u32 %r4246, %r8120, 30;
add.s32 %r4247, %r4245, %r4246;
neg.s32 %r4248, %r4247;
setp.eq.s32 %p525, %r4241, 0;
selp.b32 %r8122, %r4247, %r4248, %p525;
setp.ne.s32 %p526, %r4245, 0;
xor.b32 %r4249, %r4241, -2147483648;
selp.b32 %r4250, %r4249, %r4241, %p526;
selp.b32 %r4251, -1, 0, %p526;
xor.b32 %r4252, %r4244, %r4251;
shl.b32 %r4253, %r8121, 2;
xor.b32 %r4254, %r4253, %r4251;
cvt.u64.u32 %rd1220, %r4252;
cvt.u64.u32 %rd1221, %r4254;
bfi.b64 %rd1222, %rd1220, %rd1221, 32, 32;
cvt.rn.f64.s64 %fd77, %rd1222;
mul.f64 %fd78, %fd77, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3143, %fd78;
setp.eq.s32 %p527, %r4250, 0;
neg.f32 %f3144, %f3143;
selp.f32 %f5484, %f3143, %f3144, %p527;
$L__BB0_605:
and.b32 %r783, %r8122, 1;
setp.eq.s32 %p528, %r783, 0;
selp.f32 %f695, %f5484, 0f3F800000, %p528;
mul.rn.f32 %f696, %f5484, %f5484;
mov.f32 %f5485, 0fB94D4153;
@%p528 bra $L__BB0_607;
mov.f32 %f3147, 0fBAB607ED;
mov.f32 %f3148, 0f37CBAC00;
fma.rn.f32 %f5485, %f3148, %f696, %f3147;
$L__BB0_607:
selp.f32 %f3149, 0f3C0885E4, 0f3D2AAABB, %p528;
fma.rn.f32 %f3150, %f5485, %f696, %f3149;
selp.f32 %f3151, 0fBE2AAAA8, 0fBEFFFFFF, %p528;
fma.rn.f32 %f3152, %f3150, %f696, %f3151;
mov.f32 %f3153, 0f00000000;
fma.rn.f32 %f3154, %f696, %f695, %f3153;
fma.rn.f32 %f5281, %f3152, %f3154, %f695;
and.b32 %r4256, %r8122, 2;
setp.eq.s32 %p530, %r4256, 0;
@%p530 bra $L__BB0_609;
mov.f32 %f3156, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3156, %f3153;
$L__BB0_609:
setp.lt.s32 %p11, %r11, %r765;
@%p520 bra $L__BB0_622;
mul.f32 %f3157, %f5405, 0f3F22F983;
cvt.rni.s32.f32 %r8126, %f3157;
cvt.rn.f32.s32 %f3158, %r8126;
mov.f32 %f3159, 0fBFC90FDA;
fma.rn.f32 %f3160, %f3158, %f3159, %f5405;
mov.f32 %f3161, 0fB3A22168;
fma.rn.f32 %f3162, %f3158, %f3161, %f3160;
mov.f32 %f3163, 0fA7C234C5;
fma.rn.f32 %f5488, %f3158, %f3163, %f3162;
abs.f32 %f704, %f5405;
setp.ltu.f32 %p532, %f704, 0f47CE4780;
@%p532 bra $L__BB0_618;
setp.eq.f32 %p533, %f704, 0f7F800000;
@%p533 bra $L__BB0_617;
bra.uni $L__BB0_612;
$L__BB0_617:
mov.f32 %f3166, 0f00000000;
mul.rn.f32 %f5488, %f5405, %f3166;
mov.u32 %r8126, 0;
bra.uni $L__BB0_618;
$L__BB0_612:
mov.b32 %r785, %f5405;
shr.u32 %r4258, %r785, 23;
and.b32 %r4259, %r4258, 255;
add.s32 %r786, %r4259, -128;
shl.b32 %r4260, %r785, 8;
or.b32 %r787, %r4260, -2147483648;
shr.u32 %r788, %r786, 5;
mov.u64 %rd2583, 0;
mov.u32 %r8123, 0;
mov.u64 %rd1226, __cudart_i2opi_f;
mov.u64 %rd2584, %rd2583;
$L__BB0_613:
.pragma "nounroll";
shl.b64 %rd1225, %rd2583, 2;
add.s64 %rd1227, %rd1226, %rd1225;
ld.global.nc.u32 %r4261, [%rd1227];
mad.wide.u32 %rd1228, %r4261, %r787, %rd2584;
shr.u64 %rd2584, %rd1228, 32;
add.s64 %rd1229, %rd1, %rd1225;
st.local.u32 [%rd1229], %rd1228;
add.s32 %r8123, %r8123, 1;
cvt.s64.s32 %rd2583, %r8123;
setp.ne.s32 %p534, %r8123, 6;
@%p534 bra $L__BB0_613;
st.local.u32 [%rd4], %rd2584;
mov.u32 %r4262, 4;
sub.s32 %r791, %r4262, %r788;
mov.u32 %r4263, 6;
sub.s32 %r4264, %r4263, %r788;
mul.wide.s32 %rd1230, %r4264, 4;
add.s64 %rd1231, %rd1, %rd1230;
ld.local.u32 %r8124, [%rd1231];
ld.local.u32 %r8125, [%rd1231+-4];
and.b32 %r794, %r786, 31;
setp.eq.s32 %p535, %r794, 0;
@%p535 bra $L__BB0_616;
mov.u32 %r4265, 32;
sub.s32 %r4266, %r4265, %r794;
shr.u32 %r4267, %r8125, %r4266;
shl.b32 %r4268, %r8124, %r794;
add.s32 %r8124, %r4267, %r4268;
mul.wide.s32 %rd1232, %r791, 4;
add.s64 %rd1233, %rd1, %rd1232;
ld.local.u32 %r4269, [%rd1233];
shr.u32 %r4270, %r4269, %r4266;
shl.b32 %r4271, %r8125, %r794;
add.s32 %r8125, %r4270, %r4271;
$L__BB0_616:
and.b32 %r4272, %r785, -2147483648;
shr.u32 %r4273, %r8125, 30;
shl.b32 %r4274, %r8124, 2;
or.b32 %r4275, %r4273, %r4274;
shr.u32 %r4276, %r4275, 31;
shr.u32 %r4277, %r8124, 30;
add.s32 %r4278, %r4276, %r4277;
neg.s32 %r4279, %r4278;
setp.eq.s32 %p536, %r4272, 0;
selp.b32 %r8126, %r4278, %r4279, %p536;
setp.ne.s32 %p537, %r4276, 0;
xor.b32 %r4280, %r4272, -2147483648;
selp.b32 %r4281, %r4280, %r4272, %p537;
selp.b32 %r4282, -1, 0, %p537;
xor.b32 %r4283, %r4275, %r4282;
shl.b32 %r4284, %r8125, 2;
xor.b32 %r4285, %r4284, %r4282;
cvt.u64.u32 %rd1234, %r4283;
cvt.u64.u32 %rd1235, %r4285;
bfi.b64 %rd1236, %rd1234, %rd1235, 32, 32;
cvt.rn.f64.s64 %fd79, %rd1236;
mul.f64 %fd80, %fd79, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3164, %fd80;
setp.eq.s32 %p538, %r4281, 0;
neg.f32 %f3165, %f3164;
selp.f32 %f5488, %f3164, %f3165, %p538;
$L__BB0_618:
add.s32 %r801, %r8126, 1;
and.b32 %r802, %r801, 1;
setp.eq.s32 %p539, %r802, 0;
selp.f32 %f708, %f5488, 0f3F800000, %p539;
mul.rn.f32 %f709, %f5488, %f5488;
mov.f32 %f5489, 0fB94D4153;
@%p539 bra $L__BB0_620;
mov.f32 %f3168, 0fBAB607ED;
mov.f32 %f3169, 0f37CBAC00;
fma.rn.f32 %f5489, %f3169, %f709, %f3168;
$L__BB0_620:
selp.f32 %f3170, 0f3C0885E4, 0f3D2AAABB, %p539;
fma.rn.f32 %f3171, %f5489, %f709, %f3170;
selp.f32 %f3172, 0fBE2AAAA8, 0fBEFFFFFF, %p539;
fma.rn.f32 %f3173, %f3171, %f709, %f3172;
mov.f32 %f3174, 0f00000000;
fma.rn.f32 %f3175, %f709, %f708, %f3174;
fma.rn.f32 %f5283, %f3173, %f3175, %f708;
and.b32 %r4287, %r801, 2;
setp.eq.s32 %p541, %r4287, 0;
@%p541 bra $L__BB0_622;
mov.f32 %f3177, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3177, %f3174;
$L__BB0_622:
selp.f32 %f716, %f5283, %f5284, %p11;
selp.f32 %f717, %f5281, %f5282, %p11;
@%p520 bra $L__BB0_624;
add.f32 %f5596, %f717, %f716;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_624:
@%p426 bra $L__BB0_653;
shl.b32 %r4288, %r12, 5;
neg.s32 %r803, %r4288;
setp.ge.s32 %p545, %r11, %r803;
@%p545 bra $L__BB0_638;
mul.f32 %f3180, %f5412, 0f3F22F983;
cvt.rni.s32.f32 %r8130, %f3180;
cvt.rn.f32.s32 %f3181, %r8130;
mov.f32 %f3182, 0fBFC90FDA;
fma.rn.f32 %f3183, %f3181, %f3182, %f5412;
mov.f32 %f3184, 0fB3A22168;
fma.rn.f32 %f3185, %f3181, %f3184, %f3183;
mov.f32 %f3186, 0fA7C234C5;
fma.rn.f32 %f5497, %f3181, %f3186, %f3185;
abs.f32 %f725, %f5412;
setp.ltu.f32 %p546, %f725, 0f47CE4780;
@%p546 bra $L__BB0_634;
setp.eq.f32 %p547, %f725, 0f7F800000;
@%p547 bra $L__BB0_633;
bra.uni $L__BB0_628;
$L__BB0_633:
mov.f32 %f3189, 0f00000000;
mul.rn.f32 %f5497, %f5412, %f3189;
mov.u32 %r8130, 0;
bra.uni $L__BB0_634;
$L__BB0_628:
mov.b32 %r805, %f5412;
shr.u32 %r4290, %r805, 23;
and.b32 %r4291, %r4290, 255;
add.s32 %r806, %r4291, -128;
shl.b32 %r4292, %r805, 8;
or.b32 %r807, %r4292, -2147483648;
shr.u32 %r808, %r806, 5;
mov.u64 %rd2585, 0;
mov.u32 %r8127, 0;
mov.u64 %rd1240, __cudart_i2opi_f;
mov.u64 %rd2586, %rd2585;
$L__BB0_629:
.pragma "nounroll";
shl.b64 %rd1239, %rd2585, 2;
add.s64 %rd1241, %rd1240, %rd1239;
ld.global.nc.u32 %r4293, [%rd1241];
mad.wide.u32 %rd1242, %r4293, %r807, %rd2586;
shr.u64 %rd2586, %rd1242, 32;
add.s64 %rd1243, %rd1, %rd1239;
st.local.u32 [%rd1243], %rd1242;
add.s32 %r8127, %r8127, 1;
cvt.s64.s32 %rd2585, %r8127;
setp.ne.s32 %p548, %r8127, 6;
@%p548 bra $L__BB0_629;
st.local.u32 [%rd4], %rd2586;
mov.u32 %r4294, 4;
sub.s32 %r811, %r4294, %r808;
mov.u32 %r4295, 6;
sub.s32 %r4296, %r4295, %r808;
mul.wide.s32 %rd1244, %r4296, 4;
add.s64 %rd1245, %rd1, %rd1244;
ld.local.u32 %r8128, [%rd1245];
ld.local.u32 %r8129, [%rd1245+-4];
and.b32 %r814, %r806, 31;
setp.eq.s32 %p549, %r814, 0;
@%p549 bra $L__BB0_632;
mov.u32 %r4297, 32;
sub.s32 %r4298, %r4297, %r814;
shr.u32 %r4299, %r8129, %r4298;
shl.b32 %r4300, %r8128, %r814;
add.s32 %r8128, %r4299, %r4300;
mul.wide.s32 %rd1246, %r811, 4;
add.s64 %rd1247, %rd1, %rd1246;
ld.local.u32 %r4301, [%rd1247];
shr.u32 %r4302, %r4301, %r4298;
shl.b32 %r4303, %r8129, %r814;
add.s32 %r8129, %r4302, %r4303;
$L__BB0_632:
and.b32 %r4304, %r805, -2147483648;
shr.u32 %r4305, %r8129, 30;
shl.b32 %r4306, %r8128, 2;
or.b32 %r4307, %r4305, %r4306;
shr.u32 %r4308, %r4307, 31;
shr.u32 %r4309, %r8128, 30;
add.s32 %r4310, %r4308, %r4309;
neg.s32 %r4311, %r4310;
setp.eq.s32 %p550, %r4304, 0;
selp.b32 %r8130, %r4310, %r4311, %p550;
setp.ne.s32 %p551, %r4308, 0;
xor.b32 %r4312, %r4304, -2147483648;
selp.b32 %r4313, %r4312, %r4304, %p551;
selp.b32 %r4314, -1, 0, %p551;
xor.b32 %r4315, %r4307, %r4314;
shl.b32 %r4316, %r8129, 2;
xor.b32 %r4317, %r4316, %r4314;
cvt.u64.u32 %rd1248, %r4315;
cvt.u64.u32 %rd1249, %r4317;
bfi.b64 %rd1250, %rd1248, %rd1249, 32, 32;
cvt.rn.f64.s64 %fd81, %rd1250;
mul.f64 %fd82, %fd81, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3187, %fd82;
setp.eq.s32 %p552, %r4313, 0;
neg.f32 %f3188, %f3187;
selp.f32 %f5497, %f3187, %f3188, %p552;
$L__BB0_634:
and.b32 %r821, %r8130, 1;
setp.eq.s32 %p553, %r821, 0;
selp.f32 %f729, %f5497, 0f3F800000, %p553;
mul.rn.f32 %f730, %f5497, %f5497;
mov.f32 %f5498, 0fB94D4153;
@%p553 bra $L__BB0_636;
mov.f32 %f3191, 0fBAB607ED;
mov.f32 %f3192, 0f37CBAC00;
fma.rn.f32 %f5498, %f3192, %f730, %f3191;
$L__BB0_636:
selp.f32 %f3193, 0f3C0885E4, 0f3D2AAABB, %p553;
fma.rn.f32 %f3194, %f5498, %f730, %f3193;
selp.f32 %f3195, 0fBE2AAAA8, 0fBEFFFFFF, %p553;
fma.rn.f32 %f3196, %f3194, %f730, %f3195;
mov.f32 %f3197, 0f00000000;
fma.rn.f32 %f3198, %f730, %f729, %f3197;
fma.rn.f32 %f5281, %f3196, %f3198, %f729;
and.b32 %r4319, %r8130, 2;
setp.eq.s32 %p555, %r4319, 0;
@%p555 bra $L__BB0_638;
mov.f32 %f3200, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3200, %f3197;
$L__BB0_638:
setp.lt.s32 %p12, %r11, %r803;
@%p545 bra $L__BB0_651;
mul.f32 %f3201, %f5404, 0f3F22F983;
cvt.rni.s32.f32 %r8134, %f3201;
cvt.rn.f32.s32 %f3202, %r8134;
mov.f32 %f3203, 0fBFC90FDA;
fma.rn.f32 %f3204, %f3202, %f3203, %f5404;
mov.f32 %f3205, 0fB3A22168;
fma.rn.f32 %f3206, %f3202, %f3205, %f3204;
mov.f32 %f3207, 0fA7C234C5;
fma.rn.f32 %f5501, %f3202, %f3207, %f3206;
abs.f32 %f738, %f5404;
setp.ltu.f32 %p557, %f738, 0f47CE4780;
@%p557 bra $L__BB0_647;
setp.eq.f32 %p558, %f738, 0f7F800000;
@%p558 bra $L__BB0_646;
bra.uni $L__BB0_641;
$L__BB0_646:
mov.f32 %f3210, 0f00000000;
mul.rn.f32 %f5501, %f5404, %f3210;
mov.u32 %r8134, 0;
bra.uni $L__BB0_647;
$L__BB0_641:
mov.b32 %r823, %f5404;
shr.u32 %r4321, %r823, 23;
and.b32 %r4322, %r4321, 255;
add.s32 %r824, %r4322, -128;
shl.b32 %r4323, %r823, 8;
or.b32 %r825, %r4323, -2147483648;
shr.u32 %r826, %r824, 5;
mov.u64 %rd2587, 0;
mov.u32 %r8131, 0;
mov.u64 %rd1254, __cudart_i2opi_f;
mov.u64 %rd2588, %rd2587;
$L__BB0_642:
.pragma "nounroll";
shl.b64 %rd1253, %rd2587, 2;
add.s64 %rd1255, %rd1254, %rd1253;
ld.global.nc.u32 %r4324, [%rd1255];
mad.wide.u32 %rd1256, %r4324, %r825, %rd2588;
shr.u64 %rd2588, %rd1256, 32;
add.s64 %rd1257, %rd1, %rd1253;
st.local.u32 [%rd1257], %rd1256;
add.s32 %r8131, %r8131, 1;
cvt.s64.s32 %rd2587, %r8131;
setp.ne.s32 %p559, %r8131, 6;
@%p559 bra $L__BB0_642;
st.local.u32 [%rd4], %rd2588;
mov.u32 %r4325, 4;
sub.s32 %r829, %r4325, %r826;
mov.u32 %r4326, 6;
sub.s32 %r4327, %r4326, %r826;
mul.wide.s32 %rd1258, %r4327, 4;
add.s64 %rd1259, %rd1, %rd1258;
ld.local.u32 %r8132, [%rd1259];
ld.local.u32 %r8133, [%rd1259+-4];
and.b32 %r832, %r824, 31;
setp.eq.s32 %p560, %r832, 0;
@%p560 bra $L__BB0_645;
mov.u32 %r4328, 32;
sub.s32 %r4329, %r4328, %r832;
shr.u32 %r4330, %r8133, %r4329;
shl.b32 %r4331, %r8132, %r832;
add.s32 %r8132, %r4330, %r4331;
mul.wide.s32 %rd1260, %r829, 4;
add.s64 %rd1261, %rd1, %rd1260;
ld.local.u32 %r4332, [%rd1261];
shr.u32 %r4333, %r4332, %r4329;
shl.b32 %r4334, %r8133, %r832;
add.s32 %r8133, %r4333, %r4334;
$L__BB0_645:
and.b32 %r4335, %r823, -2147483648;
shr.u32 %r4336, %r8133, 30;
shl.b32 %r4337, %r8132, 2;
or.b32 %r4338, %r4336, %r4337;
shr.u32 %r4339, %r4338, 31;
shr.u32 %r4340, %r8132, 30;
add.s32 %r4341, %r4339, %r4340;
neg.s32 %r4342, %r4341;
setp.eq.s32 %p561, %r4335, 0;
selp.b32 %r8134, %r4341, %r4342, %p561;
setp.ne.s32 %p562, %r4339, 0;
xor.b32 %r4343, %r4335, -2147483648;
selp.b32 %r4344, %r4343, %r4335, %p562;
selp.b32 %r4345, -1, 0, %p562;
xor.b32 %r4346, %r4338, %r4345;
shl.b32 %r4347, %r8133, 2;
xor.b32 %r4348, %r4347, %r4345;
cvt.u64.u32 %rd1262, %r4346;
cvt.u64.u32 %rd1263, %r4348;
bfi.b64 %rd1264, %rd1262, %rd1263, 32, 32;
cvt.rn.f64.s64 %fd83, %rd1264;
mul.f64 %fd84, %fd83, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3208, %fd84;
setp.eq.s32 %p563, %r4344, 0;
neg.f32 %f3209, %f3208;
selp.f32 %f5501, %f3208, %f3209, %p563;
$L__BB0_647:
add.s32 %r839, %r8134, 1;
and.b32 %r840, %r839, 1;
setp.eq.s32 %p564, %r840, 0;
selp.f32 %f742, %f5501, 0f3F800000, %p564;
mul.rn.f32 %f743, %f5501, %f5501;
mov.f32 %f5502, 0fB94D4153;
@%p564 bra $L__BB0_649;
mov.f32 %f3212, 0fBAB607ED;
mov.f32 %f3213, 0f37CBAC00;
fma.rn.f32 %f5502, %f3213, %f743, %f3212;
$L__BB0_649:
selp.f32 %f3214, 0f3C0885E4, 0f3D2AAABB, %p564;
fma.rn.f32 %f3215, %f5502, %f743, %f3214;
selp.f32 %f3216, 0fBE2AAAA8, 0fBEFFFFFF, %p564;
fma.rn.f32 %f3217, %f3215, %f743, %f3216;
mov.f32 %f3218, 0f00000000;
fma.rn.f32 %f3219, %f743, %f742, %f3218;
fma.rn.f32 %f5283, %f3217, %f3219, %f742;
and.b32 %r4350, %r839, 2;
setp.eq.s32 %p566, %r4350, 0;
@%p566 bra $L__BB0_651;
mov.f32 %f3221, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3221, %f3218;
$L__BB0_651:
selp.f32 %f750, %f5283, %f5284, %p12;
selp.f32 %f751, %f5281, %f5282, %p12;
@%p545 bra $L__BB0_653;
add.f32 %f5595, %f751, %f750;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_653:
@%p426 bra $L__BB0_682;
shl.b32 %r4351, %r12, 5;
mov.u32 %r4352, -32;
sub.s32 %r841, %r4352, %r4351;
setp.ge.s32 %p570, %r11, %r841;
@%p570 bra $L__BB0_667;
mul.f32 %f3224, %f5411, 0f3F22F983;
cvt.rni.s32.f32 %r8138, %f3224;
cvt.rn.f32.s32 %f3225, %r8138;
mov.f32 %f3226, 0fBFC90FDA;
fma.rn.f32 %f3227, %f3225, %f3226, %f5411;
mov.f32 %f3228, 0fB3A22168;
fma.rn.f32 %f3229, %f3225, %f3228, %f3227;
mov.f32 %f3230, 0fA7C234C5;
fma.rn.f32 %f5510, %f3225, %f3230, %f3229;
abs.f32 %f759, %f5411;
setp.ltu.f32 %p571, %f759, 0f47CE4780;
@%p571 bra $L__BB0_663;
setp.eq.f32 %p572, %f759, 0f7F800000;
@%p572 bra $L__BB0_662;
bra.uni $L__BB0_657;
$L__BB0_662:
mov.f32 %f3233, 0f00000000;
mul.rn.f32 %f5510, %f5411, %f3233;
mov.u32 %r8138, 0;
bra.uni $L__BB0_663;
$L__BB0_657:
mov.b32 %r843, %f5411;
shr.u32 %r4354, %r843, 23;
and.b32 %r4355, %r4354, 255;
add.s32 %r844, %r4355, -128;
shl.b32 %r4356, %r843, 8;
or.b32 %r845, %r4356, -2147483648;
shr.u32 %r846, %r844, 5;
mov.u64 %rd2589, 0;
mov.u32 %r8135, 0;
mov.u64 %rd1268, __cudart_i2opi_f;
mov.u64 %rd2590, %rd2589;
$L__BB0_658:
.pragma "nounroll";
shl.b64 %rd1267, %rd2589, 2;
add.s64 %rd1269, %rd1268, %rd1267;
ld.global.nc.u32 %r4357, [%rd1269];
mad.wide.u32 %rd1270, %r4357, %r845, %rd2590;
shr.u64 %rd2590, %rd1270, 32;
add.s64 %rd1271, %rd1, %rd1267;
st.local.u32 [%rd1271], %rd1270;
add.s32 %r8135, %r8135, 1;
cvt.s64.s32 %rd2589, %r8135;
setp.ne.s32 %p573, %r8135, 6;
@%p573 bra $L__BB0_658;
st.local.u32 [%rd4], %rd2590;
mov.u32 %r4358, 4;
sub.s32 %r849, %r4358, %r846;
mov.u32 %r4359, 6;
sub.s32 %r4360, %r4359, %r846;
mul.wide.s32 %rd1272, %r4360, 4;
add.s64 %rd1273, %rd1, %rd1272;
ld.local.u32 %r8136, [%rd1273];
ld.local.u32 %r8137, [%rd1273+-4];
and.b32 %r852, %r844, 31;
setp.eq.s32 %p574, %r852, 0;
@%p574 bra $L__BB0_661;
mov.u32 %r4361, 32;
sub.s32 %r4362, %r4361, %r852;
shr.u32 %r4363, %r8137, %r4362;
shl.b32 %r4364, %r8136, %r852;
add.s32 %r8136, %r4363, %r4364;
mul.wide.s32 %rd1274, %r849, 4;
add.s64 %rd1275, %rd1, %rd1274;
ld.local.u32 %r4365, [%rd1275];
shr.u32 %r4366, %r4365, %r4362;
shl.b32 %r4367, %r8137, %r852;
add.s32 %r8137, %r4366, %r4367;
$L__BB0_661:
and.b32 %r4368, %r843, -2147483648;
shr.u32 %r4369, %r8137, 30;
shl.b32 %r4370, %r8136, 2;
or.b32 %r4371, %r4369, %r4370;
shr.u32 %r4372, %r4371, 31;
shr.u32 %r4373, %r8136, 30;
add.s32 %r4374, %r4372, %r4373;
neg.s32 %r4375, %r4374;
setp.eq.s32 %p575, %r4368, 0;
selp.b32 %r8138, %r4374, %r4375, %p575;
setp.ne.s32 %p576, %r4372, 0;
xor.b32 %r4376, %r4368, -2147483648;
selp.b32 %r4377, %r4376, %r4368, %p576;
selp.b32 %r4378, -1, 0, %p576;
xor.b32 %r4379, %r4371, %r4378;
shl.b32 %r4380, %r8137, 2;
xor.b32 %r4381, %r4380, %r4378;
cvt.u64.u32 %rd1276, %r4379;
cvt.u64.u32 %rd1277, %r4381;
bfi.b64 %rd1278, %rd1276, %rd1277, 32, 32;
cvt.rn.f64.s64 %fd85, %rd1278;
mul.f64 %fd86, %fd85, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3231, %fd86;
setp.eq.s32 %p577, %r4377, 0;
neg.f32 %f3232, %f3231;
selp.f32 %f5510, %f3231, %f3232, %p577;
$L__BB0_663:
and.b32 %r859, %r8138, 1;
setp.eq.s32 %p578, %r859, 0;
selp.f32 %f763, %f5510, 0f3F800000, %p578;
mul.rn.f32 %f764, %f5510, %f5510;
mov.f32 %f5511, 0fB94D4153;
@%p578 bra $L__BB0_665;
mov.f32 %f3235, 0fBAB607ED;
mov.f32 %f3236, 0f37CBAC00;
fma.rn.f32 %f5511, %f3236, %f764, %f3235;
$L__BB0_665:
selp.f32 %f3237, 0f3C0885E4, 0f3D2AAABB, %p578;
fma.rn.f32 %f3238, %f5511, %f764, %f3237;
selp.f32 %f3239, 0fBE2AAAA8, 0fBEFFFFFF, %p578;
fma.rn.f32 %f3240, %f3238, %f764, %f3239;
mov.f32 %f3241, 0f00000000;
fma.rn.f32 %f3242, %f764, %f763, %f3241;
fma.rn.f32 %f5281, %f3240, %f3242, %f763;
and.b32 %r4383, %r8138, 2;
setp.eq.s32 %p580, %r4383, 0;
@%p580 bra $L__BB0_667;
mov.f32 %f3244, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3244, %f3241;
$L__BB0_667:
setp.lt.s32 %p13, %r11, %r841;
@%p570 bra $L__BB0_680;
mul.f32 %f3245, %f5403, 0f3F22F983;
cvt.rni.s32.f32 %r8142, %f3245;
cvt.rn.f32.s32 %f3246, %r8142;
mov.f32 %f3247, 0fBFC90FDA;
fma.rn.f32 %f3248, %f3246, %f3247, %f5403;
mov.f32 %f3249, 0fB3A22168;
fma.rn.f32 %f3250, %f3246, %f3249, %f3248;
mov.f32 %f3251, 0fA7C234C5;
fma.rn.f32 %f5514, %f3246, %f3251, %f3250;
abs.f32 %f772, %f5403;
setp.ltu.f32 %p582, %f772, 0f47CE4780;
@%p582 bra $L__BB0_676;
setp.eq.f32 %p583, %f772, 0f7F800000;
@%p583 bra $L__BB0_675;
bra.uni $L__BB0_670;
$L__BB0_675:
mov.f32 %f3254, 0f00000000;
mul.rn.f32 %f5514, %f5403, %f3254;
mov.u32 %r8142, 0;
bra.uni $L__BB0_676;
$L__BB0_670:
mov.b32 %r861, %f5403;
shr.u32 %r4385, %r861, 23;
and.b32 %r4386, %r4385, 255;
add.s32 %r862, %r4386, -128;
shl.b32 %r4387, %r861, 8;
or.b32 %r863, %r4387, -2147483648;
shr.u32 %r864, %r862, 5;
mov.u64 %rd2591, 0;
mov.u32 %r8139, 0;
mov.u64 %rd1282, __cudart_i2opi_f;
mov.u64 %rd2592, %rd2591;
$L__BB0_671:
.pragma "nounroll";
shl.b64 %rd1281, %rd2591, 2;
add.s64 %rd1283, %rd1282, %rd1281;
ld.global.nc.u32 %r4388, [%rd1283];
mad.wide.u32 %rd1284, %r4388, %r863, %rd2592;
shr.u64 %rd2592, %rd1284, 32;
add.s64 %rd1285, %rd1, %rd1281;
st.local.u32 [%rd1285], %rd1284;
add.s32 %r8139, %r8139, 1;
cvt.s64.s32 %rd2591, %r8139;
setp.ne.s32 %p584, %r8139, 6;
@%p584 bra $L__BB0_671;
st.local.u32 [%rd4], %rd2592;
mov.u32 %r4389, 4;
sub.s32 %r867, %r4389, %r864;
mov.u32 %r4390, 6;
sub.s32 %r4391, %r4390, %r864;
mul.wide.s32 %rd1286, %r4391, 4;
add.s64 %rd1287, %rd1, %rd1286;
ld.local.u32 %r8140, [%rd1287];
ld.local.u32 %r8141, [%rd1287+-4];
and.b32 %r870, %r862, 31;
setp.eq.s32 %p585, %r870, 0;
@%p585 bra $L__BB0_674;
mov.u32 %r4392, 32;
sub.s32 %r4393, %r4392, %r870;
shr.u32 %r4394, %r8141, %r4393;
shl.b32 %r4395, %r8140, %r870;
add.s32 %r8140, %r4394, %r4395;
mul.wide.s32 %rd1288, %r867, 4;
add.s64 %rd1289, %rd1, %rd1288;
ld.local.u32 %r4396, [%rd1289];
shr.u32 %r4397, %r4396, %r4393;
shl.b32 %r4398, %r8141, %r870;
add.s32 %r8141, %r4397, %r4398;
$L__BB0_674:
and.b32 %r4399, %r861, -2147483648;
shr.u32 %r4400, %r8141, 30;
shl.b32 %r4401, %r8140, 2;
or.b32 %r4402, %r4400, %r4401;
shr.u32 %r4403, %r4402, 31;
shr.u32 %r4404, %r8140, 30;
add.s32 %r4405, %r4403, %r4404;
neg.s32 %r4406, %r4405;
setp.eq.s32 %p586, %r4399, 0;
selp.b32 %r8142, %r4405, %r4406, %p586;
setp.ne.s32 %p587, %r4403, 0;
xor.b32 %r4407, %r4399, -2147483648;
selp.b32 %r4408, %r4407, %r4399, %p587;
selp.b32 %r4409, -1, 0, %p587;
xor.b32 %r4410, %r4402, %r4409;
shl.b32 %r4411, %r8141, 2;
xor.b32 %r4412, %r4411, %r4409;
cvt.u64.u32 %rd1290, %r4410;
cvt.u64.u32 %rd1291, %r4412;
bfi.b64 %rd1292, %rd1290, %rd1291, 32, 32;
cvt.rn.f64.s64 %fd87, %rd1292;
mul.f64 %fd88, %fd87, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3252, %fd88;
setp.eq.s32 %p588, %r4408, 0;
neg.f32 %f3253, %f3252;
selp.f32 %f5514, %f3252, %f3253, %p588;
$L__BB0_676:
add.s32 %r877, %r8142, 1;
and.b32 %r878, %r877, 1;
setp.eq.s32 %p589, %r878, 0;
selp.f32 %f776, %f5514, 0f3F800000, %p589;
mul.rn.f32 %f777, %f5514, %f5514;
mov.f32 %f5515, 0fB94D4153;
@%p589 bra $L__BB0_678;
mov.f32 %f3256, 0fBAB607ED;
mov.f32 %f3257, 0f37CBAC00;
fma.rn.f32 %f5515, %f3257, %f777, %f3256;
$L__BB0_678:
selp.f32 %f3258, 0f3C0885E4, 0f3D2AAABB, %p589;
fma.rn.f32 %f3259, %f5515, %f777, %f3258;
selp.f32 %f3260, 0fBE2AAAA8, 0fBEFFFFFF, %p589;
fma.rn.f32 %f3261, %f3259, %f777, %f3260;
mov.f32 %f3262, 0f00000000;
fma.rn.f32 %f3263, %f777, %f776, %f3262;
fma.rn.f32 %f5283, %f3261, %f3263, %f776;
and.b32 %r4414, %r877, 2;
setp.eq.s32 %p591, %r4414, 0;
@%p591 bra $L__BB0_680;
mov.f32 %f3265, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3265, %f3262;
$L__BB0_680:
selp.f32 %f784, %f5283, %f5284, %p13;
selp.f32 %f785, %f5281, %f5282, %p13;
@%p570 bra $L__BB0_682;
add.f32 %f5594, %f785, %f784;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_682:
@%p429 bra $L__BB0_711;
shl.b32 %r4415, %r12, 5;
neg.s32 %r879, %r4415;
setp.ge.s32 %p595, %r11, %r879;
@%p595 bra $L__BB0_696;
mul.f32 %f3268, %f5410, 0f3F22F983;
cvt.rni.s32.f32 %r8146, %f3268;
cvt.rn.f32.s32 %f3269, %r8146;
mov.f32 %f3270, 0fBFC90FDA;
fma.rn.f32 %f3271, %f3269, %f3270, %f5410;
mov.f32 %f3272, 0fB3A22168;
fma.rn.f32 %f3273, %f3269, %f3272, %f3271;
mov.f32 %f3274, 0fA7C234C5;
fma.rn.f32 %f5523, %f3269, %f3274, %f3273;
abs.f32 %f793, %f5410;
setp.ltu.f32 %p596, %f793, 0f47CE4780;
@%p596 bra $L__BB0_692;
setp.eq.f32 %p597, %f793, 0f7F800000;
@%p597 bra $L__BB0_691;
bra.uni $L__BB0_686;
$L__BB0_691:
mov.f32 %f3277, 0f00000000;
mul.rn.f32 %f5523, %f5410, %f3277;
mov.u32 %r8146, 0;
bra.uni $L__BB0_692;
$L__BB0_686:
mov.b32 %r881, %f5410;
shr.u32 %r4417, %r881, 23;
and.b32 %r4418, %r4417, 255;
add.s32 %r882, %r4418, -128;
shl.b32 %r4419, %r881, 8;
or.b32 %r883, %r4419, -2147483648;
shr.u32 %r884, %r882, 5;
mov.u64 %rd2593, 0;
mov.u32 %r8143, 0;
mov.u64 %rd1296, __cudart_i2opi_f;
mov.u64 %rd2594, %rd2593;
$L__BB0_687:
.pragma "nounroll";
shl.b64 %rd1295, %rd2593, 2;
add.s64 %rd1297, %rd1296, %rd1295;
ld.global.nc.u32 %r4420, [%rd1297];
mad.wide.u32 %rd1298, %r4420, %r883, %rd2594;
shr.u64 %rd2594, %rd1298, 32;
add.s64 %rd1299, %rd1, %rd1295;
st.local.u32 [%rd1299], %rd1298;
add.s32 %r8143, %r8143, 1;
cvt.s64.s32 %rd2593, %r8143;
setp.ne.s32 %p598, %r8143, 6;
@%p598 bra $L__BB0_687;
st.local.u32 [%rd4], %rd2594;
mov.u32 %r4421, 4;
sub.s32 %r887, %r4421, %r884;
mov.u32 %r4422, 6;
sub.s32 %r4423, %r4422, %r884;
mul.wide.s32 %rd1300, %r4423, 4;
add.s64 %rd1301, %rd1, %rd1300;
ld.local.u32 %r8144, [%rd1301];
ld.local.u32 %r8145, [%rd1301+-4];
and.b32 %r890, %r882, 31;
setp.eq.s32 %p599, %r890, 0;
@%p599 bra $L__BB0_690;
mov.u32 %r4424, 32;
sub.s32 %r4425, %r4424, %r890;
shr.u32 %r4426, %r8145, %r4425;
shl.b32 %r4427, %r8144, %r890;
add.s32 %r8144, %r4426, %r4427;
mul.wide.s32 %rd1302, %r887, 4;
add.s64 %rd1303, %rd1, %rd1302;
ld.local.u32 %r4428, [%rd1303];
shr.u32 %r4429, %r4428, %r4425;
shl.b32 %r4430, %r8145, %r890;
add.s32 %r8145, %r4429, %r4430;
$L__BB0_690:
and.b32 %r4431, %r881, -2147483648;
shr.u32 %r4432, %r8145, 30;
shl.b32 %r4433, %r8144, 2;
or.b32 %r4434, %r4432, %r4433;
shr.u32 %r4435, %r4434, 31;
shr.u32 %r4436, %r8144, 30;
add.s32 %r4437, %r4435, %r4436;
neg.s32 %r4438, %r4437;
setp.eq.s32 %p600, %r4431, 0;
selp.b32 %r8146, %r4437, %r4438, %p600;
setp.ne.s32 %p601, %r4435, 0;
xor.b32 %r4439, %r4431, -2147483648;
selp.b32 %r4440, %r4439, %r4431, %p601;
selp.b32 %r4441, -1, 0, %p601;
xor.b32 %r4442, %r4434, %r4441;
shl.b32 %r4443, %r8145, 2;
xor.b32 %r4444, %r4443, %r4441;
cvt.u64.u32 %rd1304, %r4442;
cvt.u64.u32 %rd1305, %r4444;
bfi.b64 %rd1306, %rd1304, %rd1305, 32, 32;
cvt.rn.f64.s64 %fd89, %rd1306;
mul.f64 %fd90, %fd89, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3275, %fd90;
setp.eq.s32 %p602, %r4440, 0;
neg.f32 %f3276, %f3275;
selp.f32 %f5523, %f3275, %f3276, %p602;
$L__BB0_692:
and.b32 %r897, %r8146, 1;
setp.eq.s32 %p603, %r897, 0;
selp.f32 %f797, %f5523, 0f3F800000, %p603;
mul.rn.f32 %f798, %f5523, %f5523;
mov.f32 %f5524, 0fB94D4153;
@%p603 bra $L__BB0_694;
mov.f32 %f3279, 0fBAB607ED;
mov.f32 %f3280, 0f37CBAC00;
fma.rn.f32 %f5524, %f3280, %f798, %f3279;
$L__BB0_694:
selp.f32 %f3281, 0f3C0885E4, 0f3D2AAABB, %p603;
fma.rn.f32 %f3282, %f5524, %f798, %f3281;
selp.f32 %f3283, 0fBE2AAAA8, 0fBEFFFFFF, %p603;
fma.rn.f32 %f3284, %f3282, %f798, %f3283;
mov.f32 %f3285, 0f00000000;
fma.rn.f32 %f3286, %f798, %f797, %f3285;
fma.rn.f32 %f5281, %f3284, %f3286, %f797;
and.b32 %r4446, %r8146, 2;
setp.eq.s32 %p605, %r4446, 0;
@%p605 bra $L__BB0_696;
mov.f32 %f3288, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3288, %f3285;
$L__BB0_696:
setp.lt.s32 %p14, %r11, %r879;
@%p595 bra $L__BB0_709;
mul.f32 %f3289, %f5402, 0f3F22F983;
cvt.rni.s32.f32 %r8150, %f3289;
cvt.rn.f32.s32 %f3290, %r8150;
mov.f32 %f3291, 0fBFC90FDA;
fma.rn.f32 %f3292, %f3290, %f3291, %f5402;
mov.f32 %f3293, 0fB3A22168;
fma.rn.f32 %f3294, %f3290, %f3293, %f3292;
mov.f32 %f3295, 0fA7C234C5;
fma.rn.f32 %f5527, %f3290, %f3295, %f3294;
abs.f32 %f806, %f5402;
setp.ltu.f32 %p607, %f806, 0f47CE4780;
@%p607 bra $L__BB0_705;
setp.eq.f32 %p608, %f806, 0f7F800000;
@%p608 bra $L__BB0_704;
bra.uni $L__BB0_699;
$L__BB0_704:
mov.f32 %f3298, 0f00000000;
mul.rn.f32 %f5527, %f5402, %f3298;
mov.u32 %r8150, 0;
bra.uni $L__BB0_705;
$L__BB0_699:
mov.b32 %r899, %f5402;
shr.u32 %r4448, %r899, 23;
and.b32 %r4449, %r4448, 255;
add.s32 %r900, %r4449, -128;
shl.b32 %r4450, %r899, 8;
or.b32 %r901, %r4450, -2147483648;
shr.u32 %r902, %r900, 5;
mov.u64 %rd2595, 0;
mov.u32 %r8147, 0;
mov.u64 %rd1310, __cudart_i2opi_f;
mov.u64 %rd2596, %rd2595;
$L__BB0_700:
.pragma "nounroll";
shl.b64 %rd1309, %rd2595, 2;
add.s64 %rd1311, %rd1310, %rd1309;
ld.global.nc.u32 %r4451, [%rd1311];
mad.wide.u32 %rd1312, %r4451, %r901, %rd2596;
shr.u64 %rd2596, %rd1312, 32;
add.s64 %rd1313, %rd1, %rd1309;
st.local.u32 [%rd1313], %rd1312;
add.s32 %r8147, %r8147, 1;
cvt.s64.s32 %rd2595, %r8147;
setp.ne.s32 %p609, %r8147, 6;
@%p609 bra $L__BB0_700;
st.local.u32 [%rd4], %rd2596;
mov.u32 %r4452, 4;
sub.s32 %r905, %r4452, %r902;
mov.u32 %r4453, 6;
sub.s32 %r4454, %r4453, %r902;
mul.wide.s32 %rd1314, %r4454, 4;
add.s64 %rd1315, %rd1, %rd1314;
ld.local.u32 %r8148, [%rd1315];
ld.local.u32 %r8149, [%rd1315+-4];
and.b32 %r908, %r900, 31;
setp.eq.s32 %p610, %r908, 0;
@%p610 bra $L__BB0_703;
mov.u32 %r4455, 32;
sub.s32 %r4456, %r4455, %r908;
shr.u32 %r4457, %r8149, %r4456;
shl.b32 %r4458, %r8148, %r908;
add.s32 %r8148, %r4457, %r4458;
mul.wide.s32 %rd1316, %r905, 4;
add.s64 %rd1317, %rd1, %rd1316;
ld.local.u32 %r4459, [%rd1317];
shr.u32 %r4460, %r4459, %r4456;
shl.b32 %r4461, %r8149, %r908;
add.s32 %r8149, %r4460, %r4461;
$L__BB0_703:
and.b32 %r4462, %r899, -2147483648;
shr.u32 %r4463, %r8149, 30;
shl.b32 %r4464, %r8148, 2;
or.b32 %r4465, %r4463, %r4464;
shr.u32 %r4466, %r4465, 31;
shr.u32 %r4467, %r8148, 30;
add.s32 %r4468, %r4466, %r4467;
neg.s32 %r4469, %r4468;
setp.eq.s32 %p611, %r4462, 0;
selp.b32 %r8150, %r4468, %r4469, %p611;
setp.ne.s32 %p612, %r4466, 0;
xor.b32 %r4470, %r4462, -2147483648;
selp.b32 %r4471, %r4470, %r4462, %p612;
selp.b32 %r4472, -1, 0, %p612;
xor.b32 %r4473, %r4465, %r4472;
shl.b32 %r4474, %r8149, 2;
xor.b32 %r4475, %r4474, %r4472;
cvt.u64.u32 %rd1318, %r4473;
cvt.u64.u32 %rd1319, %r4475;
bfi.b64 %rd1320, %rd1318, %rd1319, 32, 32;
cvt.rn.f64.s64 %fd91, %rd1320;
mul.f64 %fd92, %fd91, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3296, %fd92;
setp.eq.s32 %p613, %r4471, 0;
neg.f32 %f3297, %f3296;
selp.f32 %f5527, %f3296, %f3297, %p613;
$L__BB0_705:
add.s32 %r915, %r8150, 1;
and.b32 %r916, %r915, 1;
setp.eq.s32 %p614, %r916, 0;
selp.f32 %f810, %f5527, 0f3F800000, %p614;
mul.rn.f32 %f811, %f5527, %f5527;
mov.f32 %f5528, 0fB94D4153;
@%p614 bra $L__BB0_707;
mov.f32 %f3300, 0fBAB607ED;
mov.f32 %f3301, 0f37CBAC00;
fma.rn.f32 %f5528, %f3301, %f811, %f3300;
$L__BB0_707:
selp.f32 %f3302, 0f3C0885E4, 0f3D2AAABB, %p614;
fma.rn.f32 %f3303, %f5528, %f811, %f3302;
selp.f32 %f3304, 0fBE2AAAA8, 0fBEFFFFFF, %p614;
fma.rn.f32 %f3305, %f3303, %f811, %f3304;
mov.f32 %f3306, 0f00000000;
fma.rn.f32 %f3307, %f811, %f810, %f3306;
fma.rn.f32 %f5283, %f3305, %f3307, %f810;
and.b32 %r4477, %r915, 2;
setp.eq.s32 %p616, %r4477, 0;
@%p616 bra $L__BB0_709;
mov.f32 %f3309, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3309, %f3306;
$L__BB0_709:
selp.f32 %f818, %f5283, %f5284, %p14;
selp.f32 %f819, %f5281, %f5282, %p14;
@%p595 bra $L__BB0_711;
add.f32 %f5593, %f819, %f818;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_711:
@%p429 bra $L__BB0_933;
shl.b32 %r4478, %r12, 5;
mov.u32 %r4479, -32;
sub.s32 %r917, %r4479, %r4478;
setp.ge.s32 %p620, %r11, %r917;
@%p620 bra $L__BB0_725;
mul.f32 %f3312, %f5409, 0f3F22F983;
cvt.rni.s32.f32 %r8154, %f3312;
cvt.rn.f32.s32 %f3313, %r8154;
mov.f32 %f3314, 0fBFC90FDA;
fma.rn.f32 %f3315, %f3313, %f3314, %f5409;
mov.f32 %f3316, 0fB3A22168;
fma.rn.f32 %f3317, %f3313, %f3316, %f3315;
mov.f32 %f3318, 0fA7C234C5;
fma.rn.f32 %f5536, %f3313, %f3318, %f3317;
abs.f32 %f827, %f5409;
setp.ltu.f32 %p621, %f827, 0f47CE4780;
@%p621 bra $L__BB0_721;
setp.eq.f32 %p622, %f827, 0f7F800000;
@%p622 bra $L__BB0_720;
bra.uni $L__BB0_715;
$L__BB0_720:
mov.f32 %f3321, 0f00000000;
mul.rn.f32 %f5536, %f5409, %f3321;
mov.u32 %r8154, 0;
bra.uni $L__BB0_721;
$L__BB0_715:
mov.b32 %r919, %f5409;
shr.u32 %r4481, %r919, 23;
and.b32 %r4482, %r4481, 255;
add.s32 %r920, %r4482, -128;
shl.b32 %r4483, %r919, 8;
or.b32 %r921, %r4483, -2147483648;
shr.u32 %r922, %r920, 5;
mov.u64 %rd2597, 0;
mov.u32 %r8151, 0;
mov.u64 %rd1324, __cudart_i2opi_f;
mov.u64 %rd2598, %rd2597;
$L__BB0_716:
.pragma "nounroll";
shl.b64 %rd1323, %rd2597, 2;
add.s64 %rd1325, %rd1324, %rd1323;
ld.global.nc.u32 %r4484, [%rd1325];
mad.wide.u32 %rd1326, %r4484, %r921, %rd2598;
shr.u64 %rd2598, %rd1326, 32;
add.s64 %rd1327, %rd1, %rd1323;
st.local.u32 [%rd1327], %rd1326;
add.s32 %r8151, %r8151, 1;
cvt.s64.s32 %rd2597, %r8151;
setp.ne.s32 %p623, %r8151, 6;
@%p623 bra $L__BB0_716;
st.local.u32 [%rd4], %rd2598;
mov.u32 %r4485, 4;
sub.s32 %r925, %r4485, %r922;
mov.u32 %r4486, 6;
sub.s32 %r4487, %r4486, %r922;
mul.wide.s32 %rd1328, %r4487, 4;
add.s64 %rd1329, %rd1, %rd1328;
ld.local.u32 %r8152, [%rd1329];
ld.local.u32 %r8153, [%rd1329+-4];
and.b32 %r928, %r920, 31;
setp.eq.s32 %p624, %r928, 0;
@%p624 bra $L__BB0_719;
mov.u32 %r4488, 32;
sub.s32 %r4489, %r4488, %r928;
shr.u32 %r4490, %r8153, %r4489;
shl.b32 %r4491, %r8152, %r928;
add.s32 %r8152, %r4490, %r4491;
mul.wide.s32 %rd1330, %r925, 4;
add.s64 %rd1331, %rd1, %rd1330;
ld.local.u32 %r4492, [%rd1331];
shr.u32 %r4493, %r4492, %r4489;
shl.b32 %r4494, %r8153, %r928;
add.s32 %r8153, %r4493, %r4494;
$L__BB0_719:
and.b32 %r4495, %r919, -2147483648;
shr.u32 %r4496, %r8153, 30;
shl.b32 %r4497, %r8152, 2;
or.b32 %r4498, %r4496, %r4497;
shr.u32 %r4499, %r4498, 31;
shr.u32 %r4500, %r8152, 30;
add.s32 %r4501, %r4499, %r4500;
neg.s32 %r4502, %r4501;
setp.eq.s32 %p625, %r4495, 0;
selp.b32 %r8154, %r4501, %r4502, %p625;
setp.ne.s32 %p626, %r4499, 0;
xor.b32 %r4503, %r4495, -2147483648;
selp.b32 %r4504, %r4503, %r4495, %p626;
selp.b32 %r4505, -1, 0, %p626;
xor.b32 %r4506, %r4498, %r4505;
shl.b32 %r4507, %r8153, 2;
xor.b32 %r4508, %r4507, %r4505;
cvt.u64.u32 %rd1332, %r4506;
cvt.u64.u32 %rd1333, %r4508;
bfi.b64 %rd1334, %rd1332, %rd1333, 32, 32;
cvt.rn.f64.s64 %fd93, %rd1334;
mul.f64 %fd94, %fd93, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3319, %fd94;
setp.eq.s32 %p627, %r4504, 0;
neg.f32 %f3320, %f3319;
selp.f32 %f5536, %f3319, %f3320, %p627;
$L__BB0_721:
and.b32 %r935, %r8154, 1;
setp.eq.s32 %p628, %r935, 0;
selp.f32 %f831, %f5536, 0f3F800000, %p628;
mul.rn.f32 %f832, %f5536, %f5536;
mov.f32 %f5537, 0fB94D4153;
@%p628 bra $L__BB0_723;
mov.f32 %f3323, 0fBAB607ED;
mov.f32 %f3324, 0f37CBAC00;
fma.rn.f32 %f5537, %f3324, %f832, %f3323;
$L__BB0_723:
selp.f32 %f3325, 0f3C0885E4, 0f3D2AAABB, %p628;
fma.rn.f32 %f3326, %f5537, %f832, %f3325;
selp.f32 %f3327, 0fBE2AAAA8, 0fBEFFFFFF, %p628;
fma.rn.f32 %f3328, %f3326, %f832, %f3327;
mov.f32 %f3329, 0f00000000;
fma.rn.f32 %f3330, %f832, %f831, %f3329;
fma.rn.f32 %f5281, %f3328, %f3330, %f831;
and.b32 %r4510, %r8154, 2;
setp.eq.s32 %p630, %r4510, 0;
@%p630 bra $L__BB0_725;
mov.f32 %f3332, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3332, %f3329;
$L__BB0_725:
setp.lt.s32 %p15, %r11, %r917;
@%p620 bra $L__BB0_738;
mul.f32 %f3333, %f5401, 0f3F22F983;
cvt.rni.s32.f32 %r8158, %f3333;
cvt.rn.f32.s32 %f3334, %r8158;
mov.f32 %f3335, 0fBFC90FDA;
fma.rn.f32 %f3336, %f3334, %f3335, %f5401;
mov.f32 %f3337, 0fB3A22168;
fma.rn.f32 %f3338, %f3334, %f3337, %f3336;
mov.f32 %f3339, 0fA7C234C5;
fma.rn.f32 %f5540, %f3334, %f3339, %f3338;
abs.f32 %f840, %f5401;
setp.ltu.f32 %p632, %f840, 0f47CE4780;
@%p632 bra $L__BB0_734;
setp.eq.f32 %p633, %f840, 0f7F800000;
@%p633 bra $L__BB0_733;
bra.uni $L__BB0_728;
$L__BB0_733:
mov.f32 %f3342, 0f00000000;
mul.rn.f32 %f5540, %f5401, %f3342;
mov.u32 %r8158, 0;
bra.uni $L__BB0_734;
$L__BB0_728:
mov.b32 %r937, %f5401;
shr.u32 %r4512, %r937, 23;
and.b32 %r4513, %r4512, 255;
add.s32 %r938, %r4513, -128;
shl.b32 %r4514, %r937, 8;
or.b32 %r939, %r4514, -2147483648;
shr.u32 %r940, %r938, 5;
mov.u64 %rd2599, 0;
mov.u32 %r8155, 0;
mov.u64 %rd1338, __cudart_i2opi_f;
mov.u64 %rd2600, %rd2599;
$L__BB0_729:
.pragma "nounroll";
shl.b64 %rd1337, %rd2599, 2;
add.s64 %rd1339, %rd1338, %rd1337;
ld.global.nc.u32 %r4515, [%rd1339];
mad.wide.u32 %rd1340, %r4515, %r939, %rd2600;
shr.u64 %rd2600, %rd1340, 32;
add.s64 %rd1341, %rd1, %rd1337;
st.local.u32 [%rd1341], %rd1340;
add.s32 %r8155, %r8155, 1;
cvt.s64.s32 %rd2599, %r8155;
setp.ne.s32 %p634, %r8155, 6;
@%p634 bra $L__BB0_729;
st.local.u32 [%rd4], %rd2600;
mov.u32 %r4516, 4;
sub.s32 %r943, %r4516, %r940;
mov.u32 %r4517, 6;
sub.s32 %r4518, %r4517, %r940;
mul.wide.s32 %rd1342, %r4518, 4;
add.s64 %rd1343, %rd1, %rd1342;
ld.local.u32 %r8156, [%rd1343];
ld.local.u32 %r8157, [%rd1343+-4];
and.b32 %r946, %r938, 31;
setp.eq.s32 %p635, %r946, 0;
@%p635 bra $L__BB0_732;
mov.u32 %r4519, 32;
sub.s32 %r4520, %r4519, %r946;
shr.u32 %r4521, %r8157, %r4520;
shl.b32 %r4522, %r8156, %r946;
add.s32 %r8156, %r4521, %r4522;
mul.wide.s32 %rd1344, %r943, 4;
add.s64 %rd1345, %rd1, %rd1344;
ld.local.u32 %r4523, [%rd1345];
shr.u32 %r4524, %r4523, %r4520;
shl.b32 %r4525, %r8157, %r946;
add.s32 %r8157, %r4524, %r4525;
$L__BB0_732:
and.b32 %r4526, %r937, -2147483648;
shr.u32 %r4527, %r8157, 30;
shl.b32 %r4528, %r8156, 2;
or.b32 %r4529, %r4527, %r4528;
shr.u32 %r4530, %r4529, 31;
shr.u32 %r4531, %r8156, 30;
add.s32 %r4532, %r4530, %r4531;
neg.s32 %r4533, %r4532;
setp.eq.s32 %p636, %r4526, 0;
selp.b32 %r8158, %r4532, %r4533, %p636;
setp.ne.s32 %p637, %r4530, 0;
xor.b32 %r4534, %r4526, -2147483648;
selp.b32 %r4535, %r4534, %r4526, %p637;
selp.b32 %r4536, -1, 0, %p637;
xor.b32 %r4537, %r4529, %r4536;
shl.b32 %r4538, %r8157, 2;
xor.b32 %r4539, %r4538, %r4536;
cvt.u64.u32 %rd1346, %r4537;
cvt.u64.u32 %rd1347, %r4539;
bfi.b64 %rd1348, %rd1346, %rd1347, 32, 32;
cvt.rn.f64.s64 %fd95, %rd1348;
mul.f64 %fd96, %fd95, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3340, %fd96;
setp.eq.s32 %p638, %r4535, 0;
neg.f32 %f3341, %f3340;
selp.f32 %f5540, %f3340, %f3341, %p638;
$L__BB0_734:
add.s32 %r953, %r8158, 1;
and.b32 %r954, %r953, 1;
setp.eq.s32 %p639, %r954, 0;
selp.f32 %f844, %f5540, 0f3F800000, %p639;
mul.rn.f32 %f845, %f5540, %f5540;
mov.f32 %f5541, 0fB94D4153;
@%p639 bra $L__BB0_736;
mov.f32 %f3344, 0fBAB607ED;
mov.f32 %f3345, 0f37CBAC00;
fma.rn.f32 %f5541, %f3345, %f845, %f3344;
$L__BB0_736:
selp.f32 %f3346, 0f3C0885E4, 0f3D2AAABB, %p639;
fma.rn.f32 %f3347, %f5541, %f845, %f3346;
selp.f32 %f3348, 0fBE2AAAA8, 0fBEFFFFFF, %p639;
fma.rn.f32 %f3349, %f3347, %f845, %f3348;
mov.f32 %f3350, 0f00000000;
fma.rn.f32 %f3351, %f845, %f844, %f3350;
fma.rn.f32 %f5283, %f3349, %f3351, %f844;
and.b32 %r4541, %r953, 2;
setp.eq.s32 %p641, %r4541, 0;
@%p641 bra $L__BB0_738;
mov.f32 %f3353, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3353, %f3350;
$L__BB0_738:
selp.f32 %f852, %f5283, %f5284, %p15;
selp.f32 %f853, %f5281, %f5282, %p15;
@%p620 bra $L__BB0_933;
add.f32 %f5592, %f853, %f852;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_933:
setp.lt.s32 %p804, %r12, 1;
and.pred %p806, %p33, %p804;
@%p806 bra $L__BB0_1206;
bra.uni $L__BB0_934;
$L__BB0_1206:
mov.u32 %r7786, %ctaid.x;
shl.b32 %r5805, %r12, 5;
add.s32 %r5806, %r5805, %r1;
mul.hi.s32 %r5807, %r5806, -1840700269;
add.s32 %r5808, %r5807, %r5806;
shr.u32 %r5809, %r5808, 31;
shr.s32 %r5810, %r5808, 2;
add.s32 %r5811, %r5810, %r5809;
mul.lo.s32 %r5812, %r5811, %r2589;
add.s32 %r5813, %r2586, %r14;
mad.lo.s32 %r5814, %r2587, 3, %r5813;
add.s32 %r5815, %r5814, %r5812;
mul.lo.s32 %r5816, %r5811, 7;
sub.s32 %r5817, %r5806, %r5816;
mul.lo.s32 %r5818, %r5817, %r2590;
add.s32 %r5819, %r5815, %r5818;
mul.wide.s32 %rd1858, %r5819, 4;
add.s64 %rd1859, %rd3, %rd1858;
ld.global.f32 %f1406, [%rd1859];
add.s32 %r5820, %r5806, 32;
mul.hi.s32 %r5821, %r5820, -1840700269;
add.s32 %r5822, %r5821, %r5820;
shr.u32 %r5823, %r5822, 31;
shr.s32 %r5824, %r5822, 2;
add.s32 %r5825, %r5824, %r5823;
mul.lo.s32 %r5826, %r5825, %r2589;
add.s32 %r5827, %r5814, %r5826;
mul.lo.s32 %r5828, %r5825, 7;
sub.s32 %r5829, %r5820, %r5828;
mul.lo.s32 %r5830, %r5829, %r2590;
add.s32 %r5831, %r5827, %r5830;
mul.wide.s32 %rd1860, %r5831, 4;
add.s64 %rd1861, %rd3, %rd1860;
ld.global.f32 %f1407, [%rd1861];
add.s32 %r5832, %r5814, %r2587;
add.s32 %r5833, %r5832, %r5812;
add.s32 %r5834, %r5833, %r5818;
mul.wide.s32 %rd1862, %r5834, 4;
add.s64 %rd1863, %rd3, %rd1862;
ld.global.f32 %f1408, [%rd1863];
add.s32 %r5835, %r5832, %r5826;
add.s32 %r5836, %r5835, %r5830;
mul.wide.s32 %rd1864, %r5836, 4;
add.s64 %rd1865, %rd3, %rd1864;
ld.global.f32 %f1409, [%rd1865];
add.s32 %r5837, %r5813, %r2586;
add.s32 %r5838, %r5837, %r5812;
add.s32 %r5839, %r5838, %r5818;
mul.wide.s32 %rd1866, %r5839, 4;
add.s64 %rd1867, %rd3, %rd1866;
ld.global.f32 %f1410, [%rd1867];
add.s32 %r5840, %r5837, %r5826;
add.s32 %r5841, %r5840, %r5830;
mul.wide.s32 %rd1868, %r5841, 4;
add.s64 %rd1869, %rd3, %rd1868;
ld.global.f32 %f1411, [%rd1869];
add.s32 %r5842, %r5837, %r2587;
add.s32 %r5843, %r5842, %r5812;
add.s32 %r5844, %r5843, %r5818;
mul.wide.s32 %rd1870, %r5844, 4;
add.s64 %rd1871, %rd3, %rd1870;
ld.global.f32 %f1412, [%rd1871];
add.s32 %r5845, %r5842, %r5826;
add.s32 %r5846, %r5845, %r5830;
mul.wide.s32 %rd1872, %r5846, 4;
add.s64 %rd1873, %rd3, %rd1872;
ld.global.f32 %f1413, [%rd1873];
mul.hi.s32 %r5848, %r5806, 954437177;
shr.u32 %r5849, %r5848, 31;
shr.s32 %r5850, %r5848, 1;
add.s32 %r5851, %r5850, %r5849;
mul.lo.s32 %r5852, %r5851, %r2579;
shl.b32 %r5853, %r2576, 1;
mad.lo.s32 %r5854, %r2578, %r7786, %r5853;
shl.b32 %r5855, %r2577, 1;
add.s32 %r5856, %r5854, %r5855;
add.s32 %r5857, %r5856, %r5852;
mul.lo.s32 %r5858, %r5851, 9;
sub.s32 %r5859, %r5806, %r5858;
mul.lo.s32 %r5860, %r5859, %r2580;
add.s32 %r5861, %r5857, %r5860;
mul.wide.s32 %rd1874, %r5861, 4;
add.s64 %rd1875, %rd2, %rd1874;
ld.global.f32 %f1414, [%rd1875];
mul.hi.s32 %r5862, %r5820, 954437177;
shr.u32 %r5863, %r5862, 31;
shr.s32 %r5864, %r5862, 1;
add.s32 %r5865, %r5864, %r5863;
mul.lo.s32 %r5866, %r5865, %r2579;
add.s32 %r5867, %r5856, %r5866;
mul.lo.s32 %r5868, %r5865, 9;
sub.s32 %r5869, %r5820, %r5868;
mul.lo.s32 %r5870, %r5869, %r2580;
add.s32 %r5871, %r5867, %r5870;
mul.wide.s32 %rd1876, %r5871, 4;
add.s64 %rd1877, %rd2, %rd1876;
ld.global.f32 %f1415, [%rd1877];
add.s32 %r5872, %r5854, %r2576;
add.s32 %r5873, %r5872, %r5852;
add.s32 %r5874, %r5873, %r5860;
mul.wide.s32 %rd1878, %r5874, 4;
add.s64 %rd1879, %rd2, %rd1878;
ld.global.f32 %f1416, [%rd1879];
add.s32 %r5875, %r5872, %r5866;
add.s32 %r5876, %r5875, %r5870;
mul.wide.s32 %rd1880, %r5876, 4;
add.s64 %rd1881, %rd2, %rd1880;
ld.global.f32 %f1417, [%rd1881];
mul.wide.s32 %rd1882, %r2577, 4;
add.s64 %rd1883, %rd1879, %rd1882;
ld.global.f32 %f1418, [%rd1883];
add.s64 %rd1884, %rd1881, %rd1882;
ld.global.f32 %f1419, [%rd1884];
add.s64 %rd1885, %rd1883, %rd1882;
ld.global.f32 %f1420, [%rd1885];
add.s64 %rd1886, %rd1884, %rd1882;
ld.global.f32 %f1421, [%rd1886];
mul.f32 %f4043, %f1414, 0f3F22F983;
cvt.rni.s32.f32 %r8290, %f4043;
cvt.rn.f32.s32 %f4044, %r8290;
mov.f32 %f4045, 0fBFC90FDA;
fma.rn.f32 %f4046, %f4044, %f4045, %f1414;
mov.f32 %f4047, 0fB3A22168;
fma.rn.f32 %f4048, %f4044, %f4047, %f4046;
mov.f32 %f4049, 0fA7C234C5;
fma.rn.f32 %f5743, %f4044, %f4049, %f4048;
abs.f32 %f1423, %f1414;
setp.ltu.f32 %p1031, %f1423, 0f47CE4780;
@%p1031 bra $L__BB0_1214;
setp.eq.f32 %p1032, %f1423, 0f7F800000;
@%p1032 bra $L__BB0_1213;
bra.uni $L__BB0_1208;
$L__BB0_1213:
mov.f32 %f4052, 0f00000000;
mul.rn.f32 %f5743, %f1414, %f4052;
mov.u32 %r8290, 0;
bra.uni $L__BB0_1214;
$L__BB0_934:
add.s32 %r1251, %r12, 8;
setp.gt.s32 %p807, %r1251, 14;
add.s32 %r5108, %r2586, %r14;
mad.lo.s32 %r1252, %r2587, 3, %r5108;
@%p807 bra $L__BB0_939;
shl.b32 %r1253, %r12, 5;
neg.s32 %r5109, %r1253;
setp.ge.s32 %p808, %r11, %r5109;
@%p808 bra $L__BB0_937;
add.s32 %r5110, %r1253, %r1;
mul.hi.s32 %r5111, %r5110, -1840700269;
add.s32 %r5112, %r5111, %r5110;
shr.u32 %r5113, %r5112, 31;
shr.s32 %r5114, %r5112, 2;
add.s32 %r5115, %r5114, %r5113;
mad.lo.s32 %r5116, %r5115, %r2589, %r1252;
mul.lo.s32 %r5117, %r5115, 7;
sub.s32 %r5118, %r5110, %r5117;
mad.lo.s32 %r5119, %r5118, %r2590, %r5116;
mul.wide.s32 %rd1602, %r5119, 4;
add.s64 %rd1603, %rd3, %rd1602;
ld.global.f32 %f5607, [%rd1603];
$L__BB0_937:
mov.u32 %r5120, -32;
sub.s32 %r5121, %r5120, %r1253;
setp.ge.s32 %p809, %r11, %r5121;
@%p809 bra $L__BB0_939;
add.s32 %r5122, %r1253, %r1;
add.s32 %r5123, %r5122, 32;
mul.hi.s32 %r5124, %r5123, -1840700269;
add.s32 %r5125, %r5124, %r5123;
shr.u32 %r5126, %r5125, 31;
shr.s32 %r5127, %r5125, 2;
add.s32 %r5128, %r5127, %r5126;
mad.lo.s32 %r5129, %r5128, %r2589, %r1252;
mul.lo.s32 %r5130, %r5128, 7;
sub.s32 %r5131, %r5123, %r5130;
mad.lo.s32 %r5132, %r5131, %r2590, %r5129;
mul.wide.s32 %rd1604, %r5132, 4;
add.s64 %rd1605, %rd3, %rd1604;
ld.global.f32 %f5606, [%rd1605];
$L__BB0_939:
add.s32 %r1254, %r12, 9;
setp.gt.s32 %p810, %r1254, 14;
add.s32 %r1255, %r1252, %r2587;
@%p810 bra $L__BB0_944;
shl.b32 %r1256, %r12, 5;
neg.s32 %r5133, %r1256;
setp.ge.s32 %p811, %r11, %r5133;
@%p811 bra $L__BB0_942;
add.s32 %r5134, %r1256, %r1;
mul.hi.s32 %r5135, %r5134, -1840700269;
add.s32 %r5136, %r5135, %r5134;
shr.u32 %r5137, %r5136, 31;
shr.s32 %r5138, %r5136, 2;
add.s32 %r5139, %r5138, %r5137;
mad.lo.s32 %r5140, %r5139, %r2589, %r1255;
mul.lo.s32 %r5141, %r5139, 7;
sub.s32 %r5142, %r5134, %r5141;
mad.lo.s32 %r5143, %r5142, %r2590, %r5140;
mul.wide.s32 %rd1606, %r5143, 4;
add.s64 %rd1607, %rd3, %rd1606;
ld.global.f32 %f5406, [%rd1607];
$L__BB0_942:
mov.u32 %r5144, -32;
sub.s32 %r5145, %r5144, %r1256;
setp.ge.s32 %p812, %r11, %r5145;
@%p812 bra $L__BB0_944;
add.s32 %r5146, %r1256, %r1;
add.s32 %r5147, %r5146, 32;
mul.hi.s32 %r5148, %r5147, -1840700269;
add.s32 %r5149, %r5148, %r5147;
shr.u32 %r5150, %r5149, 31;
shr.s32 %r5151, %r5149, 2;
add.s32 %r5152, %r5151, %r5150;
mad.lo.s32 %r5153, %r5152, %r2589, %r1255;
mul.lo.s32 %r5154, %r5152, 7;
sub.s32 %r5155, %r5147, %r5154;
mad.lo.s32 %r5156, %r5155, %r2590, %r5153;
mul.wide.s32 %rd1608, %r5156, 4;
add.s64 %rd1609, %rd3, %rd1608;
ld.global.f32 %f5405, [%rd1609];
$L__BB0_944:
add.s32 %r1257, %r12, 10;
setp.gt.s32 %p813, %r1257, 14;
shl.b32 %r5157, %r2586, 1;
add.s32 %r1258, %r5157, %r14;
@%p813 bra $L__BB0_949;
shl.b32 %r1259, %r12, 5;
neg.s32 %r5158, %r1259;
setp.ge.s32 %p814, %r11, %r5158;
@%p814 bra $L__BB0_947;
add.s32 %r5159, %r1259, %r1;
mul.hi.s32 %r5160, %r5159, -1840700269;
add.s32 %r5161, %r5160, %r5159;
shr.u32 %r5162, %r5161, 31;
shr.s32 %r5163, %r5161, 2;
add.s32 %r5164, %r5163, %r5162;
mad.lo.s32 %r5165, %r5164, %r2589, %r1258;
mul.lo.s32 %r5166, %r5164, 7;
sub.s32 %r5167, %r5159, %r5166;
mad.lo.s32 %r5168, %r5167, %r2590, %r5165;
mul.wide.s32 %rd1610, %r5168, 4;
add.s64 %rd1611, %rd3, %rd1610;
ld.global.f32 %f5404, [%rd1611];
$L__BB0_947:
mov.u32 %r5169, -32;
sub.s32 %r5170, %r5169, %r1259;
setp.ge.s32 %p815, %r11, %r5170;
@%p815 bra $L__BB0_949;
add.s32 %r5171, %r1259, %r1;
add.s32 %r5172, %r5171, 32;
mul.hi.s32 %r5173, %r5172, -1840700269;
add.s32 %r5174, %r5173, %r5172;
shr.u32 %r5175, %r5174, 31;
shr.s32 %r5176, %r5174, 2;
add.s32 %r5177, %r5176, %r5175;
mad.lo.s32 %r5178, %r5177, %r2589, %r1258;
mul.lo.s32 %r5179, %r5177, 7;
sub.s32 %r5180, %r5172, %r5179;
mad.lo.s32 %r5181, %r5180, %r2590, %r5178;
mul.wide.s32 %rd1612, %r5181, 4;
add.s64 %rd1613, %rd3, %rd1612;
ld.global.f32 %f5403, [%rd1613];
$L__BB0_949:
add.s32 %r1260, %r12, 11;
setp.gt.s32 %p816, %r1260, 14;
add.s32 %r1261, %r1258, %r2587;
@%p816 bra $L__BB0_954;
shl.b32 %r1262, %r12, 5;
neg.s32 %r5182, %r1262;
setp.ge.s32 %p817, %r11, %r5182;
@%p817 bra $L__BB0_952;
add.s32 %r5183, %r1262, %r1;
mul.hi.s32 %r5184, %r5183, -1840700269;
add.s32 %r5185, %r5184, %r5183;
shr.u32 %r5186, %r5185, 31;
shr.s32 %r5187, %r5185, 2;
add.s32 %r5188, %r5187, %r5186;
mad.lo.s32 %r5189, %r5188, %r2589, %r1261;
mul.lo.s32 %r5190, %r5188, 7;
sub.s32 %r5191, %r5183, %r5190;
mad.lo.s32 %r5192, %r5191, %r2590, %r5189;
mul.wide.s32 %rd1614, %r5192, 4;
add.s64 %rd1615, %rd3, %rd1614;
ld.global.f32 %f5402, [%rd1615];
$L__BB0_952:
mov.u32 %r5193, -32;
sub.s32 %r5194, %r5193, %r1262;
setp.ge.s32 %p818, %r11, %r5194;
@%p818 bra $L__BB0_954;
add.s32 %r5195, %r1262, %r1;
add.s32 %r5196, %r5195, 32;
mul.hi.s32 %r5197, %r5196, -1840700269;
add.s32 %r5198, %r5197, %r5196;
shr.u32 %r5199, %r5198, 31;
shr.s32 %r5200, %r5198, 2;
add.s32 %r5201, %r5200, %r5199;
mad.lo.s32 %r5202, %r5201, %r2589, %r1261;
mul.lo.s32 %r5203, %r5201, 7;
sub.s32 %r5204, %r5196, %r5203;
mad.lo.s32 %r5205, %r5204, %r2590, %r5202;
mul.wide.s32 %rd1616, %r5205, 4;
add.s64 %rd1617, %rd3, %rd1616;
ld.global.f32 %f5401, [%rd1617];
$L__BB0_954:
mov.u32 %r7782, %ctaid.x;
shl.b32 %r5207, %r2576, 1;
mad.lo.s32 %r1263, %r2578, %r7782, %r5207;
shl.b32 %r5208, %r2577, 1;
add.s32 %r1264, %r1263, %r5208;
@%p807 bra $L__BB0_959;
shl.b32 %r1265, %r12, 5;
neg.s32 %r5209, %r1265;
setp.ge.s32 %p820, %r11, %r5209;
@%p820 bra $L__BB0_957;
add.s32 %r5210, %r1265, %r1;
mul.hi.s32 %r5211, %r5210, 954437177;
shr.u32 %r5212, %r5211, 31;
shr.s32 %r5213, %r5211, 1;
add.s32 %r5214, %r5213, %r5212;
mad.lo.s32 %r5215, %r5214, %r2579, %r1264;
mul.lo.s32 %r5216, %r5214, 9;
sub.s32 %r5217, %r5210, %r5216;
mad.lo.s32 %r5218, %r5217, %r2580, %r5215;
mul.wide.s32 %rd1618, %r5218, 4;
add.s64 %rd1619, %rd2, %rd1618;
ld.global.f32 %f5416, [%rd1619];
$L__BB0_957:
mov.u32 %r5219, -32;
sub.s32 %r5220, %r5219, %r1265;
setp.ge.s32 %p821, %r11, %r5220;
@%p821 bra $L__BB0_959;
add.s32 %r5221, %r1265, %r1;
add.s32 %r5222, %r5221, 32;
mul.hi.s32 %r5223, %r5222, 954437177;
shr.u32 %r5224, %r5223, 31;
shr.s32 %r5225, %r5223, 1;
add.s32 %r5226, %r5225, %r5224;
mad.lo.s32 %r5227, %r5226, %r2579, %r1264;
mul.lo.s32 %r5228, %r5226, 9;
sub.s32 %r5229, %r5222, %r5228;
mad.lo.s32 %r5230, %r5229, %r2580, %r5227;
mul.wide.s32 %rd1620, %r5230, 4;
add.s64 %rd1621, %rd2, %rd1620;
ld.global.f32 %f5415, [%rd1621];
$L__BB0_959:
add.s32 %r1266, %r1263, %r2576;
@%p810 bra $L__BB0_964;
shl.b32 %r1267, %r12, 5;
neg.s32 %r5231, %r1267;
setp.ge.s32 %p823, %r11, %r5231;
@%p823 bra $L__BB0_962;
add.s32 %r5232, %r1267, %r1;
mul.hi.s32 %r5233, %r5232, 954437177;
shr.u32 %r5234, %r5233, 31;
shr.s32 %r5235, %r5233, 1;
add.s32 %r5236, %r5235, %r5234;
mad.lo.s32 %r5237, %r5236, %r2579, %r1266;
mul.lo.s32 %r5238, %r5236, 9;
sub.s32 %r5239, %r5232, %r5238;
mad.lo.s32 %r5240, %r5239, %r2580, %r5237;
mul.wide.s32 %rd1622, %r5240, 4;
add.s64 %rd1623, %rd2, %rd1622;
ld.global.f32 %f5414, [%rd1623];
$L__BB0_962:
mov.u32 %r5241, -32;
sub.s32 %r5242, %r5241, %r1267;
setp.ge.s32 %p824, %r11, %r5242;
@%p824 bra $L__BB0_964;
add.s32 %r5243, %r1267, %r1;
add.s32 %r5244, %r5243, 32;
mul.hi.s32 %r5245, %r5244, 954437177;
shr.u32 %r5246, %r5245, 31;
shr.s32 %r5247, %r5245, 1;
add.s32 %r5248, %r5247, %r5246;
mad.lo.s32 %r5249, %r5248, %r2579, %r1266;
mul.lo.s32 %r5250, %r5248, 9;
sub.s32 %r5251, %r5244, %r5250;
mad.lo.s32 %r5252, %r5251, %r2580, %r5249;
mul.wide.s32 %rd1624, %r5252, 4;
add.s64 %rd1625, %rd2, %rd1624;
ld.global.f32 %f5413, [%rd1625];
$L__BB0_964:
add.s32 %r1268, %r1266, %r2577;
@%p813 bra $L__BB0_969;
shl.b32 %r1269, %r12, 5;
neg.s32 %r5253, %r1269;
setp.ge.s32 %p826, %r11, %r5253;
@%p826 bra $L__BB0_967;
add.s32 %r5254, %r1269, %r1;
mul.hi.s32 %r5255, %r5254, 954437177;
shr.u32 %r5256, %r5255, 31;
shr.s32 %r5257, %r5255, 1;
add.s32 %r5258, %r5257, %r5256;
mad.lo.s32 %r5259, %r5258, %r2579, %r1268;
mul.lo.s32 %r5260, %r5258, 9;
sub.s32 %r5261, %r5254, %r5260;
mad.lo.s32 %r5262, %r5261, %r2580, %r5259;
mul.wide.s32 %rd1626, %r5262, 4;
add.s64 %rd1627, %rd2, %rd1626;
ld.global.f32 %f5412, [%rd1627];
$L__BB0_967:
mov.u32 %r5263, -32;
sub.s32 %r5264, %r5263, %r1269;
setp.ge.s32 %p827, %r11, %r5264;
@%p827 bra $L__BB0_969;
add.s32 %r5265, %r1269, %r1;
add.s32 %r5266, %r5265, 32;
mul.hi.s32 %r5267, %r5266, 954437177;
shr.u32 %r5268, %r5267, 31;
shr.s32 %r5269, %r5267, 1;
add.s32 %r5270, %r5269, %r5268;
mad.lo.s32 %r5271, %r5270, %r2579, %r1268;
mul.lo.s32 %r5272, %r5270, 9;
sub.s32 %r5273, %r5266, %r5272;
mad.lo.s32 %r5274, %r5273, %r2580, %r5271;
mul.wide.s32 %rd1628, %r5274, 4;
add.s64 %rd1629, %rd2, %rd1628;
ld.global.f32 %f5411, [%rd1629];
$L__BB0_969:
add.s32 %r1270, %r1268, %r2577;
@%p816 bra $L__BB0_974;
shl.b32 %r1271, %r12, 5;
neg.s32 %r5275, %r1271;
setp.ge.s32 %p829, %r11, %r5275;
@%p829 bra $L__BB0_972;
add.s32 %r5276, %r1271, %r1;
mul.hi.s32 %r5277, %r5276, 954437177;
shr.u32 %r5278, %r5277, 31;
shr.s32 %r5279, %r5277, 1;
add.s32 %r5280, %r5279, %r5278;
mad.lo.s32 %r5281, %r5280, %r2579, %r1270;
mul.lo.s32 %r5282, %r5280, 9;
sub.s32 %r5283, %r5276, %r5282;
mad.lo.s32 %r5284, %r5283, %r2580, %r5281;
mul.wide.s32 %rd1630, %r5284, 4;
add.s64 %rd1631, %rd2, %rd1630;
ld.global.f32 %f5410, [%rd1631];
$L__BB0_972:
mov.u32 %r5285, -32;
sub.s32 %r5286, %r5285, %r1271;
setp.ge.s32 %p830, %r11, %r5286;
@%p830 bra $L__BB0_974;
add.s32 %r5287, %r1271, %r1;
add.s32 %r5288, %r5287, 32;
mul.hi.s32 %r5289, %r5288, 954437177;
shr.u32 %r5290, %r5289, 31;
shr.s32 %r5291, %r5289, 1;
add.s32 %r5292, %r5291, %r5290;
mad.lo.s32 %r5293, %r5292, %r2579, %r1270;
mul.lo.s32 %r5294, %r5292, 9;
sub.s32 %r5295, %r5288, %r5294;
mad.lo.s32 %r5296, %r5295, %r2580, %r5293;
mul.wide.s32 %rd1632, %r5296, 4;
add.s64 %rd1633, %rd2, %rd1632;
ld.global.f32 %f5409, [%rd1633];
$L__BB0_974:
@%p807 bra $L__BB0_1003;
shl.b32 %r5297, %r12, 5;
neg.s32 %r1272, %r5297;
setp.ge.s32 %p832, %r11, %r1272;
@%p832 bra $L__BB0_988;
mul.f32 %f3692, %f5416, 0f3F22F983;
cvt.rni.s32.f32 %r8226, %f3692;
cvt.rn.f32.s32 %f3693, %r8226;
mov.f32 %f3694, 0fBFC90FDA;
fma.rn.f32 %f3695, %f3693, %f3694, %f5416;
mov.f32 %f3696, 0fB3A22168;
fma.rn.f32 %f3697, %f3693, %f3696, %f3695;
mov.f32 %f3698, 0fA7C234C5;
fma.rn.f32 %f5644, %f3693, %f3698, %f3697;
abs.f32 %f1140, %f5416;
setp.ltu.f32 %p833, %f1140, 0f47CE4780;
@%p833 bra $L__BB0_984;
setp.eq.f32 %p834, %f1140, 0f7F800000;
@%p834 bra $L__BB0_983;
bra.uni $L__BB0_978;
$L__BB0_983:
mov.f32 %f3701, 0f00000000;
mul.rn.f32 %f5644, %f5416, %f3701;
mov.u32 %r8226, 0;
bra.uni $L__BB0_984;
$L__BB0_1208:
mov.b32 %r1577, %f1414;
shr.u32 %r5878, %r1577, 23;
and.b32 %r5879, %r5878, 255;
add.s32 %r1578, %r5879, -128;
shl.b32 %r5880, %r1577, 8;
or.b32 %r1579, %r5880, -2147483648;
shr.u32 %r1580, %r1578, 5;
mov.u64 %rd2665, 0;
mov.u32 %r8287, 0;
mov.u64 %rd1890, __cudart_i2opi_f;
mov.u64 %rd2666, %rd2665;
$L__BB0_1209:
.pragma "nounroll";
shl.b64 %rd1889, %rd2665, 2;
add.s64 %rd1891, %rd1890, %rd1889;
ld.global.nc.u32 %r5881, [%rd1891];
mad.wide.u32 %rd1892, %r5881, %r1579, %rd2666;
shr.u64 %rd2666, %rd1892, 32;
add.s64 %rd1893, %rd1, %rd1889;
st.local.u32 [%rd1893], %rd1892;
add.s32 %r8287, %r8287, 1;
cvt.s64.s32 %rd2665, %r8287;
setp.ne.s32 %p1033, %r8287, 6;
@%p1033 bra $L__BB0_1209;
st.local.u32 [%rd4], %rd2666;
mov.u32 %r5882, 4;
sub.s32 %r1583, %r5882, %r1580;
mov.u32 %r5883, 6;
sub.s32 %r5884, %r5883, %r1580;
mul.wide.s32 %rd1894, %r5884, 4;
add.s64 %rd1895, %rd1, %rd1894;
ld.local.u32 %r8288, [%rd1895];
ld.local.u32 %r8289, [%rd1895+-4];
and.b32 %r1586, %r1578, 31;
setp.eq.s32 %p1034, %r1586, 0;
@%p1034 bra $L__BB0_1212;
mov.u32 %r5885, 32;
sub.s32 %r5886, %r5885, %r1586;
shr.u32 %r5887, %r8289, %r5886;
shl.b32 %r5888, %r8288, %r1586;
add.s32 %r8288, %r5887, %r5888;
mul.wide.s32 %rd1896, %r1583, 4;
add.s64 %rd1897, %rd1, %rd1896;
ld.local.u32 %r5889, [%rd1897];
shr.u32 %r5890, %r5889, %r5886;
shl.b32 %r5891, %r8289, %r1586;
add.s32 %r8289, %r5890, %r5891;
$L__BB0_1212:
and.b32 %r5892, %r1577, -2147483648;
shr.u32 %r5893, %r8289, 30;
shl.b32 %r5894, %r8288, 2;
or.b32 %r5895, %r5893, %r5894;
shr.u32 %r5896, %r5895, 31;
shr.u32 %r5897, %r8288, 30;
add.s32 %r5898, %r5896, %r5897;
neg.s32 %r5899, %r5898;
setp.eq.s32 %p1035, %r5892, 0;
selp.b32 %r8290, %r5898, %r5899, %p1035;
setp.ne.s32 %p1036, %r5896, 0;
xor.b32 %r5900, %r5892, -2147483648;
selp.b32 %r5901, %r5900, %r5892, %p1036;
selp.b32 %r5902, -1, 0, %p1036;
xor.b32 %r5903, %r5895, %r5902;
shl.b32 %r5904, %r8289, 2;
xor.b32 %r5905, %r5904, %r5902;
cvt.u64.u32 %rd1898, %r5903;
cvt.u64.u32 %rd1899, %r5905;
bfi.b64 %rd1900, %rd1898, %rd1899, 32, 32;
cvt.rn.f64.s64 %fd161, %rd1900;
mul.f64 %fd162, %fd161, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4050, %fd162;
setp.eq.s32 %p1037, %r5901, 0;
neg.f32 %f4051, %f4050;
selp.f32 %f5743, %f4050, %f4051, %p1037;
$L__BB0_1214:
and.b32 %r1593, %r8290, 1;
setp.eq.s32 %p1038, %r1593, 0;
selp.f32 %f1427, %f5743, 0f3F800000, %p1038;
mul.rn.f32 %f1428, %f5743, %f5743;
mov.f32 %f5744, 0fB94D4153;
@%p1038 bra $L__BB0_1216;
mov.f32 %f4054, 0fBAB607ED;
mov.f32 %f4055, 0f37CBAC00;
fma.rn.f32 %f5744, %f4055, %f1428, %f4054;
$L__BB0_1216:
selp.f32 %f4056, 0f3C0885E4, 0f3D2AAABB, %p1038;
fma.rn.f32 %f4057, %f5744, %f1428, %f4056;
selp.f32 %f4058, 0fBE2AAAA8, 0fBEFFFFFF, %p1038;
fma.rn.f32 %f4059, %f4057, %f1428, %f4058;
mov.f32 %f4060, 0f00000000;
fma.rn.f32 %f4061, %f1428, %f1427, %f4060;
fma.rn.f32 %f5745, %f4059, %f4061, %f1427;
and.b32 %r5907, %r8290, 2;
setp.eq.s32 %p1040, %r5907, 0;
@%p1040 bra $L__BB0_1218;
mov.f32 %f4063, 0fBF800000;
fma.rn.f32 %f5745, %f5745, %f4063, %f4060;
$L__BB0_1218:
mul.f32 %f4064, %f1406, 0f3F22F983;
cvt.rni.s32.f32 %r8294, %f4064;
cvt.rn.f32.s32 %f4065, %r8294;
mov.f32 %f4066, 0fBFC90FDA;
fma.rn.f32 %f4067, %f4065, %f4066, %f1406;
mov.f32 %f4068, 0fB3A22168;
fma.rn.f32 %f4069, %f4065, %f4068, %f4067;
mov.f32 %f4070, 0fA7C234C5;
fma.rn.f32 %f5746, %f4065, %f4070, %f4069;
abs.f32 %f1435, %f1406;
setp.ltu.f32 %p1041, %f1435, 0f47CE4780;
@%p1041 bra $L__BB0_1226;
setp.eq.f32 %p1042, %f1435, 0f7F800000;
@%p1042 bra $L__BB0_1225;
bra.uni $L__BB0_1220;
$L__BB0_1225:
mov.f32 %f4073, 0f00000000;
mul.rn.f32 %f5746, %f1406, %f4073;
mov.u32 %r8294, 0;
bra.uni $L__BB0_1226;
$L__BB0_1220:
mov.b32 %r1595, %f1406;
shr.u32 %r5909, %r1595, 23;
and.b32 %r5910, %r5909, 255;
add.s32 %r1596, %r5910, -128;
shl.b32 %r5911, %r1595, 8;
or.b32 %r1597, %r5911, -2147483648;
shr.u32 %r1598, %r1596, 5;
mov.u64 %rd2667, 0;
mov.u32 %r8291, 0;
mov.u64 %rd1904, __cudart_i2opi_f;
mov.u64 %rd2668, %rd2667;
$L__BB0_1221:
.pragma "nounroll";
shl.b64 %rd1903, %rd2667, 2;
add.s64 %rd1905, %rd1904, %rd1903;
ld.global.nc.u32 %r5912, [%rd1905];
mad.wide.u32 %rd1906, %r5912, %r1597, %rd2668;
shr.u64 %rd2668, %rd1906, 32;
add.s64 %rd1907, %rd1, %rd1903;
st.local.u32 [%rd1907], %rd1906;
add.s32 %r8291, %r8291, 1;
cvt.s64.s32 %rd2667, %r8291;
setp.ne.s32 %p1043, %r8291, 6;
@%p1043 bra $L__BB0_1221;
st.local.u32 [%rd4], %rd2668;
mov.u32 %r5913, 4;
sub.s32 %r1601, %r5913, %r1598;
mov.u32 %r5914, 6;
sub.s32 %r5915, %r5914, %r1598;
mul.wide.s32 %rd1908, %r5915, 4;
add.s64 %rd1909, %rd1, %rd1908;
ld.local.u32 %r8292, [%rd1909];
ld.local.u32 %r8293, [%rd1909+-4];
and.b32 %r1604, %r1596, 31;
setp.eq.s32 %p1044, %r1604, 0;
@%p1044 bra $L__BB0_1224;
mov.u32 %r5916, 32;
sub.s32 %r5917, %r5916, %r1604;
shr.u32 %r5918, %r8293, %r5917;
shl.b32 %r5919, %r8292, %r1604;
add.s32 %r8292, %r5918, %r5919;
mul.wide.s32 %rd1910, %r1601, 4;
add.s64 %rd1911, %rd1, %rd1910;
ld.local.u32 %r5920, [%rd1911];
shr.u32 %r5921, %r5920, %r5917;
shl.b32 %r5922, %r8293, %r1604;
add.s32 %r8293, %r5921, %r5922;
$L__BB0_1224:
and.b32 %r5923, %r1595, -2147483648;
shr.u32 %r5924, %r8293, 30;
shl.b32 %r5925, %r8292, 2;
or.b32 %r5926, %r5924, %r5925;
shr.u32 %r5927, %r5926, 31;
shr.u32 %r5928, %r8292, 30;
add.s32 %r5929, %r5927, %r5928;
neg.s32 %r5930, %r5929;
setp.eq.s32 %p1045, %r5923, 0;
selp.b32 %r8294, %r5929, %r5930, %p1045;
setp.ne.s32 %p1046, %r5927, 0;
xor.b32 %r5931, %r5923, -2147483648;
selp.b32 %r5932, %r5931, %r5923, %p1046;
selp.b32 %r5933, -1, 0, %p1046;
xor.b32 %r5934, %r5926, %r5933;
shl.b32 %r5935, %r8293, 2;
xor.b32 %r5936, %r5935, %r5933;
cvt.u64.u32 %rd1912, %r5934;
cvt.u64.u32 %rd1913, %r5936;
bfi.b64 %rd1914, %rd1912, %rd1913, 32, 32;
cvt.rn.f64.s64 %fd163, %rd1914;
mul.f64 %fd164, %fd163, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4071, %fd164;
setp.eq.s32 %p1047, %r5932, 0;
neg.f32 %f4072, %f4071;
selp.f32 %f5746, %f4071, %f4072, %p1047;
$L__BB0_1226:
add.s32 %r1611, %r8294, 1;
and.b32 %r1612, %r1611, 1;
setp.eq.s32 %p1048, %r1612, 0;
selp.f32 %f1439, %f5746, 0f3F800000, %p1048;
mul.rn.f32 %f1440, %f5746, %f5746;
mov.f32 %f5747, 0fB94D4153;
@%p1048 bra $L__BB0_1228;
mov.f32 %f4075, 0fBAB607ED;
mov.f32 %f4076, 0f37CBAC00;
fma.rn.f32 %f5747, %f4076, %f1440, %f4075;
$L__BB0_1228:
selp.f32 %f4077, 0f3C0885E4, 0f3D2AAABB, %p1048;
fma.rn.f32 %f4078, %f5747, %f1440, %f4077;
selp.f32 %f4079, 0fBE2AAAA8, 0fBEFFFFFF, %p1048;
fma.rn.f32 %f4080, %f4078, %f1440, %f4079;
mov.f32 %f4081, 0f00000000;
fma.rn.f32 %f4082, %f1440, %f1439, %f4081;
fma.rn.f32 %f5748, %f4080, %f4082, %f1439;
and.b32 %r5938, %r1611, 2;
setp.eq.s32 %p1050, %r5938, 0;
@%p1050 bra $L__BB0_1230;
mov.f32 %f4084, 0fBF800000;
fma.rn.f32 %f5748, %f5748, %f4084, %f4081;
$L__BB0_1230:
add.f32 %f5798, %f5745, %f5748;
mul.f32 %f4085, %f1415, 0f3F22F983;
cvt.rni.s32.f32 %r8298, %f4085;
cvt.rn.f32.s32 %f4086, %r8298;
mov.f32 %f4087, 0fBFC90FDA;
fma.rn.f32 %f4088, %f4086, %f4087, %f1415;
mov.f32 %f4089, 0fB3A22168;
fma.rn.f32 %f4090, %f4086, %f4089, %f4088;
mov.f32 %f4091, 0fA7C234C5;
fma.rn.f32 %f5749, %f4086, %f4091, %f4090;
abs.f32 %f1448, %f1415;
setp.ltu.f32 %p1051, %f1448, 0f47CE4780;
@%p1051 bra $L__BB0_1238;
setp.eq.f32 %p1052, %f1448, 0f7F800000;
@%p1052 bra $L__BB0_1237;
bra.uni $L__BB0_1232;
$L__BB0_1237:
mov.f32 %f4094, 0f00000000;
mul.rn.f32 %f5749, %f1415, %f4094;
mov.u32 %r8298, 0;
bra.uni $L__BB0_1238;
$L__BB0_1232:
mov.b32 %r1614, %f1415;
shr.u32 %r5940, %r1614, 23;
and.b32 %r5941, %r5940, 255;
add.s32 %r1615, %r5941, -128;
shl.b32 %r5942, %r1614, 8;
or.b32 %r1616, %r5942, -2147483648;
shr.u32 %r1617, %r1615, 5;
mov.u64 %rd2669, 0;
mov.u32 %r8295, 0;
mov.u64 %rd1918, __cudart_i2opi_f;
mov.u64 %rd2670, %rd2669;
$L__BB0_1233:
.pragma "nounroll";
shl.b64 %rd1917, %rd2669, 2;
add.s64 %rd1919, %rd1918, %rd1917;
ld.global.nc.u32 %r5943, [%rd1919];
mad.wide.u32 %rd1920, %r5943, %r1616, %rd2670;
shr.u64 %rd2670, %rd1920, 32;
add.s64 %rd1921, %rd1, %rd1917;
st.local.u32 [%rd1921], %rd1920;
add.s32 %r8295, %r8295, 1;
cvt.s64.s32 %rd2669, %r8295;
setp.ne.s32 %p1053, %r8295, 6;
@%p1053 bra $L__BB0_1233;
st.local.u32 [%rd4], %rd2670;
mov.u32 %r5944, 4;
sub.s32 %r1620, %r5944, %r1617;
mov.u32 %r5945, 6;
sub.s32 %r5946, %r5945, %r1617;
mul.wide.s32 %rd1922, %r5946, 4;
add.s64 %rd1923, %rd1, %rd1922;
ld.local.u32 %r8296, [%rd1923];
ld.local.u32 %r8297, [%rd1923+-4];
and.b32 %r1623, %r1615, 31;
setp.eq.s32 %p1054, %r1623, 0;
@%p1054 bra $L__BB0_1236;
mov.u32 %r5947, 32;
sub.s32 %r5948, %r5947, %r1623;
shr.u32 %r5949, %r8297, %r5948;
shl.b32 %r5950, %r8296, %r1623;
add.s32 %r8296, %r5949, %r5950;
mul.wide.s32 %rd1924, %r1620, 4;
add.s64 %rd1925, %rd1, %rd1924;
ld.local.u32 %r5951, [%rd1925];
shr.u32 %r5952, %r5951, %r5948;
shl.b32 %r5953, %r8297, %r1623;
add.s32 %r8297, %r5952, %r5953;
$L__BB0_1236:
and.b32 %r5954, %r1614, -2147483648;
shr.u32 %r5955, %r8297, 30;
shl.b32 %r5956, %r8296, 2;
or.b32 %r5957, %r5955, %r5956;
shr.u32 %r5958, %r5957, 31;
shr.u32 %r5959, %r8296, 30;
add.s32 %r5960, %r5958, %r5959;
neg.s32 %r5961, %r5960;
setp.eq.s32 %p1055, %r5954, 0;
selp.b32 %r8298, %r5960, %r5961, %p1055;
setp.ne.s32 %p1056, %r5958, 0;
xor.b32 %r5962, %r5954, -2147483648;
selp.b32 %r5963, %r5962, %r5954, %p1056;
selp.b32 %r5964, -1, 0, %p1056;
xor.b32 %r5965, %r5957, %r5964;
shl.b32 %r5966, %r8297, 2;
xor.b32 %r5967, %r5966, %r5964;
cvt.u64.u32 %rd1926, %r5965;
cvt.u64.u32 %rd1927, %r5967;
bfi.b64 %rd1928, %rd1926, %rd1927, 32, 32;
cvt.rn.f64.s64 %fd165, %rd1928;
mul.f64 %fd166, %fd165, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4092, %fd166;
setp.eq.s32 %p1057, %r5963, 0;
neg.f32 %f4093, %f4092;
selp.f32 %f5749, %f4092, %f4093, %p1057;
$L__BB0_1238:
and.b32 %r1630, %r8298, 1;
setp.eq.s32 %p1058, %r1630, 0;
selp.f32 %f1452, %f5749, 0f3F800000, %p1058;
mul.rn.f32 %f1453, %f5749, %f5749;
mov.f32 %f5750, 0fB94D4153;
@%p1058 bra $L__BB0_1240;
mov.f32 %f4096, 0fBAB607ED;
mov.f32 %f4097, 0f37CBAC00;
fma.rn.f32 %f5750, %f4097, %f1453, %f4096;
$L__BB0_1240:
selp.f32 %f4098, 0f3C0885E4, 0f3D2AAABB, %p1058;
fma.rn.f32 %f4099, %f5750, %f1453, %f4098;
selp.f32 %f4100, 0fBE2AAAA8, 0fBEFFFFFF, %p1058;
fma.rn.f32 %f4101, %f4099, %f1453, %f4100;
mov.f32 %f4102, 0f00000000;
fma.rn.f32 %f4103, %f1453, %f1452, %f4102;
fma.rn.f32 %f5751, %f4101, %f4103, %f1452;
and.b32 %r5969, %r8298, 2;
setp.eq.s32 %p1060, %r5969, 0;
@%p1060 bra $L__BB0_1242;
mov.f32 %f4105, 0fBF800000;
fma.rn.f32 %f5751, %f5751, %f4105, %f4102;
$L__BB0_1242:
mul.f32 %f4106, %f1407, 0f3F22F983;
cvt.rni.s32.f32 %r8302, %f4106;
cvt.rn.f32.s32 %f4107, %r8302;
mov.f32 %f4108, 0fBFC90FDA;
fma.rn.f32 %f4109, %f4107, %f4108, %f1407;
mov.f32 %f4110, 0fB3A22168;
fma.rn.f32 %f4111, %f4107, %f4110, %f4109;
mov.f32 %f4112, 0fA7C234C5;
fma.rn.f32 %f5752, %f4107, %f4112, %f4111;
abs.f32 %f1460, %f1407;
setp.ltu.f32 %p1061, %f1460, 0f47CE4780;
@%p1061 bra $L__BB0_1250;
setp.eq.f32 %p1062, %f1460, 0f7F800000;
@%p1062 bra $L__BB0_1249;
bra.uni $L__BB0_1244;
$L__BB0_1249:
mov.f32 %f4115, 0f00000000;
mul.rn.f32 %f5752, %f1407, %f4115;
mov.u32 %r8302, 0;
bra.uni $L__BB0_1250;
$L__BB0_1244:
mov.b32 %r1632, %f1407;
shr.u32 %r5971, %r1632, 23;
and.b32 %r5972, %r5971, 255;
add.s32 %r1633, %r5972, -128;
shl.b32 %r5973, %r1632, 8;
or.b32 %r1634, %r5973, -2147483648;
shr.u32 %r1635, %r1633, 5;
mov.u64 %rd2671, 0;
mov.u32 %r8299, 0;
mov.u64 %rd1932, __cudart_i2opi_f;
mov.u64 %rd2672, %rd2671;
$L__BB0_1245:
.pragma "nounroll";
shl.b64 %rd1931, %rd2671, 2;
add.s64 %rd1933, %rd1932, %rd1931;
ld.global.nc.u32 %r5974, [%rd1933];
mad.wide.u32 %rd1934, %r5974, %r1634, %rd2672;
shr.u64 %rd2672, %rd1934, 32;
add.s64 %rd1935, %rd1, %rd1931;
st.local.u32 [%rd1935], %rd1934;
add.s32 %r8299, %r8299, 1;
cvt.s64.s32 %rd2671, %r8299;
setp.ne.s32 %p1063, %r8299, 6;
@%p1063 bra $L__BB0_1245;
st.local.u32 [%rd4], %rd2672;
mov.u32 %r5975, 4;
sub.s32 %r1638, %r5975, %r1635;
mov.u32 %r5976, 6;
sub.s32 %r5977, %r5976, %r1635;
mul.wide.s32 %rd1936, %r5977, 4;
add.s64 %rd1937, %rd1, %rd1936;
ld.local.u32 %r8300, [%rd1937];
ld.local.u32 %r8301, [%rd1937+-4];
and.b32 %r1641, %r1633, 31;
setp.eq.s32 %p1064, %r1641, 0;
@%p1064 bra $L__BB0_1248;
mov.u32 %r5978, 32;
sub.s32 %r5979, %r5978, %r1641;
shr.u32 %r5980, %r8301, %r5979;
shl.b32 %r5981, %r8300, %r1641;
add.s32 %r8300, %r5980, %r5981;
mul.wide.s32 %rd1938, %r1638, 4;
add.s64 %rd1939, %rd1, %rd1938;
ld.local.u32 %r5982, [%rd1939];
shr.u32 %r5983, %r5982, %r5979;
shl.b32 %r5984, %r8301, %r1641;
add.s32 %r8301, %r5983, %r5984;
$L__BB0_1248:
and.b32 %r5985, %r1632, -2147483648;
shr.u32 %r5986, %r8301, 30;
shl.b32 %r5987, %r8300, 2;
or.b32 %r5988, %r5986, %r5987;
shr.u32 %r5989, %r5988, 31;
shr.u32 %r5990, %r8300, 30;
add.s32 %r5991, %r5989, %r5990;
neg.s32 %r5992, %r5991;
setp.eq.s32 %p1065, %r5985, 0;
selp.b32 %r8302, %r5991, %r5992, %p1065;
setp.ne.s32 %p1066, %r5989, 0;
xor.b32 %r5993, %r5985, -2147483648;
selp.b32 %r5994, %r5993, %r5985, %p1066;
selp.b32 %r5995, -1, 0, %p1066;
xor.b32 %r5996, %r5988, %r5995;
shl.b32 %r5997, %r8301, 2;
xor.b32 %r5998, %r5997, %r5995;
cvt.u64.u32 %rd1940, %r5996;
cvt.u64.u32 %rd1941, %r5998;
bfi.b64 %rd1942, %rd1940, %rd1941, 32, 32;
cvt.rn.f64.s64 %fd167, %rd1942;
mul.f64 %fd168, %fd167, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4113, %fd168;
setp.eq.s32 %p1067, %r5994, 0;
neg.f32 %f4114, %f4113;
selp.f32 %f5752, %f4113, %f4114, %p1067;
$L__BB0_1250:
add.s32 %r1648, %r8302, 1;
and.b32 %r1649, %r1648, 1;
setp.eq.s32 %p1068, %r1649, 0;
selp.f32 %f1464, %f5752, 0f3F800000, %p1068;
mul.rn.f32 %f1465, %f5752, %f5752;
mov.f32 %f5753, 0fB94D4153;
@%p1068 bra $L__BB0_1252;
mov.f32 %f4117, 0fBAB607ED;
mov.f32 %f4118, 0f37CBAC00;
fma.rn.f32 %f5753, %f4118, %f1465, %f4117;
$L__BB0_1252:
selp.f32 %f4119, 0f3C0885E4, 0f3D2AAABB, %p1068;
fma.rn.f32 %f4120, %f5753, %f1465, %f4119;
selp.f32 %f4121, 0fBE2AAAA8, 0fBEFFFFFF, %p1068;
fma.rn.f32 %f4122, %f4120, %f1465, %f4121;
mov.f32 %f4123, 0f00000000;
fma.rn.f32 %f4124, %f1465, %f1464, %f4123;
fma.rn.f32 %f5754, %f4122, %f4124, %f1464;
and.b32 %r6000, %r1648, 2;
setp.eq.s32 %p1070, %r6000, 0;
@%p1070 bra $L__BB0_1254;
mov.f32 %f4126, 0fBF800000;
fma.rn.f32 %f5754, %f5754, %f4126, %f4123;
$L__BB0_1254:
add.f32 %f5797, %f5751, %f5754;
mul.f32 %f4127, %f1416, 0f3F22F983;
cvt.rni.s32.f32 %r8306, %f4127;
cvt.rn.f32.s32 %f4128, %r8306;
mov.f32 %f4129, 0fBFC90FDA;
fma.rn.f32 %f4130, %f4128, %f4129, %f1416;
mov.f32 %f4131, 0fB3A22168;
fma.rn.f32 %f4132, %f4128, %f4131, %f4130;
mov.f32 %f4133, 0fA7C234C5;
fma.rn.f32 %f5755, %f4128, %f4133, %f4132;
abs.f32 %f1473, %f1416;
setp.ltu.f32 %p1071, %f1473, 0f47CE4780;
@%p1071 bra $L__BB0_1262;
setp.eq.f32 %p1072, %f1473, 0f7F800000;
@%p1072 bra $L__BB0_1261;
bra.uni $L__BB0_1256;
$L__BB0_1261:
mov.f32 %f4136, 0f00000000;
mul.rn.f32 %f5755, %f1416, %f4136;
mov.u32 %r8306, 0;
bra.uni $L__BB0_1262;
$L__BB0_1256:
mov.b32 %r1651, %f1416;
shr.u32 %r6002, %r1651, 23;
and.b32 %r6003, %r6002, 255;
add.s32 %r1652, %r6003, -128;
shl.b32 %r6004, %r1651, 8;
or.b32 %r1653, %r6004, -2147483648;
shr.u32 %r1654, %r1652, 5;
mov.u64 %rd2673, 0;
mov.u32 %r8303, 0;
mov.u64 %rd1946, __cudart_i2opi_f;
mov.u64 %rd2674, %rd2673;
$L__BB0_1257:
.pragma "nounroll";
shl.b64 %rd1945, %rd2673, 2;
add.s64 %rd1947, %rd1946, %rd1945;
ld.global.nc.u32 %r6005, [%rd1947];
mad.wide.u32 %rd1948, %r6005, %r1653, %rd2674;
shr.u64 %rd2674, %rd1948, 32;
add.s64 %rd1949, %rd1, %rd1945;
st.local.u32 [%rd1949], %rd1948;
add.s32 %r8303, %r8303, 1;
cvt.s64.s32 %rd2673, %r8303;
setp.ne.s32 %p1073, %r8303, 6;
@%p1073 bra $L__BB0_1257;
st.local.u32 [%rd4], %rd2674;
mov.u32 %r6006, 4;
sub.s32 %r1657, %r6006, %r1654;
mov.u32 %r6007, 6;
sub.s32 %r6008, %r6007, %r1654;
mul.wide.s32 %rd1950, %r6008, 4;
add.s64 %rd1951, %rd1, %rd1950;
ld.local.u32 %r8304, [%rd1951];
ld.local.u32 %r8305, [%rd1951+-4];
and.b32 %r1660, %r1652, 31;
setp.eq.s32 %p1074, %r1660, 0;
@%p1074 bra $L__BB0_1260;
mov.u32 %r6009, 32;
sub.s32 %r6010, %r6009, %r1660;
shr.u32 %r6011, %r8305, %r6010;
shl.b32 %r6012, %r8304, %r1660;
add.s32 %r8304, %r6011, %r6012;
mul.wide.s32 %rd1952, %r1657, 4;
add.s64 %rd1953, %rd1, %rd1952;
ld.local.u32 %r6013, [%rd1953];
shr.u32 %r6014, %r6013, %r6010;
shl.b32 %r6015, %r8305, %r1660;
add.s32 %r8305, %r6014, %r6015;
$L__BB0_1260:
and.b32 %r6016, %r1651, -2147483648;
shr.u32 %r6017, %r8305, 30;
shl.b32 %r6018, %r8304, 2;
or.b32 %r6019, %r6017, %r6018;
shr.u32 %r6020, %r6019, 31;
shr.u32 %r6021, %r8304, 30;
add.s32 %r6022, %r6020, %r6021;
neg.s32 %r6023, %r6022;
setp.eq.s32 %p1075, %r6016, 0;
selp.b32 %r8306, %r6022, %r6023, %p1075;
setp.ne.s32 %p1076, %r6020, 0;
xor.b32 %r6024, %r6016, -2147483648;
selp.b32 %r6025, %r6024, %r6016, %p1076;
selp.b32 %r6026, -1, 0, %p1076;
xor.b32 %r6027, %r6019, %r6026;
shl.b32 %r6028, %r8305, 2;
xor.b32 %r6029, %r6028, %r6026;
cvt.u64.u32 %rd1954, %r6027;
cvt.u64.u32 %rd1955, %r6029;
bfi.b64 %rd1956, %rd1954, %rd1955, 32, 32;
cvt.rn.f64.s64 %fd169, %rd1956;
mul.f64 %fd170, %fd169, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4134, %fd170;
setp.eq.s32 %p1077, %r6025, 0;
neg.f32 %f4135, %f4134;
selp.f32 %f5755, %f4134, %f4135, %p1077;
$L__BB0_1262:
and.b32 %r1667, %r8306, 1;
setp.eq.s32 %p1078, %r1667, 0;
selp.f32 %f1477, %f5755, 0f3F800000, %p1078;
mul.rn.f32 %f1478, %f5755, %f5755;
mov.f32 %f5756, 0fB94D4153;
@%p1078 bra $L__BB0_1264;
mov.f32 %f4138, 0fBAB607ED;
mov.f32 %f4139, 0f37CBAC00;
fma.rn.f32 %f5756, %f4139, %f1478, %f4138;
$L__BB0_1264:
selp.f32 %f4140, 0f3C0885E4, 0f3D2AAABB, %p1078;
fma.rn.f32 %f4141, %f5756, %f1478, %f4140;
selp.f32 %f4142, 0fBE2AAAA8, 0fBEFFFFFF, %p1078;
fma.rn.f32 %f4143, %f4141, %f1478, %f4142;
mov.f32 %f4144, 0f00000000;
fma.rn.f32 %f4145, %f1478, %f1477, %f4144;
fma.rn.f32 %f5757, %f4143, %f4145, %f1477;
and.b32 %r6031, %r8306, 2;
setp.eq.s32 %p1080, %r6031, 0;
@%p1080 bra $L__BB0_1266;
mov.f32 %f4147, 0fBF800000;
fma.rn.f32 %f5757, %f5757, %f4147, %f4144;
$L__BB0_1266:
mul.f32 %f4148, %f1408, 0f3F22F983;
cvt.rni.s32.f32 %r8310, %f4148;
cvt.rn.f32.s32 %f4149, %r8310;
mov.f32 %f4150, 0fBFC90FDA;
fma.rn.f32 %f4151, %f4149, %f4150, %f1408;
mov.f32 %f4152, 0fB3A22168;
fma.rn.f32 %f4153, %f4149, %f4152, %f4151;
mov.f32 %f4154, 0fA7C234C5;
fma.rn.f32 %f5758, %f4149, %f4154, %f4153;
abs.f32 %f1485, %f1408;
setp.ltu.f32 %p1081, %f1485, 0f47CE4780;
@%p1081 bra $L__BB0_1274;
setp.eq.f32 %p1082, %f1485, 0f7F800000;
@%p1082 bra $L__BB0_1273;
bra.uni $L__BB0_1268;
$L__BB0_1273:
mov.f32 %f4157, 0f00000000;
mul.rn.f32 %f5758, %f1408, %f4157;
mov.u32 %r8310, 0;
bra.uni $L__BB0_1274;
$L__BB0_1268:
mov.b32 %r1669, %f1408;
shr.u32 %r6033, %r1669, 23;
and.b32 %r6034, %r6033, 255;
add.s32 %r1670, %r6034, -128;
shl.b32 %r6035, %r1669, 8;
or.b32 %r1671, %r6035, -2147483648;
shr.u32 %r1672, %r1670, 5;
mov.u64 %rd2675, 0;
mov.u32 %r8307, 0;
mov.u64 %rd1960, __cudart_i2opi_f;
mov.u64 %rd2676, %rd2675;
$L__BB0_1269:
.pragma "nounroll";
shl.b64 %rd1959, %rd2675, 2;
add.s64 %rd1961, %rd1960, %rd1959;
ld.global.nc.u32 %r6036, [%rd1961];
mad.wide.u32 %rd1962, %r6036, %r1671, %rd2676;
shr.u64 %rd2676, %rd1962, 32;
add.s64 %rd1963, %rd1, %rd1959;
st.local.u32 [%rd1963], %rd1962;
add.s32 %r8307, %r8307, 1;
cvt.s64.s32 %rd2675, %r8307;
setp.ne.s32 %p1083, %r8307, 6;
@%p1083 bra $L__BB0_1269;
st.local.u32 [%rd4], %rd2676;
mov.u32 %r6037, 4;
sub.s32 %r1675, %r6037, %r1672;
mov.u32 %r6038, 6;
sub.s32 %r6039, %r6038, %r1672;
mul.wide.s32 %rd1964, %r6039, 4;
add.s64 %rd1965, %rd1, %rd1964;
ld.local.u32 %r8308, [%rd1965];
ld.local.u32 %r8309, [%rd1965+-4];
and.b32 %r1678, %r1670, 31;
setp.eq.s32 %p1084, %r1678, 0;
@%p1084 bra $L__BB0_1272;
mov.u32 %r6040, 32;
sub.s32 %r6041, %r6040, %r1678;
shr.u32 %r6042, %r8309, %r6041;
shl.b32 %r6043, %r8308, %r1678;
add.s32 %r8308, %r6042, %r6043;
mul.wide.s32 %rd1966, %r1675, 4;
add.s64 %rd1967, %rd1, %rd1966;
ld.local.u32 %r6044, [%rd1967];
shr.u32 %r6045, %r6044, %r6041;
shl.b32 %r6046, %r8309, %r1678;
add.s32 %r8309, %r6045, %r6046;
$L__BB0_1272:
and.b32 %r6047, %r1669, -2147483648;
shr.u32 %r6048, %r8309, 30;
shl.b32 %r6049, %r8308, 2;
or.b32 %r6050, %r6048, %r6049;
shr.u32 %r6051, %r6050, 31;
shr.u32 %r6052, %r8308, 30;
add.s32 %r6053, %r6051, %r6052;
neg.s32 %r6054, %r6053;
setp.eq.s32 %p1085, %r6047, 0;
selp.b32 %r8310, %r6053, %r6054, %p1085;
setp.ne.s32 %p1086, %r6051, 0;
xor.b32 %r6055, %r6047, -2147483648;
selp.b32 %r6056, %r6055, %r6047, %p1086;
selp.b32 %r6057, -1, 0, %p1086;
xor.b32 %r6058, %r6050, %r6057;
shl.b32 %r6059, %r8309, 2;
xor.b32 %r6060, %r6059, %r6057;
cvt.u64.u32 %rd1968, %r6058;
cvt.u64.u32 %rd1969, %r6060;
bfi.b64 %rd1970, %rd1968, %rd1969, 32, 32;
cvt.rn.f64.s64 %fd171, %rd1970;
mul.f64 %fd172, %fd171, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4155, %fd172;
setp.eq.s32 %p1087, %r6056, 0;
neg.f32 %f4156, %f4155;
selp.f32 %f5758, %f4155, %f4156, %p1087;
$L__BB0_1274:
add.s32 %r1685, %r8310, 1;
and.b32 %r1686, %r1685, 1;
setp.eq.s32 %p1088, %r1686, 0;
selp.f32 %f1489, %f5758, 0f3F800000, %p1088;
mul.rn.f32 %f1490, %f5758, %f5758;
mov.f32 %f5759, 0fB94D4153;
@%p1088 bra $L__BB0_1276;
mov.f32 %f4159, 0fBAB607ED;
mov.f32 %f4160, 0f37CBAC00;
fma.rn.f32 %f5759, %f4160, %f1490, %f4159;
$L__BB0_1276:
selp.f32 %f4161, 0f3C0885E4, 0f3D2AAABB, %p1088;
fma.rn.f32 %f4162, %f5759, %f1490, %f4161;
selp.f32 %f4163, 0fBE2AAAA8, 0fBEFFFFFF, %p1088;
fma.rn.f32 %f4164, %f4162, %f1490, %f4163;
mov.f32 %f4165, 0f00000000;
fma.rn.f32 %f4166, %f1490, %f1489, %f4165;
fma.rn.f32 %f5760, %f4164, %f4166, %f1489;
and.b32 %r6062, %r1685, 2;
setp.eq.s32 %p1090, %r6062, 0;
@%p1090 bra $L__BB0_1278;
mov.f32 %f4168, 0fBF800000;
fma.rn.f32 %f5760, %f5760, %f4168, %f4165;
$L__BB0_1278:
add.f32 %f5796, %f5757, %f5760;
mul.f32 %f4169, %f1417, 0f3F22F983;
cvt.rni.s32.f32 %r8314, %f4169;
cvt.rn.f32.s32 %f4170, %r8314;
mov.f32 %f4171, 0fBFC90FDA;
fma.rn.f32 %f4172, %f4170, %f4171, %f1417;
mov.f32 %f4173, 0fB3A22168;
fma.rn.f32 %f4174, %f4170, %f4173, %f4172;
mov.f32 %f4175, 0fA7C234C5;
fma.rn.f32 %f5761, %f4170, %f4175, %f4174;
abs.f32 %f1498, %f1417;
setp.ltu.f32 %p1091, %f1498, 0f47CE4780;
@%p1091 bra $L__BB0_1286;
setp.eq.f32 %p1092, %f1498, 0f7F800000;
@%p1092 bra $L__BB0_1285;
bra.uni $L__BB0_1280;
$L__BB0_1285:
mov.f32 %f4178, 0f00000000;
mul.rn.f32 %f5761, %f1417, %f4178;
mov.u32 %r8314, 0;
bra.uni $L__BB0_1286;
$L__BB0_1280:
mov.b32 %r1688, %f1417;
shr.u32 %r6064, %r1688, 23;
and.b32 %r6065, %r6064, 255;
add.s32 %r1689, %r6065, -128;
shl.b32 %r6066, %r1688, 8;
or.b32 %r1690, %r6066, -2147483648;
shr.u32 %r1691, %r1689, 5;
mov.u64 %rd2677, 0;
mov.u32 %r8311, 0;
mov.u64 %rd1974, __cudart_i2opi_f;
mov.u64 %rd2678, %rd2677;
$L__BB0_1281:
.pragma "nounroll";
shl.b64 %rd1973, %rd2677, 2;
add.s64 %rd1975, %rd1974, %rd1973;
ld.global.nc.u32 %r6067, [%rd1975];
mad.wide.u32 %rd1976, %r6067, %r1690, %rd2678;
shr.u64 %rd2678, %rd1976, 32;
add.s64 %rd1977, %rd1, %rd1973;
st.local.u32 [%rd1977], %rd1976;
add.s32 %r8311, %r8311, 1;
cvt.s64.s32 %rd2677, %r8311;
setp.ne.s32 %p1093, %r8311, 6;
@%p1093 bra $L__BB0_1281;
st.local.u32 [%rd4], %rd2678;
mov.u32 %r6068, 4;
sub.s32 %r1694, %r6068, %r1691;
mov.u32 %r6069, 6;
sub.s32 %r6070, %r6069, %r1691;
mul.wide.s32 %rd1978, %r6070, 4;
add.s64 %rd1979, %rd1, %rd1978;
ld.local.u32 %r8312, [%rd1979];
ld.local.u32 %r8313, [%rd1979+-4];
and.b32 %r1697, %r1689, 31;
setp.eq.s32 %p1094, %r1697, 0;
@%p1094 bra $L__BB0_1284;
mov.u32 %r6071, 32;
sub.s32 %r6072, %r6071, %r1697;
shr.u32 %r6073, %r8313, %r6072;
shl.b32 %r6074, %r8312, %r1697;
add.s32 %r8312, %r6073, %r6074;
mul.wide.s32 %rd1980, %r1694, 4;
add.s64 %rd1981, %rd1, %rd1980;
ld.local.u32 %r6075, [%rd1981];
shr.u32 %r6076, %r6075, %r6072;
shl.b32 %r6077, %r8313, %r1697;
add.s32 %r8313, %r6076, %r6077;
$L__BB0_1284:
and.b32 %r6078, %r1688, -2147483648;
shr.u32 %r6079, %r8313, 30;
shl.b32 %r6080, %r8312, 2;
or.b32 %r6081, %r6079, %r6080;
shr.u32 %r6082, %r6081, 31;
shr.u32 %r6083, %r8312, 30;
add.s32 %r6084, %r6082, %r6083;
neg.s32 %r6085, %r6084;
setp.eq.s32 %p1095, %r6078, 0;
selp.b32 %r8314, %r6084, %r6085, %p1095;
setp.ne.s32 %p1096, %r6082, 0;
xor.b32 %r6086, %r6078, -2147483648;
selp.b32 %r6087, %r6086, %r6078, %p1096;
selp.b32 %r6088, -1, 0, %p1096;
xor.b32 %r6089, %r6081, %r6088;
shl.b32 %r6090, %r8313, 2;
xor.b32 %r6091, %r6090, %r6088;
cvt.u64.u32 %rd1982, %r6089;
cvt.u64.u32 %rd1983, %r6091;
bfi.b64 %rd1984, %rd1982, %rd1983, 32, 32;
cvt.rn.f64.s64 %fd173, %rd1984;
mul.f64 %fd174, %fd173, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4176, %fd174;
setp.eq.s32 %p1097, %r6087, 0;
neg.f32 %f4177, %f4176;
selp.f32 %f5761, %f4176, %f4177, %p1097;
$L__BB0_1286:
and.b32 %r1704, %r8314, 1;
setp.eq.s32 %p1098, %r1704, 0;
selp.f32 %f1502, %f5761, 0f3F800000, %p1098;
mul.rn.f32 %f1503, %f5761, %f5761;
mov.f32 %f5762, 0fB94D4153;
@%p1098 bra $L__BB0_1288;
mov.f32 %f4180, 0fBAB607ED;
mov.f32 %f4181, 0f37CBAC00;
fma.rn.f32 %f5762, %f4181, %f1503, %f4180;
$L__BB0_1288:
selp.f32 %f4182, 0f3C0885E4, 0f3D2AAABB, %p1098;
fma.rn.f32 %f4183, %f5762, %f1503, %f4182;
selp.f32 %f4184, 0fBE2AAAA8, 0fBEFFFFFF, %p1098;
fma.rn.f32 %f4185, %f4183, %f1503, %f4184;
mov.f32 %f4186, 0f00000000;
fma.rn.f32 %f4187, %f1503, %f1502, %f4186;
fma.rn.f32 %f5763, %f4185, %f4187, %f1502;
and.b32 %r6093, %r8314, 2;
setp.eq.s32 %p1100, %r6093, 0;
@%p1100 bra $L__BB0_1290;
mov.f32 %f4189, 0fBF800000;
fma.rn.f32 %f5763, %f5763, %f4189, %f4186;
$L__BB0_1290:
mul.f32 %f4190, %f1409, 0f3F22F983;
cvt.rni.s32.f32 %r8318, %f4190;
cvt.rn.f32.s32 %f4191, %r8318;
mov.f32 %f4192, 0fBFC90FDA;
fma.rn.f32 %f4193, %f4191, %f4192, %f1409;
mov.f32 %f4194, 0fB3A22168;
fma.rn.f32 %f4195, %f4191, %f4194, %f4193;
mov.f32 %f4196, 0fA7C234C5;
fma.rn.f32 %f5764, %f4191, %f4196, %f4195;
abs.f32 %f1510, %f1409;
setp.ltu.f32 %p1101, %f1510, 0f47CE4780;
@%p1101 bra $L__BB0_1298;
setp.eq.f32 %p1102, %f1510, 0f7F800000;
@%p1102 bra $L__BB0_1297;
bra.uni $L__BB0_1292;
$L__BB0_1297:
mov.f32 %f4199, 0f00000000;
mul.rn.f32 %f5764, %f1409, %f4199;
mov.u32 %r8318, 0;
bra.uni $L__BB0_1298;
$L__BB0_1292:
mov.b32 %r1706, %f1409;
shr.u32 %r6095, %r1706, 23;
and.b32 %r6096, %r6095, 255;
add.s32 %r1707, %r6096, -128;
shl.b32 %r6097, %r1706, 8;
or.b32 %r1708, %r6097, -2147483648;
shr.u32 %r1709, %r1707, 5;
mov.u64 %rd2679, 0;
mov.u32 %r8315, 0;
mov.u64 %rd1988, __cudart_i2opi_f;
mov.u64 %rd2680, %rd2679;
$L__BB0_1293:
.pragma "nounroll";
shl.b64 %rd1987, %rd2679, 2;
add.s64 %rd1989, %rd1988, %rd1987;
ld.global.nc.u32 %r6098, [%rd1989];
mad.wide.u32 %rd1990, %r6098, %r1708, %rd2680;
shr.u64 %rd2680, %rd1990, 32;
add.s64 %rd1991, %rd1, %rd1987;
st.local.u32 [%rd1991], %rd1990;
add.s32 %r8315, %r8315, 1;
cvt.s64.s32 %rd2679, %r8315;
setp.ne.s32 %p1103, %r8315, 6;
@%p1103 bra $L__BB0_1293;
st.local.u32 [%rd4], %rd2680;
mov.u32 %r6099, 4;
sub.s32 %r1712, %r6099, %r1709;
mov.u32 %r6100, 6;
sub.s32 %r6101, %r6100, %r1709;
mul.wide.s32 %rd1992, %r6101, 4;
add.s64 %rd1993, %rd1, %rd1992;
ld.local.u32 %r8316, [%rd1993];
ld.local.u32 %r8317, [%rd1993+-4];
and.b32 %r1715, %r1707, 31;
setp.eq.s32 %p1104, %r1715, 0;
@%p1104 bra $L__BB0_1296;
mov.u32 %r6102, 32;
sub.s32 %r6103, %r6102, %r1715;
shr.u32 %r6104, %r8317, %r6103;
shl.b32 %r6105, %r8316, %r1715;
add.s32 %r8316, %r6104, %r6105;
mul.wide.s32 %rd1994, %r1712, 4;
add.s64 %rd1995, %rd1, %rd1994;
ld.local.u32 %r6106, [%rd1995];
shr.u32 %r6107, %r6106, %r6103;
shl.b32 %r6108, %r8317, %r1715;
add.s32 %r8317, %r6107, %r6108;
$L__BB0_1296:
and.b32 %r6109, %r1706, -2147483648;
shr.u32 %r6110, %r8317, 30;
shl.b32 %r6111, %r8316, 2;
or.b32 %r6112, %r6110, %r6111;
shr.u32 %r6113, %r6112, 31;
shr.u32 %r6114, %r8316, 30;
add.s32 %r6115, %r6113, %r6114;
neg.s32 %r6116, %r6115;
setp.eq.s32 %p1105, %r6109, 0;
selp.b32 %r8318, %r6115, %r6116, %p1105;
setp.ne.s32 %p1106, %r6113, 0;
xor.b32 %r6117, %r6109, -2147483648;
selp.b32 %r6118, %r6117, %r6109, %p1106;
selp.b32 %r6119, -1, 0, %p1106;
xor.b32 %r6120, %r6112, %r6119;
shl.b32 %r6121, %r8317, 2;
xor.b32 %r6122, %r6121, %r6119;
cvt.u64.u32 %rd1996, %r6120;
cvt.u64.u32 %rd1997, %r6122;
bfi.b64 %rd1998, %rd1996, %rd1997, 32, 32;
cvt.rn.f64.s64 %fd175, %rd1998;
mul.f64 %fd176, %fd175, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4197, %fd176;
setp.eq.s32 %p1107, %r6118, 0;
neg.f32 %f4198, %f4197;
selp.f32 %f5764, %f4197, %f4198, %p1107;
$L__BB0_1298:
add.s32 %r1722, %r8318, 1;
and.b32 %r1723, %r1722, 1;
setp.eq.s32 %p1108, %r1723, 0;
selp.f32 %f1514, %f5764, 0f3F800000, %p1108;
mul.rn.f32 %f1515, %f5764, %f5764;
mov.f32 %f5765, 0fB94D4153;
@%p1108 bra $L__BB0_1300;
mov.f32 %f4201, 0fBAB607ED;
mov.f32 %f4202, 0f37CBAC00;
fma.rn.f32 %f5765, %f4202, %f1515, %f4201;
$L__BB0_1300:
selp.f32 %f4203, 0f3C0885E4, 0f3D2AAABB, %p1108;
fma.rn.f32 %f4204, %f5765, %f1515, %f4203;
selp.f32 %f4205, 0fBE2AAAA8, 0fBEFFFFFF, %p1108;
fma.rn.f32 %f4206, %f4204, %f1515, %f4205;
mov.f32 %f4207, 0f00000000;
fma.rn.f32 %f4208, %f1515, %f1514, %f4207;
fma.rn.f32 %f5766, %f4206, %f4208, %f1514;
and.b32 %r6124, %r1722, 2;
setp.eq.s32 %p1110, %r6124, 0;
@%p1110 bra $L__BB0_1302;
mov.f32 %f4210, 0fBF800000;
fma.rn.f32 %f5766, %f5766, %f4210, %f4207;
$L__BB0_1302:
add.f32 %f5795, %f5763, %f5766;
mul.f32 %f4211, %f1418, 0f3F22F983;
cvt.rni.s32.f32 %r8322, %f4211;
cvt.rn.f32.s32 %f4212, %r8322;
mov.f32 %f4213, 0fBFC90FDA;
fma.rn.f32 %f4214, %f4212, %f4213, %f1418;
mov.f32 %f4215, 0fB3A22168;
fma.rn.f32 %f4216, %f4212, %f4215, %f4214;
mov.f32 %f4217, 0fA7C234C5;
fma.rn.f32 %f5767, %f4212, %f4217, %f4216;
abs.f32 %f1523, %f1418;
setp.ltu.f32 %p1111, %f1523, 0f47CE4780;
@%p1111 bra $L__BB0_1310;
setp.eq.f32 %p1112, %f1523, 0f7F800000;
@%p1112 bra $L__BB0_1309;
bra.uni $L__BB0_1304;
$L__BB0_1309:
mov.f32 %f4220, 0f00000000;
mul.rn.f32 %f5767, %f1418, %f4220;
mov.u32 %r8322, 0;
bra.uni $L__BB0_1310;
$L__BB0_1304:
mov.b32 %r1725, %f1418;
shr.u32 %r6126, %r1725, 23;
and.b32 %r6127, %r6126, 255;
add.s32 %r1726, %r6127, -128;
shl.b32 %r6128, %r1725, 8;
or.b32 %r1727, %r6128, -2147483648;
shr.u32 %r1728, %r1726, 5;
mov.u64 %rd2681, 0;
mov.u32 %r8319, 0;
mov.u64 %rd2002, __cudart_i2opi_f;
mov.u64 %rd2682, %rd2681;
$L__BB0_1305:
.pragma "nounroll";
shl.b64 %rd2001, %rd2681, 2;
add.s64 %rd2003, %rd2002, %rd2001;
ld.global.nc.u32 %r6129, [%rd2003];
mad.wide.u32 %rd2004, %r6129, %r1727, %rd2682;
shr.u64 %rd2682, %rd2004, 32;
add.s64 %rd2005, %rd1, %rd2001;
st.local.u32 [%rd2005], %rd2004;
add.s32 %r8319, %r8319, 1;
cvt.s64.s32 %rd2681, %r8319;
setp.ne.s32 %p1113, %r8319, 6;
@%p1113 bra $L__BB0_1305;
st.local.u32 [%rd4], %rd2682;
mov.u32 %r6130, 4;
sub.s32 %r1731, %r6130, %r1728;
mov.u32 %r6131, 6;
sub.s32 %r6132, %r6131, %r1728;
mul.wide.s32 %rd2006, %r6132, 4;
add.s64 %rd2007, %rd1, %rd2006;
ld.local.u32 %r8320, [%rd2007];
ld.local.u32 %r8321, [%rd2007+-4];
and.b32 %r1734, %r1726, 31;
setp.eq.s32 %p1114, %r1734, 0;
@%p1114 bra $L__BB0_1308;
mov.u32 %r6133, 32;
sub.s32 %r6134, %r6133, %r1734;
shr.u32 %r6135, %r8321, %r6134;
shl.b32 %r6136, %r8320, %r1734;
add.s32 %r8320, %r6135, %r6136;
mul.wide.s32 %rd2008, %r1731, 4;
add.s64 %rd2009, %rd1, %rd2008;
ld.local.u32 %r6137, [%rd2009];
shr.u32 %r6138, %r6137, %r6134;
shl.b32 %r6139, %r8321, %r1734;
add.s32 %r8321, %r6138, %r6139;
$L__BB0_1308:
and.b32 %r6140, %r1725, -2147483648;
shr.u32 %r6141, %r8321, 30;
shl.b32 %r6142, %r8320, 2;
or.b32 %r6143, %r6141, %r6142;
shr.u32 %r6144, %r6143, 31;
shr.u32 %r6145, %r8320, 30;
add.s32 %r6146, %r6144, %r6145;
neg.s32 %r6147, %r6146;
setp.eq.s32 %p1115, %r6140, 0;
selp.b32 %r8322, %r6146, %r6147, %p1115;
setp.ne.s32 %p1116, %r6144, 0;
xor.b32 %r6148, %r6140, -2147483648;
selp.b32 %r6149, %r6148, %r6140, %p1116;
selp.b32 %r6150, -1, 0, %p1116;
xor.b32 %r6151, %r6143, %r6150;
shl.b32 %r6152, %r8321, 2;
xor.b32 %r6153, %r6152, %r6150;
cvt.u64.u32 %rd2010, %r6151;
cvt.u64.u32 %rd2011, %r6153;
bfi.b64 %rd2012, %rd2010, %rd2011, 32, 32;
cvt.rn.f64.s64 %fd177, %rd2012;
mul.f64 %fd178, %fd177, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4218, %fd178;
setp.eq.s32 %p1117, %r6149, 0;
neg.f32 %f4219, %f4218;
selp.f32 %f5767, %f4218, %f4219, %p1117;
$L__BB0_1310:
and.b32 %r1741, %r8322, 1;
setp.eq.s32 %p1118, %r1741, 0;
selp.f32 %f1527, %f5767, 0f3F800000, %p1118;
mul.rn.f32 %f1528, %f5767, %f5767;
mov.f32 %f5768, 0fB94D4153;
@%p1118 bra $L__BB0_1312;
mov.f32 %f4222, 0fBAB607ED;
mov.f32 %f4223, 0f37CBAC00;
fma.rn.f32 %f5768, %f4223, %f1528, %f4222;
$L__BB0_1312:
selp.f32 %f4224, 0f3C0885E4, 0f3D2AAABB, %p1118;
fma.rn.f32 %f4225, %f5768, %f1528, %f4224;
selp.f32 %f4226, 0fBE2AAAA8, 0fBEFFFFFF, %p1118;
fma.rn.f32 %f4227, %f4225, %f1528, %f4226;
mov.f32 %f4228, 0f00000000;
fma.rn.f32 %f4229, %f1528, %f1527, %f4228;
fma.rn.f32 %f5769, %f4227, %f4229, %f1527;
and.b32 %r6155, %r8322, 2;
setp.eq.s32 %p1120, %r6155, 0;
@%p1120 bra $L__BB0_1314;
mov.f32 %f4231, 0fBF800000;
fma.rn.f32 %f5769, %f5769, %f4231, %f4228;
$L__BB0_1314:
mul.f32 %f4232, %f1410, 0f3F22F983;
cvt.rni.s32.f32 %r8326, %f4232;
cvt.rn.f32.s32 %f4233, %r8326;
mov.f32 %f4234, 0fBFC90FDA;
fma.rn.f32 %f4235, %f4233, %f4234, %f1410;
mov.f32 %f4236, 0fB3A22168;
fma.rn.f32 %f4237, %f4233, %f4236, %f4235;
mov.f32 %f4238, 0fA7C234C5;
fma.rn.f32 %f5770, %f4233, %f4238, %f4237;
abs.f32 %f1535, %f1410;
setp.ltu.f32 %p1121, %f1535, 0f47CE4780;
@%p1121 bra $L__BB0_1322;
setp.eq.f32 %p1122, %f1535, 0f7F800000;
@%p1122 bra $L__BB0_1321;
bra.uni $L__BB0_1316;
$L__BB0_1321:
mov.f32 %f4241, 0f00000000;
mul.rn.f32 %f5770, %f1410, %f4241;
mov.u32 %r8326, 0;
bra.uni $L__BB0_1322;
$L__BB0_1316:
mov.b32 %r1743, %f1410;
shr.u32 %r6157, %r1743, 23;
and.b32 %r6158, %r6157, 255;
add.s32 %r1744, %r6158, -128;
shl.b32 %r6159, %r1743, 8;
or.b32 %r1745, %r6159, -2147483648;
shr.u32 %r1746, %r1744, 5;
mov.u64 %rd2683, 0;
mov.u32 %r8323, 0;
mov.u64 %rd2016, __cudart_i2opi_f;
mov.u64 %rd2684, %rd2683;
$L__BB0_1317:
.pragma "nounroll";
shl.b64 %rd2015, %rd2683, 2;
add.s64 %rd2017, %rd2016, %rd2015;
ld.global.nc.u32 %r6160, [%rd2017];
mad.wide.u32 %rd2018, %r6160, %r1745, %rd2684;
shr.u64 %rd2684, %rd2018, 32;
add.s64 %rd2019, %rd1, %rd2015;
st.local.u32 [%rd2019], %rd2018;
add.s32 %r8323, %r8323, 1;
cvt.s64.s32 %rd2683, %r8323;
setp.ne.s32 %p1123, %r8323, 6;
@%p1123 bra $L__BB0_1317;
st.local.u32 [%rd4], %rd2684;
mov.u32 %r6161, 4;
sub.s32 %r1749, %r6161, %r1746;
mov.u32 %r6162, 6;
sub.s32 %r6163, %r6162, %r1746;
mul.wide.s32 %rd2020, %r6163, 4;
add.s64 %rd2021, %rd1, %rd2020;
ld.local.u32 %r8324, [%rd2021];
ld.local.u32 %r8325, [%rd2021+-4];
and.b32 %r1752, %r1744, 31;
setp.eq.s32 %p1124, %r1752, 0;
@%p1124 bra $L__BB0_1320;
mov.u32 %r6164, 32;
sub.s32 %r6165, %r6164, %r1752;
shr.u32 %r6166, %r8325, %r6165;
shl.b32 %r6167, %r8324, %r1752;
add.s32 %r8324, %r6166, %r6167;
mul.wide.s32 %rd2022, %r1749, 4;
add.s64 %rd2023, %rd1, %rd2022;
ld.local.u32 %r6168, [%rd2023];
shr.u32 %r6169, %r6168, %r6165;
shl.b32 %r6170, %r8325, %r1752;
add.s32 %r8325, %r6169, %r6170;
$L__BB0_1320:
and.b32 %r6171, %r1743, -2147483648;
shr.u32 %r6172, %r8325, 30;
shl.b32 %r6173, %r8324, 2;
or.b32 %r6174, %r6172, %r6173;
shr.u32 %r6175, %r6174, 31;
shr.u32 %r6176, %r8324, 30;
add.s32 %r6177, %r6175, %r6176;
neg.s32 %r6178, %r6177;
setp.eq.s32 %p1125, %r6171, 0;
selp.b32 %r8326, %r6177, %r6178, %p1125;
setp.ne.s32 %p1126, %r6175, 0;
xor.b32 %r6179, %r6171, -2147483648;
selp.b32 %r6180, %r6179, %r6171, %p1126;
selp.b32 %r6181, -1, 0, %p1126;
xor.b32 %r6182, %r6174, %r6181;
shl.b32 %r6183, %r8325, 2;
xor.b32 %r6184, %r6183, %r6181;
cvt.u64.u32 %rd2024, %r6182;
cvt.u64.u32 %rd2025, %r6184;
bfi.b64 %rd2026, %rd2024, %rd2025, 32, 32;
cvt.rn.f64.s64 %fd179, %rd2026;
mul.f64 %fd180, %fd179, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4239, %fd180;
setp.eq.s32 %p1127, %r6180, 0;
neg.f32 %f4240, %f4239;
selp.f32 %f5770, %f4239, %f4240, %p1127;
$L__BB0_1322:
add.s32 %r1759, %r8326, 1;
and.b32 %r1760, %r1759, 1;
setp.eq.s32 %p1128, %r1760, 0;
selp.f32 %f1539, %f5770, 0f3F800000, %p1128;
mul.rn.f32 %f1540, %f5770, %f5770;
mov.f32 %f5771, 0fB94D4153;
@%p1128 bra $L__BB0_1324;
mov.f32 %f4243, 0fBAB607ED;
mov.f32 %f4244, 0f37CBAC00;
fma.rn.f32 %f5771, %f4244, %f1540, %f4243;
$L__BB0_1324:
selp.f32 %f4245, 0f3C0885E4, 0f3D2AAABB, %p1128;
fma.rn.f32 %f4246, %f5771, %f1540, %f4245;
selp.f32 %f4247, 0fBE2AAAA8, 0fBEFFFFFF, %p1128;
fma.rn.f32 %f4248, %f4246, %f1540, %f4247;
mov.f32 %f4249, 0f00000000;
fma.rn.f32 %f4250, %f1540, %f1539, %f4249;
fma.rn.f32 %f5772, %f4248, %f4250, %f1539;
and.b32 %r6186, %r1759, 2;
setp.eq.s32 %p1130, %r6186, 0;
@%p1130 bra $L__BB0_1326;
mov.f32 %f4252, 0fBF800000;
fma.rn.f32 %f5772, %f5772, %f4252, %f4249;
$L__BB0_1326:
add.f32 %f5794, %f5769, %f5772;
mul.f32 %f4253, %f1419, 0f3F22F983;
cvt.rni.s32.f32 %r8330, %f4253;
cvt.rn.f32.s32 %f4254, %r8330;
mov.f32 %f4255, 0fBFC90FDA;
fma.rn.f32 %f4256, %f4254, %f4255, %f1419;
mov.f32 %f4257, 0fB3A22168;
fma.rn.f32 %f4258, %f4254, %f4257, %f4256;
mov.f32 %f4259, 0fA7C234C5;
fma.rn.f32 %f5773, %f4254, %f4259, %f4258;
abs.f32 %f1548, %f1419;
setp.ltu.f32 %p1131, %f1548, 0f47CE4780;
@%p1131 bra $L__BB0_1334;
setp.eq.f32 %p1132, %f1548, 0f7F800000;
@%p1132 bra $L__BB0_1333;
bra.uni $L__BB0_1328;
$L__BB0_1333:
mov.f32 %f4262, 0f00000000;
mul.rn.f32 %f5773, %f1419, %f4262;
mov.u32 %r8330, 0;
bra.uni $L__BB0_1334;
$L__BB0_1328:
mov.b32 %r1762, %f1419;
shr.u32 %r6188, %r1762, 23;
and.b32 %r6189, %r6188, 255;
add.s32 %r1763, %r6189, -128;
shl.b32 %r6190, %r1762, 8;
or.b32 %r1764, %r6190, -2147483648;
shr.u32 %r1765, %r1763, 5;
mov.u64 %rd2685, 0;
mov.u32 %r8327, 0;
mov.u64 %rd2030, __cudart_i2opi_f;
mov.u64 %rd2686, %rd2685;
$L__BB0_1329:
.pragma "nounroll";
shl.b64 %rd2029, %rd2685, 2;
add.s64 %rd2031, %rd2030, %rd2029;
ld.global.nc.u32 %r6191, [%rd2031];
mad.wide.u32 %rd2032, %r6191, %r1764, %rd2686;
shr.u64 %rd2686, %rd2032, 32;
add.s64 %rd2033, %rd1, %rd2029;
st.local.u32 [%rd2033], %rd2032;
add.s32 %r8327, %r8327, 1;
cvt.s64.s32 %rd2685, %r8327;
setp.ne.s32 %p1133, %r8327, 6;
@%p1133 bra $L__BB0_1329;
st.local.u32 [%rd4], %rd2686;
mov.u32 %r6192, 4;
sub.s32 %r1768, %r6192, %r1765;
mov.u32 %r6193, 6;
sub.s32 %r6194, %r6193, %r1765;
mul.wide.s32 %rd2034, %r6194, 4;
add.s64 %rd2035, %rd1, %rd2034;
ld.local.u32 %r8328, [%rd2035];
ld.local.u32 %r8329, [%rd2035+-4];
and.b32 %r1771, %r1763, 31;
setp.eq.s32 %p1134, %r1771, 0;
@%p1134 bra $L__BB0_1332;
mov.u32 %r6195, 32;
sub.s32 %r6196, %r6195, %r1771;
shr.u32 %r6197, %r8329, %r6196;
shl.b32 %r6198, %r8328, %r1771;
add.s32 %r8328, %r6197, %r6198;
mul.wide.s32 %rd2036, %r1768, 4;
add.s64 %rd2037, %rd1, %rd2036;
ld.local.u32 %r6199, [%rd2037];
shr.u32 %r6200, %r6199, %r6196;
shl.b32 %r6201, %r8329, %r1771;
add.s32 %r8329, %r6200, %r6201;
$L__BB0_1332:
and.b32 %r6202, %r1762, -2147483648;
shr.u32 %r6203, %r8329, 30;
shl.b32 %r6204, %r8328, 2;
or.b32 %r6205, %r6203, %r6204;
shr.u32 %r6206, %r6205, 31;
shr.u32 %r6207, %r8328, 30;
add.s32 %r6208, %r6206, %r6207;
neg.s32 %r6209, %r6208;
setp.eq.s32 %p1135, %r6202, 0;
selp.b32 %r8330, %r6208, %r6209, %p1135;
setp.ne.s32 %p1136, %r6206, 0;
xor.b32 %r6210, %r6202, -2147483648;
selp.b32 %r6211, %r6210, %r6202, %p1136;
selp.b32 %r6212, -1, 0, %p1136;
xor.b32 %r6213, %r6205, %r6212;
shl.b32 %r6214, %r8329, 2;
xor.b32 %r6215, %r6214, %r6212;
cvt.u64.u32 %rd2038, %r6213;
cvt.u64.u32 %rd2039, %r6215;
bfi.b64 %rd2040, %rd2038, %rd2039, 32, 32;
cvt.rn.f64.s64 %fd181, %rd2040;
mul.f64 %fd182, %fd181, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4260, %fd182;
setp.eq.s32 %p1137, %r6211, 0;
neg.f32 %f4261, %f4260;
selp.f32 %f5773, %f4260, %f4261, %p1137;
$L__BB0_1334:
and.b32 %r1778, %r8330, 1;
setp.eq.s32 %p1138, %r1778, 0;
selp.f32 %f1552, %f5773, 0f3F800000, %p1138;
mul.rn.f32 %f1553, %f5773, %f5773;
mov.f32 %f5774, 0fB94D4153;
@%p1138 bra $L__BB0_1336;
mov.f32 %f4264, 0fBAB607ED;
mov.f32 %f4265, 0f37CBAC00;
fma.rn.f32 %f5774, %f4265, %f1553, %f4264;
$L__BB0_1336:
selp.f32 %f4266, 0f3C0885E4, 0f3D2AAABB, %p1138;
fma.rn.f32 %f4267, %f5774, %f1553, %f4266;
selp.f32 %f4268, 0fBE2AAAA8, 0fBEFFFFFF, %p1138;
fma.rn.f32 %f4269, %f4267, %f1553, %f4268;
mov.f32 %f4270, 0f00000000;
fma.rn.f32 %f4271, %f1553, %f1552, %f4270;
fma.rn.f32 %f5775, %f4269, %f4271, %f1552;
and.b32 %r6217, %r8330, 2;
setp.eq.s32 %p1140, %r6217, 0;
@%p1140 bra $L__BB0_1338;
mov.f32 %f4273, 0fBF800000;
fma.rn.f32 %f5775, %f5775, %f4273, %f4270;
$L__BB0_1338:
mul.f32 %f4274, %f1411, 0f3F22F983;
cvt.rni.s32.f32 %r8334, %f4274;
cvt.rn.f32.s32 %f4275, %r8334;
mov.f32 %f4276, 0fBFC90FDA;
fma.rn.f32 %f4277, %f4275, %f4276, %f1411;
mov.f32 %f4278, 0fB3A22168;
fma.rn.f32 %f4279, %f4275, %f4278, %f4277;
mov.f32 %f4280, 0fA7C234C5;
fma.rn.f32 %f5776, %f4275, %f4280, %f4279;
abs.f32 %f1560, %f1411;
setp.ltu.f32 %p1141, %f1560, 0f47CE4780;
@%p1141 bra $L__BB0_1346;
setp.eq.f32 %p1142, %f1560, 0f7F800000;
@%p1142 bra $L__BB0_1345;
bra.uni $L__BB0_1340;
$L__BB0_1345:
mov.f32 %f4283, 0f00000000;
mul.rn.f32 %f5776, %f1411, %f4283;
mov.u32 %r8334, 0;
bra.uni $L__BB0_1346;
$L__BB0_1340:
mov.b32 %r1780, %f1411;
shr.u32 %r6219, %r1780, 23;
and.b32 %r6220, %r6219, 255;
add.s32 %r1781, %r6220, -128;
shl.b32 %r6221, %r1780, 8;
or.b32 %r1782, %r6221, -2147483648;
shr.u32 %r1783, %r1781, 5;
mov.u64 %rd2687, 0;
mov.u32 %r8331, 0;
mov.u64 %rd2044, __cudart_i2opi_f;
mov.u64 %rd2688, %rd2687;
$L__BB0_1341:
.pragma "nounroll";
shl.b64 %rd2043, %rd2687, 2;
add.s64 %rd2045, %rd2044, %rd2043;
ld.global.nc.u32 %r6222, [%rd2045];
mad.wide.u32 %rd2046, %r6222, %r1782, %rd2688;
shr.u64 %rd2688, %rd2046, 32;
add.s64 %rd2047, %rd1, %rd2043;
st.local.u32 [%rd2047], %rd2046;
add.s32 %r8331, %r8331, 1;
cvt.s64.s32 %rd2687, %r8331;
setp.ne.s32 %p1143, %r8331, 6;
@%p1143 bra $L__BB0_1341;
st.local.u32 [%rd4], %rd2688;
mov.u32 %r6223, 4;
sub.s32 %r1786, %r6223, %r1783;
mov.u32 %r6224, 6;
sub.s32 %r6225, %r6224, %r1783;
mul.wide.s32 %rd2048, %r6225, 4;
add.s64 %rd2049, %rd1, %rd2048;
ld.local.u32 %r8332, [%rd2049];
ld.local.u32 %r8333, [%rd2049+-4];
and.b32 %r1789, %r1781, 31;
setp.eq.s32 %p1144, %r1789, 0;
@%p1144 bra $L__BB0_1344;
mov.u32 %r6226, 32;
sub.s32 %r6227, %r6226, %r1789;
shr.u32 %r6228, %r8333, %r6227;
shl.b32 %r6229, %r8332, %r1789;
add.s32 %r8332, %r6228, %r6229;
mul.wide.s32 %rd2050, %r1786, 4;
add.s64 %rd2051, %rd1, %rd2050;
ld.local.u32 %r6230, [%rd2051];
shr.u32 %r6231, %r6230, %r6227;
shl.b32 %r6232, %r8333, %r1789;
add.s32 %r8333, %r6231, %r6232;
$L__BB0_1344:
and.b32 %r6233, %r1780, -2147483648;
shr.u32 %r6234, %r8333, 30;
shl.b32 %r6235, %r8332, 2;
or.b32 %r6236, %r6234, %r6235;
shr.u32 %r6237, %r6236, 31;
shr.u32 %r6238, %r8332, 30;
add.s32 %r6239, %r6237, %r6238;
neg.s32 %r6240, %r6239;
setp.eq.s32 %p1145, %r6233, 0;
selp.b32 %r8334, %r6239, %r6240, %p1145;
setp.ne.s32 %p1146, %r6237, 0;
xor.b32 %r6241, %r6233, -2147483648;
selp.b32 %r6242, %r6241, %r6233, %p1146;
selp.b32 %r6243, -1, 0, %p1146;
xor.b32 %r6244, %r6236, %r6243;
shl.b32 %r6245, %r8333, 2;
xor.b32 %r6246, %r6245, %r6243;
cvt.u64.u32 %rd2052, %r6244;
cvt.u64.u32 %rd2053, %r6246;
bfi.b64 %rd2054, %rd2052, %rd2053, 32, 32;
cvt.rn.f64.s64 %fd183, %rd2054;
mul.f64 %fd184, %fd183, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4281, %fd184;
setp.eq.s32 %p1147, %r6242, 0;
neg.f32 %f4282, %f4281;
selp.f32 %f5776, %f4281, %f4282, %p1147;
$L__BB0_1346:
add.s32 %r1796, %r8334, 1;
and.b32 %r1797, %r1796, 1;
setp.eq.s32 %p1148, %r1797, 0;
selp.f32 %f1564, %f5776, 0f3F800000, %p1148;
mul.rn.f32 %f1565, %f5776, %f5776;
mov.f32 %f5777, 0fB94D4153;
@%p1148 bra $L__BB0_1348;
mov.f32 %f4285, 0fBAB607ED;
mov.f32 %f4286, 0f37CBAC00;
fma.rn.f32 %f5777, %f4286, %f1565, %f4285;
$L__BB0_1348:
selp.f32 %f4287, 0f3C0885E4, 0f3D2AAABB, %p1148;
fma.rn.f32 %f4288, %f5777, %f1565, %f4287;
selp.f32 %f4289, 0fBE2AAAA8, 0fBEFFFFFF, %p1148;
fma.rn.f32 %f4290, %f4288, %f1565, %f4289;
mov.f32 %f4291, 0f00000000;
fma.rn.f32 %f4292, %f1565, %f1564, %f4291;
fma.rn.f32 %f5778, %f4290, %f4292, %f1564;
and.b32 %r6248, %r1796, 2;
setp.eq.s32 %p1150, %r6248, 0;
@%p1150 bra $L__BB0_1350;
mov.f32 %f4294, 0fBF800000;
fma.rn.f32 %f5778, %f5778, %f4294, %f4291;
$L__BB0_1350:
add.f32 %f5793, %f5775, %f5778;
mul.f32 %f4295, %f1420, 0f3F22F983;
cvt.rni.s32.f32 %r8338, %f4295;
cvt.rn.f32.s32 %f4296, %r8338;
mov.f32 %f4297, 0fBFC90FDA;
fma.rn.f32 %f4298, %f4296, %f4297, %f1420;
mov.f32 %f4299, 0fB3A22168;
fma.rn.f32 %f4300, %f4296, %f4299, %f4298;
mov.f32 %f4301, 0fA7C234C5;
fma.rn.f32 %f5779, %f4296, %f4301, %f4300;
abs.f32 %f1573, %f1420;
setp.ltu.f32 %p1151, %f1573, 0f47CE4780;
@%p1151 bra $L__BB0_1358;
setp.eq.f32 %p1152, %f1573, 0f7F800000;
@%p1152 bra $L__BB0_1357;
bra.uni $L__BB0_1352;
$L__BB0_1357:
mov.f32 %f4304, 0f00000000;
mul.rn.f32 %f5779, %f1420, %f4304;
mov.u32 %r8338, 0;
bra.uni $L__BB0_1358;
$L__BB0_1352:
mov.b32 %r1799, %f1420;
shr.u32 %r6250, %r1799, 23;
and.b32 %r6251, %r6250, 255;
add.s32 %r1800, %r6251, -128;
shl.b32 %r6252, %r1799, 8;
or.b32 %r1801, %r6252, -2147483648;
shr.u32 %r1802, %r1800, 5;
mov.u64 %rd2689, 0;
mov.u32 %r8335, 0;
mov.u64 %rd2058, __cudart_i2opi_f;
mov.u64 %rd2690, %rd2689;
$L__BB0_1353:
.pragma "nounroll";
shl.b64 %rd2057, %rd2689, 2;
add.s64 %rd2059, %rd2058, %rd2057;
ld.global.nc.u32 %r6253, [%rd2059];
mad.wide.u32 %rd2060, %r6253, %r1801, %rd2690;
shr.u64 %rd2690, %rd2060, 32;
add.s64 %rd2061, %rd1, %rd2057;
st.local.u32 [%rd2061], %rd2060;
add.s32 %r8335, %r8335, 1;
cvt.s64.s32 %rd2689, %r8335;
setp.ne.s32 %p1153, %r8335, 6;
@%p1153 bra $L__BB0_1353;
st.local.u32 [%rd4], %rd2690;
mov.u32 %r6254, 4;
sub.s32 %r1805, %r6254, %r1802;
mov.u32 %r6255, 6;
sub.s32 %r6256, %r6255, %r1802;
mul.wide.s32 %rd2062, %r6256, 4;
add.s64 %rd2063, %rd1, %rd2062;
ld.local.u32 %r8336, [%rd2063];
ld.local.u32 %r8337, [%rd2063+-4];
and.b32 %r1808, %r1800, 31;
setp.eq.s32 %p1154, %r1808, 0;
@%p1154 bra $L__BB0_1356;
mov.u32 %r6257, 32;
sub.s32 %r6258, %r6257, %r1808;
shr.u32 %r6259, %r8337, %r6258;
shl.b32 %r6260, %r8336, %r1808;
add.s32 %r8336, %r6259, %r6260;
mul.wide.s32 %rd2064, %r1805, 4;
add.s64 %rd2065, %rd1, %rd2064;
ld.local.u32 %r6261, [%rd2065];
shr.u32 %r6262, %r6261, %r6258;
shl.b32 %r6263, %r8337, %r1808;
add.s32 %r8337, %r6262, %r6263;
$L__BB0_1356:
and.b32 %r6264, %r1799, -2147483648;
shr.u32 %r6265, %r8337, 30;
shl.b32 %r6266, %r8336, 2;
or.b32 %r6267, %r6265, %r6266;
shr.u32 %r6268, %r6267, 31;
shr.u32 %r6269, %r8336, 30;
add.s32 %r6270, %r6268, %r6269;
neg.s32 %r6271, %r6270;
setp.eq.s32 %p1155, %r6264, 0;
selp.b32 %r8338, %r6270, %r6271, %p1155;
setp.ne.s32 %p1156, %r6268, 0;
xor.b32 %r6272, %r6264, -2147483648;
selp.b32 %r6273, %r6272, %r6264, %p1156;
selp.b32 %r6274, -1, 0, %p1156;
xor.b32 %r6275, %r6267, %r6274;
shl.b32 %r6276, %r8337, 2;
xor.b32 %r6277, %r6276, %r6274;
cvt.u64.u32 %rd2066, %r6275;
cvt.u64.u32 %rd2067, %r6277;
bfi.b64 %rd2068, %rd2066, %rd2067, 32, 32;
cvt.rn.f64.s64 %fd185, %rd2068;
mul.f64 %fd186, %fd185, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4302, %fd186;
setp.eq.s32 %p1157, %r6273, 0;
neg.f32 %f4303, %f4302;
selp.f32 %f5779, %f4302, %f4303, %p1157;
$L__BB0_1358:
and.b32 %r1815, %r8338, 1;
setp.eq.s32 %p1158, %r1815, 0;
selp.f32 %f1577, %f5779, 0f3F800000, %p1158;
mul.rn.f32 %f1578, %f5779, %f5779;
mov.f32 %f5780, 0fB94D4153;
@%p1158 bra $L__BB0_1360;
mov.f32 %f4306, 0fBAB607ED;
mov.f32 %f4307, 0f37CBAC00;
fma.rn.f32 %f5780, %f4307, %f1578, %f4306;
$L__BB0_1360:
selp.f32 %f4308, 0f3C0885E4, 0f3D2AAABB, %p1158;
fma.rn.f32 %f4309, %f5780, %f1578, %f4308;
selp.f32 %f4310, 0fBE2AAAA8, 0fBEFFFFFF, %p1158;
fma.rn.f32 %f4311, %f4309, %f1578, %f4310;
mov.f32 %f4312, 0f00000000;
fma.rn.f32 %f4313, %f1578, %f1577, %f4312;
fma.rn.f32 %f5781, %f4311, %f4313, %f1577;
and.b32 %r6279, %r8338, 2;
setp.eq.s32 %p1160, %r6279, 0;
@%p1160 bra $L__BB0_1362;
mov.f32 %f4315, 0fBF800000;
fma.rn.f32 %f5781, %f5781, %f4315, %f4312;
$L__BB0_1362:
mul.f32 %f4316, %f1412, 0f3F22F983;
cvt.rni.s32.f32 %r8342, %f4316;
cvt.rn.f32.s32 %f4317, %r8342;
mov.f32 %f4318, 0fBFC90FDA;
fma.rn.f32 %f4319, %f4317, %f4318, %f1412;
mov.f32 %f4320, 0fB3A22168;
fma.rn.f32 %f4321, %f4317, %f4320, %f4319;
mov.f32 %f4322, 0fA7C234C5;
fma.rn.f32 %f5782, %f4317, %f4322, %f4321;
abs.f32 %f1585, %f1412;
setp.ltu.f32 %p1161, %f1585, 0f47CE4780;
@%p1161 bra $L__BB0_1370;
setp.eq.f32 %p1162, %f1585, 0f7F800000;
@%p1162 bra $L__BB0_1369;
bra.uni $L__BB0_1364;
$L__BB0_1369:
mov.f32 %f4325, 0f00000000;
mul.rn.f32 %f5782, %f1412, %f4325;
mov.u32 %r8342, 0;
bra.uni $L__BB0_1370;
$L__BB0_1364:
mov.b32 %r1817, %f1412;
shr.u32 %r6281, %r1817, 23;
and.b32 %r6282, %r6281, 255;
add.s32 %r1818, %r6282, -128;
shl.b32 %r6283, %r1817, 8;
or.b32 %r1819, %r6283, -2147483648;
shr.u32 %r1820, %r1818, 5;
mov.u64 %rd2691, 0;
mov.u32 %r8339, 0;
mov.u64 %rd2072, __cudart_i2opi_f;
mov.u64 %rd2692, %rd2691;
$L__BB0_1365:
.pragma "nounroll";
shl.b64 %rd2071, %rd2691, 2;
add.s64 %rd2073, %rd2072, %rd2071;
ld.global.nc.u32 %r6284, [%rd2073];
mad.wide.u32 %rd2074, %r6284, %r1819, %rd2692;
shr.u64 %rd2692, %rd2074, 32;
add.s64 %rd2075, %rd1, %rd2071;
st.local.u32 [%rd2075], %rd2074;
add.s32 %r8339, %r8339, 1;
cvt.s64.s32 %rd2691, %r8339;
setp.ne.s32 %p1163, %r8339, 6;
@%p1163 bra $L__BB0_1365;
st.local.u32 [%rd4], %rd2692;
mov.u32 %r6285, 4;
sub.s32 %r1823, %r6285, %r1820;
mov.u32 %r6286, 6;
sub.s32 %r6287, %r6286, %r1820;
mul.wide.s32 %rd2076, %r6287, 4;
add.s64 %rd2077, %rd1, %rd2076;
ld.local.u32 %r8340, [%rd2077];
ld.local.u32 %r8341, [%rd2077+-4];
and.b32 %r1826, %r1818, 31;
setp.eq.s32 %p1164, %r1826, 0;
@%p1164 bra $L__BB0_1368;
mov.u32 %r6288, 32;
sub.s32 %r6289, %r6288, %r1826;
shr.u32 %r6290, %r8341, %r6289;
shl.b32 %r6291, %r8340, %r1826;
add.s32 %r8340, %r6290, %r6291;
mul.wide.s32 %rd2078, %r1823, 4;
add.s64 %rd2079, %rd1, %rd2078;
ld.local.u32 %r6292, [%rd2079];
shr.u32 %r6293, %r6292, %r6289;
shl.b32 %r6294, %r8341, %r1826;
add.s32 %r8341, %r6293, %r6294;
$L__BB0_1368:
and.b32 %r6295, %r1817, -2147483648;
shr.u32 %r6296, %r8341, 30;
shl.b32 %r6297, %r8340, 2;
or.b32 %r6298, %r6296, %r6297;
shr.u32 %r6299, %r6298, 31;
shr.u32 %r6300, %r8340, 30;
add.s32 %r6301, %r6299, %r6300;
neg.s32 %r6302, %r6301;
setp.eq.s32 %p1165, %r6295, 0;
selp.b32 %r8342, %r6301, %r6302, %p1165;
setp.ne.s32 %p1166, %r6299, 0;
xor.b32 %r6303, %r6295, -2147483648;
selp.b32 %r6304, %r6303, %r6295, %p1166;
selp.b32 %r6305, -1, 0, %p1166;
xor.b32 %r6306, %r6298, %r6305;
shl.b32 %r6307, %r8341, 2;
xor.b32 %r6308, %r6307, %r6305;
cvt.u64.u32 %rd2080, %r6306;
cvt.u64.u32 %rd2081, %r6308;
bfi.b64 %rd2082, %rd2080, %rd2081, 32, 32;
cvt.rn.f64.s64 %fd187, %rd2082;
mul.f64 %fd188, %fd187, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4323, %fd188;
setp.eq.s32 %p1167, %r6304, 0;
neg.f32 %f4324, %f4323;
selp.f32 %f5782, %f4323, %f4324, %p1167;
$L__BB0_1370:
add.s32 %r1833, %r8342, 1;
and.b32 %r1834, %r1833, 1;
setp.eq.s32 %p1168, %r1834, 0;
selp.f32 %f1589, %f5782, 0f3F800000, %p1168;
mul.rn.f32 %f1590, %f5782, %f5782;
mov.f32 %f5783, 0fB94D4153;
@%p1168 bra $L__BB0_1372;
mov.f32 %f4327, 0fBAB607ED;
mov.f32 %f4328, 0f37CBAC00;
fma.rn.f32 %f5783, %f4328, %f1590, %f4327;
$L__BB0_1372:
selp.f32 %f4329, 0f3C0885E4, 0f3D2AAABB, %p1168;
fma.rn.f32 %f4330, %f5783, %f1590, %f4329;
selp.f32 %f4331, 0fBE2AAAA8, 0fBEFFFFFF, %p1168;
fma.rn.f32 %f4332, %f4330, %f1590, %f4331;
mov.f32 %f4333, 0f00000000;
fma.rn.f32 %f4334, %f1590, %f1589, %f4333;
fma.rn.f32 %f5784, %f4332, %f4334, %f1589;
and.b32 %r6310, %r1833, 2;
setp.eq.s32 %p1170, %r6310, 0;
@%p1170 bra $L__BB0_1374;
mov.f32 %f4336, 0fBF800000;
fma.rn.f32 %f5784, %f5784, %f4336, %f4333;
$L__BB0_1374:
add.f32 %f5792, %f5781, %f5784;
mul.f32 %f4337, %f1421, 0f3F22F983;
cvt.rni.s32.f32 %r8346, %f4337;
cvt.rn.f32.s32 %f4338, %r8346;
mov.f32 %f4339, 0fBFC90FDA;
fma.rn.f32 %f4340, %f4338, %f4339, %f1421;
mov.f32 %f4341, 0fB3A22168;
fma.rn.f32 %f4342, %f4338, %f4341, %f4340;
mov.f32 %f4343, 0fA7C234C5;
fma.rn.f32 %f5785, %f4338, %f4343, %f4342;
abs.f32 %f1598, %f1421;
setp.ltu.f32 %p1171, %f1598, 0f47CE4780;
@%p1171 bra $L__BB0_1382;
setp.eq.f32 %p1172, %f1598, 0f7F800000;
@%p1172 bra $L__BB0_1381;
bra.uni $L__BB0_1376;
$L__BB0_1381:
mov.f32 %f4346, 0f00000000;
mul.rn.f32 %f5785, %f1421, %f4346;
mov.u32 %r8346, 0;
bra.uni $L__BB0_1382;
$L__BB0_1376:
mov.b32 %r1836, %f1421;
shr.u32 %r6312, %r1836, 23;
and.b32 %r6313, %r6312, 255;
add.s32 %r1837, %r6313, -128;
shl.b32 %r6314, %r1836, 8;
or.b32 %r1838, %r6314, -2147483648;
shr.u32 %r1839, %r1837, 5;
mov.u64 %rd2693, 0;
mov.u32 %r8343, 0;
mov.u64 %rd2086, __cudart_i2opi_f;
mov.u64 %rd2694, %rd2693;
$L__BB0_1377:
.pragma "nounroll";
shl.b64 %rd2085, %rd2693, 2;
add.s64 %rd2087, %rd2086, %rd2085;
ld.global.nc.u32 %r6315, [%rd2087];
mad.wide.u32 %rd2088, %r6315, %r1838, %rd2694;
shr.u64 %rd2694, %rd2088, 32;
add.s64 %rd2089, %rd1, %rd2085;
st.local.u32 [%rd2089], %rd2088;
add.s32 %r8343, %r8343, 1;
cvt.s64.s32 %rd2693, %r8343;
setp.ne.s32 %p1173, %r8343, 6;
@%p1173 bra $L__BB0_1377;
st.local.u32 [%rd4], %rd2694;
mov.u32 %r6316, 4;
sub.s32 %r1842, %r6316, %r1839;
mov.u32 %r6317, 6;
sub.s32 %r6318, %r6317, %r1839;
mul.wide.s32 %rd2090, %r6318, 4;
add.s64 %rd2091, %rd1, %rd2090;
ld.local.u32 %r8344, [%rd2091];
ld.local.u32 %r8345, [%rd2091+-4];
and.b32 %r1845, %r1837, 31;
setp.eq.s32 %p1174, %r1845, 0;
@%p1174 bra $L__BB0_1380;
mov.u32 %r6319, 32;
sub.s32 %r6320, %r6319, %r1845;
shr.u32 %r6321, %r8345, %r6320;
shl.b32 %r6322, %r8344, %r1845;
add.s32 %r8344, %r6321, %r6322;
mul.wide.s32 %rd2092, %r1842, 4;
add.s64 %rd2093, %rd1, %rd2092;
ld.local.u32 %r6323, [%rd2093];
shr.u32 %r6324, %r6323, %r6320;
shl.b32 %r6325, %r8345, %r1845;
add.s32 %r8345, %r6324, %r6325;
$L__BB0_1380:
and.b32 %r6326, %r1836, -2147483648;
shr.u32 %r6327, %r8345, 30;
shl.b32 %r6328, %r8344, 2;
or.b32 %r6329, %r6327, %r6328;
shr.u32 %r6330, %r6329, 31;
shr.u32 %r6331, %r8344, 30;
add.s32 %r6332, %r6330, %r6331;
neg.s32 %r6333, %r6332;
setp.eq.s32 %p1175, %r6326, 0;
selp.b32 %r8346, %r6332, %r6333, %p1175;
setp.ne.s32 %p1176, %r6330, 0;
xor.b32 %r6334, %r6326, -2147483648;
selp.b32 %r6335, %r6334, %r6326, %p1176;
selp.b32 %r6336, -1, 0, %p1176;
xor.b32 %r6337, %r6329, %r6336;
shl.b32 %r6338, %r8345, 2;
xor.b32 %r6339, %r6338, %r6336;
cvt.u64.u32 %rd2094, %r6337;
cvt.u64.u32 %rd2095, %r6339;
bfi.b64 %rd2096, %rd2094, %rd2095, 32, 32;
cvt.rn.f64.s64 %fd189, %rd2096;
mul.f64 %fd190, %fd189, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4344, %fd190;
setp.eq.s32 %p1177, %r6335, 0;
neg.f32 %f4345, %f4344;
selp.f32 %f5785, %f4344, %f4345, %p1177;
$L__BB0_1382:
and.b32 %r1852, %r8346, 1;
setp.eq.s32 %p1178, %r1852, 0;
selp.f32 %f1602, %f5785, 0f3F800000, %p1178;
mul.rn.f32 %f1603, %f5785, %f5785;
mov.f32 %f5786, 0fB94D4153;
@%p1178 bra $L__BB0_1384;
mov.f32 %f4348, 0fBAB607ED;
mov.f32 %f4349, 0f37CBAC00;
fma.rn.f32 %f5786, %f4349, %f1603, %f4348;
$L__BB0_1384:
selp.f32 %f4350, 0f3C0885E4, 0f3D2AAABB, %p1178;
fma.rn.f32 %f4351, %f5786, %f1603, %f4350;
selp.f32 %f4352, 0fBE2AAAA8, 0fBEFFFFFF, %p1178;
fma.rn.f32 %f4353, %f4351, %f1603, %f4352;
mov.f32 %f4354, 0f00000000;
fma.rn.f32 %f4355, %f1603, %f1602, %f4354;
fma.rn.f32 %f5787, %f4353, %f4355, %f1602;
and.b32 %r6341, %r8346, 2;
setp.eq.s32 %p1180, %r6341, 0;
@%p1180 bra $L__BB0_1386;
mov.f32 %f4357, 0fBF800000;
fma.rn.f32 %f5787, %f5787, %f4357, %f4354;
$L__BB0_1386:
mul.f32 %f4358, %f1413, 0f3F22F983;
cvt.rni.s32.f32 %r8350, %f4358;
cvt.rn.f32.s32 %f4359, %r8350;
mov.f32 %f4360, 0fBFC90FDA;
fma.rn.f32 %f4361, %f4359, %f4360, %f1413;
mov.f32 %f4362, 0fB3A22168;
fma.rn.f32 %f4363, %f4359, %f4362, %f4361;
mov.f32 %f4364, 0fA7C234C5;
fma.rn.f32 %f5788, %f4359, %f4364, %f4363;
abs.f32 %f1610, %f1413;
setp.ltu.f32 %p1181, %f1610, 0f47CE4780;
@%p1181 bra $L__BB0_1394;
setp.eq.f32 %p1182, %f1610, 0f7F800000;
@%p1182 bra $L__BB0_1393;
bra.uni $L__BB0_1388;
$L__BB0_1393:
mov.f32 %f4367, 0f00000000;
mul.rn.f32 %f5788, %f1413, %f4367;
mov.u32 %r8350, 0;
bra.uni $L__BB0_1394;
$L__BB0_1388:
mov.b32 %r1854, %f1413;
shr.u32 %r6343, %r1854, 23;
and.b32 %r6344, %r6343, 255;
add.s32 %r1855, %r6344, -128;
shl.b32 %r6345, %r1854, 8;
or.b32 %r1856, %r6345, -2147483648;
shr.u32 %r1857, %r1855, 5;
mov.u64 %rd2695, 0;
mov.u32 %r8347, 0;
mov.u64 %rd2100, __cudart_i2opi_f;
mov.u64 %rd2696, %rd2695;
$L__BB0_1389:
.pragma "nounroll";
shl.b64 %rd2099, %rd2695, 2;
add.s64 %rd2101, %rd2100, %rd2099;
ld.global.nc.u32 %r6346, [%rd2101];
mad.wide.u32 %rd2102, %r6346, %r1856, %rd2696;
shr.u64 %rd2696, %rd2102, 32;
add.s64 %rd2103, %rd1, %rd2099;
st.local.u32 [%rd2103], %rd2102;
add.s32 %r8347, %r8347, 1;
cvt.s64.s32 %rd2695, %r8347;
setp.ne.s32 %p1183, %r8347, 6;
@%p1183 bra $L__BB0_1389;
st.local.u32 [%rd4], %rd2696;
mov.u32 %r6347, 4;
sub.s32 %r1860, %r6347, %r1857;
mov.u32 %r6348, 6;
sub.s32 %r6349, %r6348, %r1857;
mul.wide.s32 %rd2104, %r6349, 4;
add.s64 %rd2105, %rd1, %rd2104;
ld.local.u32 %r8348, [%rd2105];
ld.local.u32 %r8349, [%rd2105+-4];
and.b32 %r1863, %r1855, 31;
setp.eq.s32 %p1184, %r1863, 0;
@%p1184 bra $L__BB0_1392;
mov.u32 %r6350, 32;
sub.s32 %r6351, %r6350, %r1863;
shr.u32 %r6352, %r8349, %r6351;
shl.b32 %r6353, %r8348, %r1863;
add.s32 %r8348, %r6352, %r6353;
mul.wide.s32 %rd2106, %r1860, 4;
add.s64 %rd2107, %rd1, %rd2106;
ld.local.u32 %r6354, [%rd2107];
shr.u32 %r6355, %r6354, %r6351;
shl.b32 %r6356, %r8349, %r1863;
add.s32 %r8349, %r6355, %r6356;
$L__BB0_1392:
and.b32 %r6357, %r1854, -2147483648;
shr.u32 %r6358, %r8349, 30;
shl.b32 %r6359, %r8348, 2;
or.b32 %r6360, %r6358, %r6359;
shr.u32 %r6361, %r6360, 31;
shr.u32 %r6362, %r8348, 30;
add.s32 %r6363, %r6361, %r6362;
neg.s32 %r6364, %r6363;
setp.eq.s32 %p1185, %r6357, 0;
selp.b32 %r8350, %r6363, %r6364, %p1185;
setp.ne.s32 %p1186, %r6361, 0;
xor.b32 %r6365, %r6357, -2147483648;
selp.b32 %r6366, %r6365, %r6357, %p1186;
selp.b32 %r6367, -1, 0, %p1186;
xor.b32 %r6368, %r6360, %r6367;
shl.b32 %r6369, %r8349, 2;
xor.b32 %r6370, %r6369, %r6367;
cvt.u64.u32 %rd2108, %r6368;
cvt.u64.u32 %rd2109, %r6370;
bfi.b64 %rd2110, %rd2108, %rd2109, 32, 32;
cvt.rn.f64.s64 %fd191, %rd2110;
mul.f64 %fd192, %fd191, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4365, %fd192;
setp.eq.s32 %p1187, %r6366, 0;
neg.f32 %f4366, %f4365;
selp.f32 %f5788, %f4365, %f4366, %p1187;
$L__BB0_1394:
add.s32 %r1870, %r8350, 1;
and.b32 %r1871, %r1870, 1;
setp.eq.s32 %p1188, %r1871, 0;
selp.f32 %f1614, %f5788, 0f3F800000, %p1188;
mul.rn.f32 %f1615, %f5788, %f5788;
mov.f32 %f5789, 0fB94D4153;
@%p1188 bra $L__BB0_1396;
mov.f32 %f4369, 0fBAB607ED;
mov.f32 %f4370, 0f37CBAC00;
fma.rn.f32 %f5789, %f4370, %f1615, %f4369;
$L__BB0_1396:
selp.f32 %f4371, 0f3C0885E4, 0f3D2AAABB, %p1188;
fma.rn.f32 %f4372, %f5789, %f1615, %f4371;
selp.f32 %f4373, 0fBE2AAAA8, 0fBEFFFFFF, %p1188;
fma.rn.f32 %f4374, %f4372, %f1615, %f4373;
mov.f32 %f4375, 0f00000000;
fma.rn.f32 %f4376, %f1615, %f1614, %f4375;
fma.rn.f32 %f5790, %f4374, %f4376, %f1614;
and.b32 %r6372, %r1870, 2;
setp.eq.s32 %p1190, %r6372, 0;
@%p1190 bra $L__BB0_1398;
mov.f32 %f4378, 0fBF800000;
fma.rn.f32 %f5790, %f5790, %f4378, %f4375;
$L__BB0_1398:
add.f32 %f5791, %f5787, %f5790;
bra.uni $L__BB0_1399;
$L__BB0_978:
mov.b32 %r1274, %f5416;
shr.u32 %r5299, %r1274, 23;
and.b32 %r5300, %r5299, 255;
add.s32 %r1275, %r5300, -128;
shl.b32 %r5301, %r1274, 8;
or.b32 %r1276, %r5301, -2147483648;
shr.u32 %r1277, %r1275, 5;
mov.u64 %rd2633, 0;
mov.u32 %r8223, 0;
mov.u64 %rd1637, __cudart_i2opi_f;
mov.u64 %rd2634, %rd2633;
$L__BB0_979:
.pragma "nounroll";
shl.b64 %rd1636, %rd2633, 2;
add.s64 %rd1638, %rd1637, %rd1636;
ld.global.nc.u32 %r5302, [%rd1638];
mad.wide.u32 %rd1639, %r5302, %r1276, %rd2634;
shr.u64 %rd2634, %rd1639, 32;
add.s64 %rd1640, %rd1, %rd1636;
st.local.u32 [%rd1640], %rd1639;
add.s32 %r8223, %r8223, 1;
cvt.s64.s32 %rd2633, %r8223;
setp.ne.s32 %p835, %r8223, 6;
@%p835 bra $L__BB0_979;
st.local.u32 [%rd4], %rd2634;
mov.u32 %r5303, 4;
sub.s32 %r1280, %r5303, %r1277;
mov.u32 %r5304, 6;
sub.s32 %r5305, %r5304, %r1277;
mul.wide.s32 %rd1641, %r5305, 4;
add.s64 %rd1642, %rd1, %rd1641;
ld.local.u32 %r8224, [%rd1642];
ld.local.u32 %r8225, [%rd1642+-4];
and.b32 %r1283, %r1275, 31;
setp.eq.s32 %p836, %r1283, 0;
@%p836 bra $L__BB0_982;
mov.u32 %r5306, 32;
sub.s32 %r5307, %r5306, %r1283;
shr.u32 %r5308, %r8225, %r5307;
shl.b32 %r5309, %r8224, %r1283;
add.s32 %r8224, %r5308, %r5309;
mul.wide.s32 %rd1643, %r1280, 4;
add.s64 %rd1644, %rd1, %rd1643;
ld.local.u32 %r5310, [%rd1644];
shr.u32 %r5311, %r5310, %r5307;
shl.b32 %r5312, %r8225, %r1283;
add.s32 %r8225, %r5311, %r5312;
$L__BB0_982:
and.b32 %r5313, %r1274, -2147483648;
shr.u32 %r5314, %r8225, 30;
shl.b32 %r5315, %r8224, 2;
or.b32 %r5316, %r5314, %r5315;
shr.u32 %r5317, %r5316, 31;
shr.u32 %r5318, %r8224, 30;
add.s32 %r5319, %r5317, %r5318;
neg.s32 %r5320, %r5319;
setp.eq.s32 %p837, %r5313, 0;
selp.b32 %r8226, %r5319, %r5320, %p837;
setp.ne.s32 %p838, %r5317, 0;
xor.b32 %r5321, %r5313, -2147483648;
selp.b32 %r5322, %r5321, %r5313, %p838;
selp.b32 %r5323, -1, 0, %p838;
xor.b32 %r5324, %r5316, %r5323;
shl.b32 %r5325, %r8225, 2;
xor.b32 %r5326, %r5325, %r5323;
cvt.u64.u32 %rd1645, %r5324;
cvt.u64.u32 %rd1646, %r5326;
bfi.b64 %rd1647, %rd1645, %rd1646, 32, 32;
cvt.rn.f64.s64 %fd129, %rd1647;
mul.f64 %fd130, %fd129, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3699, %fd130;
setp.eq.s32 %p839, %r5322, 0;
neg.f32 %f3700, %f3699;
selp.f32 %f5644, %f3699, %f3700, %p839;
$L__BB0_984:
and.b32 %r1290, %r8226, 1;
setp.eq.s32 %p840, %r1290, 0;
selp.f32 %f1144, %f5644, 0f3F800000, %p840;
mul.rn.f32 %f1145, %f5644, %f5644;
mov.f32 %f5645, 0fB94D4153;
@%p840 bra $L__BB0_986;
mov.f32 %f3703, 0fBAB607ED;
mov.f32 %f3704, 0f37CBAC00;
fma.rn.f32 %f5645, %f3704, %f1145, %f3703;
$L__BB0_986:
selp.f32 %f3705, 0f3C0885E4, 0f3D2AAABB, %p840;
fma.rn.f32 %f3706, %f5645, %f1145, %f3705;
selp.f32 %f3707, 0fBE2AAAA8, 0fBEFFFFFF, %p840;
fma.rn.f32 %f3708, %f3706, %f1145, %f3707;
mov.f32 %f3709, 0f00000000;
fma.rn.f32 %f3710, %f1145, %f1144, %f3709;
fma.rn.f32 %f5281, %f3708, %f3710, %f1144;
and.b32 %r5328, %r8226, 2;
setp.eq.s32 %p842, %r5328, 0;
@%p842 bra $L__BB0_988;
mov.f32 %f3712, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3712, %f3709;
$L__BB0_988:
setp.lt.s32 %p16, %r11, %r1272;
@%p832 bra $L__BB0_1001;
mul.f32 %f3713, %f5607, 0f3F22F983;
cvt.rni.s32.f32 %r8230, %f3713;
cvt.rn.f32.s32 %f3714, %r8230;
mov.f32 %f3715, 0fBFC90FDA;
fma.rn.f32 %f3716, %f3714, %f3715, %f5607;
mov.f32 %f3717, 0fB3A22168;
fma.rn.f32 %f3718, %f3714, %f3717, %f3716;
mov.f32 %f3719, 0fA7C234C5;
fma.rn.f32 %f5648, %f3714, %f3719, %f3718;
abs.f32 %f1153, %f5607;
setp.ltu.f32 %p844, %f1153, 0f47CE4780;
@%p844 bra $L__BB0_997;
setp.eq.f32 %p845, %f1153, 0f7F800000;
@%p845 bra $L__BB0_996;
bra.uni $L__BB0_991;
$L__BB0_996:
mov.f32 %f3722, 0f00000000;
mul.rn.f32 %f5648, %f5607, %f3722;
mov.u32 %r8230, 0;
bra.uni $L__BB0_997;
$L__BB0_991:
mov.b32 %r1292, %f5607;
shr.u32 %r5330, %r1292, 23;
and.b32 %r5331, %r5330, 255;
add.s32 %r1293, %r5331, -128;
shl.b32 %r5332, %r1292, 8;
or.b32 %r1294, %r5332, -2147483648;
shr.u32 %r1295, %r1293, 5;
mov.u64 %rd2635, 0;
mov.u32 %r8227, 0;
mov.u64 %rd1651, __cudart_i2opi_f;
mov.u64 %rd2636, %rd2635;
$L__BB0_992:
.pragma "nounroll";
shl.b64 %rd1650, %rd2635, 2;
add.s64 %rd1652, %rd1651, %rd1650;
ld.global.nc.u32 %r5333, [%rd1652];
mad.wide.u32 %rd1653, %r5333, %r1294, %rd2636;
shr.u64 %rd2636, %rd1653, 32;
add.s64 %rd1654, %rd1, %rd1650;
st.local.u32 [%rd1654], %rd1653;
add.s32 %r8227, %r8227, 1;
cvt.s64.s32 %rd2635, %r8227;
setp.ne.s32 %p846, %r8227, 6;
@%p846 bra $L__BB0_992;
st.local.u32 [%rd4], %rd2636;
mov.u32 %r5334, 4;
sub.s32 %r1298, %r5334, %r1295;
mov.u32 %r5335, 6;
sub.s32 %r5336, %r5335, %r1295;
mul.wide.s32 %rd1655, %r5336, 4;
add.s64 %rd1656, %rd1, %rd1655;
ld.local.u32 %r8228, [%rd1656];
ld.local.u32 %r8229, [%rd1656+-4];
and.b32 %r1301, %r1293, 31;
setp.eq.s32 %p847, %r1301, 0;
@%p847 bra $L__BB0_995;
mov.u32 %r5337, 32;
sub.s32 %r5338, %r5337, %r1301;
shr.u32 %r5339, %r8229, %r5338;
shl.b32 %r5340, %r8228, %r1301;
add.s32 %r8228, %r5339, %r5340;
mul.wide.s32 %rd1657, %r1298, 4;
add.s64 %rd1658, %rd1, %rd1657;
ld.local.u32 %r5341, [%rd1658];
shr.u32 %r5342, %r5341, %r5338;
shl.b32 %r5343, %r8229, %r1301;
add.s32 %r8229, %r5342, %r5343;
$L__BB0_995:
and.b32 %r5344, %r1292, -2147483648;
shr.u32 %r5345, %r8229, 30;
shl.b32 %r5346, %r8228, 2;
or.b32 %r5347, %r5345, %r5346;
shr.u32 %r5348, %r5347, 31;
shr.u32 %r5349, %r8228, 30;
add.s32 %r5350, %r5348, %r5349;
neg.s32 %r5351, %r5350;
setp.eq.s32 %p848, %r5344, 0;
selp.b32 %r8230, %r5350, %r5351, %p848;
setp.ne.s32 %p849, %r5348, 0;
xor.b32 %r5352, %r5344, -2147483648;
selp.b32 %r5353, %r5352, %r5344, %p849;
selp.b32 %r5354, -1, 0, %p849;
xor.b32 %r5355, %r5347, %r5354;
shl.b32 %r5356, %r8229, 2;
xor.b32 %r5357, %r5356, %r5354;
cvt.u64.u32 %rd1659, %r5355;
cvt.u64.u32 %rd1660, %r5357;
bfi.b64 %rd1661, %rd1659, %rd1660, 32, 32;
cvt.rn.f64.s64 %fd131, %rd1661;
mul.f64 %fd132, %fd131, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3720, %fd132;
setp.eq.s32 %p850, %r5353, 0;
neg.f32 %f3721, %f3720;
selp.f32 %f5648, %f3720, %f3721, %p850;
$L__BB0_997:
add.s32 %r1308, %r8230, 1;
and.b32 %r1309, %r1308, 1;
setp.eq.s32 %p851, %r1309, 0;
selp.f32 %f1157, %f5648, 0f3F800000, %p851;
mul.rn.f32 %f1158, %f5648, %f5648;
mov.f32 %f5649, 0fB94D4153;
@%p851 bra $L__BB0_999;
mov.f32 %f3724, 0fBAB607ED;
mov.f32 %f3725, 0f37CBAC00;
fma.rn.f32 %f5649, %f3725, %f1158, %f3724;
$L__BB0_999:
selp.f32 %f3726, 0f3C0885E4, 0f3D2AAABB, %p851;
fma.rn.f32 %f3727, %f5649, %f1158, %f3726;
selp.f32 %f3728, 0fBE2AAAA8, 0fBEFFFFFF, %p851;
fma.rn.f32 %f3729, %f3727, %f1158, %f3728;
mov.f32 %f3730, 0f00000000;
fma.rn.f32 %f3731, %f1158, %f1157, %f3730;
fma.rn.f32 %f5283, %f3729, %f3731, %f1157;
and.b32 %r5359, %r1308, 2;
setp.eq.s32 %p853, %r5359, 0;
@%p853 bra $L__BB0_1001;
mov.f32 %f3733, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3733, %f3730;
$L__BB0_1001:
selp.f32 %f1165, %f5283, %f5284, %p16;
selp.f32 %f1166, %f5281, %f5282, %p16;
@%p832 bra $L__BB0_1003;
add.f32 %f5798, %f1166, %f1165;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1003:
@%p807 bra $L__BB0_1032;
shl.b32 %r5360, %r12, 5;
mov.u32 %r5361, -32;
sub.s32 %r1310, %r5361, %r5360;
setp.ge.s32 %p857, %r11, %r1310;
@%p857 bra $L__BB0_1017;
mul.f32 %f3736, %f5415, 0f3F22F983;
cvt.rni.s32.f32 %r8234, %f3736;
cvt.rn.f32.s32 %f3737, %r8234;
mov.f32 %f3738, 0fBFC90FDA;
fma.rn.f32 %f3739, %f3737, %f3738, %f5415;
mov.f32 %f3740, 0fB3A22168;
fma.rn.f32 %f3741, %f3737, %f3740, %f3739;
mov.f32 %f3742, 0fA7C234C5;
fma.rn.f32 %f5657, %f3737, %f3742, %f3741;
abs.f32 %f1174, %f5415;
setp.ltu.f32 %p858, %f1174, 0f47CE4780;
@%p858 bra $L__BB0_1013;
setp.eq.f32 %p859, %f1174, 0f7F800000;
@%p859 bra $L__BB0_1012;
bra.uni $L__BB0_1007;
$L__BB0_1012:
mov.f32 %f3745, 0f00000000;
mul.rn.f32 %f5657, %f5415, %f3745;
mov.u32 %r8234, 0;
bra.uni $L__BB0_1013;
$L__BB0_1007:
mov.b32 %r1312, %f5415;
shr.u32 %r5363, %r1312, 23;
and.b32 %r5364, %r5363, 255;
add.s32 %r1313, %r5364, -128;
shl.b32 %r5365, %r1312, 8;
or.b32 %r1314, %r5365, -2147483648;
shr.u32 %r1315, %r1313, 5;
mov.u64 %rd2637, 0;
mov.u32 %r8231, 0;
mov.u64 %rd1665, __cudart_i2opi_f;
mov.u64 %rd2638, %rd2637;
$L__BB0_1008:
.pragma "nounroll";
shl.b64 %rd1664, %rd2637, 2;
add.s64 %rd1666, %rd1665, %rd1664;
ld.global.nc.u32 %r5366, [%rd1666];
mad.wide.u32 %rd1667, %r5366, %r1314, %rd2638;
shr.u64 %rd2638, %rd1667, 32;
add.s64 %rd1668, %rd1, %rd1664;
st.local.u32 [%rd1668], %rd1667;
add.s32 %r8231, %r8231, 1;
cvt.s64.s32 %rd2637, %r8231;
setp.ne.s32 %p860, %r8231, 6;
@%p860 bra $L__BB0_1008;
st.local.u32 [%rd4], %rd2638;
mov.u32 %r5367, 4;
sub.s32 %r1318, %r5367, %r1315;
mov.u32 %r5368, 6;
sub.s32 %r5369, %r5368, %r1315;
mul.wide.s32 %rd1669, %r5369, 4;
add.s64 %rd1670, %rd1, %rd1669;
ld.local.u32 %r8232, [%rd1670];
ld.local.u32 %r8233, [%rd1670+-4];
and.b32 %r1321, %r1313, 31;
setp.eq.s32 %p861, %r1321, 0;
@%p861 bra $L__BB0_1011;
mov.u32 %r5370, 32;
sub.s32 %r5371, %r5370, %r1321;
shr.u32 %r5372, %r8233, %r5371;
shl.b32 %r5373, %r8232, %r1321;
add.s32 %r8232, %r5372, %r5373;
mul.wide.s32 %rd1671, %r1318, 4;
add.s64 %rd1672, %rd1, %rd1671;
ld.local.u32 %r5374, [%rd1672];
shr.u32 %r5375, %r5374, %r5371;
shl.b32 %r5376, %r8233, %r1321;
add.s32 %r8233, %r5375, %r5376;
$L__BB0_1011:
and.b32 %r5377, %r1312, -2147483648;
shr.u32 %r5378, %r8233, 30;
shl.b32 %r5379, %r8232, 2;
or.b32 %r5380, %r5378, %r5379;
shr.u32 %r5381, %r5380, 31;
shr.u32 %r5382, %r8232, 30;
add.s32 %r5383, %r5381, %r5382;
neg.s32 %r5384, %r5383;
setp.eq.s32 %p862, %r5377, 0;
selp.b32 %r8234, %r5383, %r5384, %p862;
setp.ne.s32 %p863, %r5381, 0;
xor.b32 %r5385, %r5377, -2147483648;
selp.b32 %r5386, %r5385, %r5377, %p863;
selp.b32 %r5387, -1, 0, %p863;
xor.b32 %r5388, %r5380, %r5387;
shl.b32 %r5389, %r8233, 2;
xor.b32 %r5390, %r5389, %r5387;
cvt.u64.u32 %rd1673, %r5388;
cvt.u64.u32 %rd1674, %r5390;
bfi.b64 %rd1675, %rd1673, %rd1674, 32, 32;
cvt.rn.f64.s64 %fd133, %rd1675;
mul.f64 %fd134, %fd133, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3743, %fd134;
setp.eq.s32 %p864, %r5386, 0;
neg.f32 %f3744, %f3743;
selp.f32 %f5657, %f3743, %f3744, %p864;
$L__BB0_1013:
and.b32 %r1328, %r8234, 1;
setp.eq.s32 %p865, %r1328, 0;
selp.f32 %f1178, %f5657, 0f3F800000, %p865;
mul.rn.f32 %f1179, %f5657, %f5657;
mov.f32 %f5658, 0fB94D4153;
@%p865 bra $L__BB0_1015;
mov.f32 %f3747, 0fBAB607ED;
mov.f32 %f3748, 0f37CBAC00;
fma.rn.f32 %f5658, %f3748, %f1179, %f3747;
$L__BB0_1015:
selp.f32 %f3749, 0f3C0885E4, 0f3D2AAABB, %p865;
fma.rn.f32 %f3750, %f5658, %f1179, %f3749;
selp.f32 %f3751, 0fBE2AAAA8, 0fBEFFFFFF, %p865;
fma.rn.f32 %f3752, %f3750, %f1179, %f3751;
mov.f32 %f3753, 0f00000000;
fma.rn.f32 %f3754, %f1179, %f1178, %f3753;
fma.rn.f32 %f5281, %f3752, %f3754, %f1178;
and.b32 %r5392, %r8234, 2;
setp.eq.s32 %p867, %r5392, 0;
@%p867 bra $L__BB0_1017;
mov.f32 %f3756, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3756, %f3753;
$L__BB0_1017:
setp.lt.s32 %p17, %r11, %r1310;
@%p857 bra $L__BB0_1030;
mul.f32 %f3757, %f5606, 0f3F22F983;
cvt.rni.s32.f32 %r8238, %f3757;
cvt.rn.f32.s32 %f3758, %r8238;
mov.f32 %f3759, 0fBFC90FDA;
fma.rn.f32 %f3760, %f3758, %f3759, %f5606;
mov.f32 %f3761, 0fB3A22168;
fma.rn.f32 %f3762, %f3758, %f3761, %f3760;
mov.f32 %f3763, 0fA7C234C5;
fma.rn.f32 %f5661, %f3758, %f3763, %f3762;
abs.f32 %f1187, %f5606;
setp.ltu.f32 %p869, %f1187, 0f47CE4780;
@%p869 bra $L__BB0_1026;
setp.eq.f32 %p870, %f1187, 0f7F800000;
@%p870 bra $L__BB0_1025;
bra.uni $L__BB0_1020;
$L__BB0_1025:
mov.f32 %f3766, 0f00000000;
mul.rn.f32 %f5661, %f5606, %f3766;
mov.u32 %r8238, 0;
bra.uni $L__BB0_1026;
$L__BB0_1020:
mov.b32 %r1330, %f5606;
shr.u32 %r5394, %r1330, 23;
and.b32 %r5395, %r5394, 255;
add.s32 %r1331, %r5395, -128;
shl.b32 %r5396, %r1330, 8;
or.b32 %r1332, %r5396, -2147483648;
shr.u32 %r1333, %r1331, 5;
mov.u64 %rd2639, 0;
mov.u32 %r8235, 0;
mov.u64 %rd1679, __cudart_i2opi_f;
mov.u64 %rd2640, %rd2639;
$L__BB0_1021:
.pragma "nounroll";
shl.b64 %rd1678, %rd2639, 2;
add.s64 %rd1680, %rd1679, %rd1678;
ld.global.nc.u32 %r5397, [%rd1680];
mad.wide.u32 %rd1681, %r5397, %r1332, %rd2640;
shr.u64 %rd2640, %rd1681, 32;
add.s64 %rd1682, %rd1, %rd1678;
st.local.u32 [%rd1682], %rd1681;
add.s32 %r8235, %r8235, 1;
cvt.s64.s32 %rd2639, %r8235;
setp.ne.s32 %p871, %r8235, 6;
@%p871 bra $L__BB0_1021;
st.local.u32 [%rd4], %rd2640;
mov.u32 %r5398, 4;
sub.s32 %r1336, %r5398, %r1333;
mov.u32 %r5399, 6;
sub.s32 %r5400, %r5399, %r1333;
mul.wide.s32 %rd1683, %r5400, 4;
add.s64 %rd1684, %rd1, %rd1683;
ld.local.u32 %r8236, [%rd1684];
ld.local.u32 %r8237, [%rd1684+-4];
and.b32 %r1339, %r1331, 31;
setp.eq.s32 %p872, %r1339, 0;
@%p872 bra $L__BB0_1024;
mov.u32 %r5401, 32;
sub.s32 %r5402, %r5401, %r1339;
shr.u32 %r5403, %r8237, %r5402;
shl.b32 %r5404, %r8236, %r1339;
add.s32 %r8236, %r5403, %r5404;
mul.wide.s32 %rd1685, %r1336, 4;
add.s64 %rd1686, %rd1, %rd1685;
ld.local.u32 %r5405, [%rd1686];
shr.u32 %r5406, %r5405, %r5402;
shl.b32 %r5407, %r8237, %r1339;
add.s32 %r8237, %r5406, %r5407;
$L__BB0_1024:
and.b32 %r5408, %r1330, -2147483648;
shr.u32 %r5409, %r8237, 30;
shl.b32 %r5410, %r8236, 2;
or.b32 %r5411, %r5409, %r5410;
shr.u32 %r5412, %r5411, 31;
shr.u32 %r5413, %r8236, 30;
add.s32 %r5414, %r5412, %r5413;
neg.s32 %r5415, %r5414;
setp.eq.s32 %p873, %r5408, 0;
selp.b32 %r8238, %r5414, %r5415, %p873;
setp.ne.s32 %p874, %r5412, 0;
xor.b32 %r5416, %r5408, -2147483648;
selp.b32 %r5417, %r5416, %r5408, %p874;
selp.b32 %r5418, -1, 0, %p874;
xor.b32 %r5419, %r5411, %r5418;
shl.b32 %r5420, %r8237, 2;
xor.b32 %r5421, %r5420, %r5418;
cvt.u64.u32 %rd1687, %r5419;
cvt.u64.u32 %rd1688, %r5421;
bfi.b64 %rd1689, %rd1687, %rd1688, 32, 32;
cvt.rn.f64.s64 %fd135, %rd1689;
mul.f64 %fd136, %fd135, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3764, %fd136;
setp.eq.s32 %p875, %r5417, 0;
neg.f32 %f3765, %f3764;
selp.f32 %f5661, %f3764, %f3765, %p875;
$L__BB0_1026:
add.s32 %r1346, %r8238, 1;
and.b32 %r1347, %r1346, 1;
setp.eq.s32 %p876, %r1347, 0;
selp.f32 %f1191, %f5661, 0f3F800000, %p876;
mul.rn.f32 %f1192, %f5661, %f5661;
mov.f32 %f5662, 0fB94D4153;
@%p876 bra $L__BB0_1028;
mov.f32 %f3768, 0fBAB607ED;
mov.f32 %f3769, 0f37CBAC00;
fma.rn.f32 %f5662, %f3769, %f1192, %f3768;
$L__BB0_1028:
selp.f32 %f3770, 0f3C0885E4, 0f3D2AAABB, %p876;
fma.rn.f32 %f3771, %f5662, %f1192, %f3770;
selp.f32 %f3772, 0fBE2AAAA8, 0fBEFFFFFF, %p876;
fma.rn.f32 %f3773, %f3771, %f1192, %f3772;
mov.f32 %f3774, 0f00000000;
fma.rn.f32 %f3775, %f1192, %f1191, %f3774;
fma.rn.f32 %f5283, %f3773, %f3775, %f1191;
and.b32 %r5423, %r1346, 2;
setp.eq.s32 %p878, %r5423, 0;
@%p878 bra $L__BB0_1030;
mov.f32 %f3777, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3777, %f3774;
$L__BB0_1030:
selp.f32 %f1199, %f5283, %f5284, %p17;
selp.f32 %f1200, %f5281, %f5282, %p17;
@%p857 bra $L__BB0_1032;
add.f32 %f5797, %f1200, %f1199;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1032:
@%p810 bra $L__BB0_1061;
shl.b32 %r5424, %r12, 5;
neg.s32 %r1348, %r5424;
setp.ge.s32 %p882, %r11, %r1348;
@%p882 bra $L__BB0_1046;
mul.f32 %f3780, %f5414, 0f3F22F983;
cvt.rni.s32.f32 %r8242, %f3780;
cvt.rn.f32.s32 %f3781, %r8242;
mov.f32 %f3782, 0fBFC90FDA;
fma.rn.f32 %f3783, %f3781, %f3782, %f5414;
mov.f32 %f3784, 0fB3A22168;
fma.rn.f32 %f3785, %f3781, %f3784, %f3783;
mov.f32 %f3786, 0fA7C234C5;
fma.rn.f32 %f5670, %f3781, %f3786, %f3785;
abs.f32 %f1208, %f5414;
setp.ltu.f32 %p883, %f1208, 0f47CE4780;
@%p883 bra $L__BB0_1042;
setp.eq.f32 %p884, %f1208, 0f7F800000;
@%p884 bra $L__BB0_1041;
bra.uni $L__BB0_1036;
$L__BB0_1041:
mov.f32 %f3789, 0f00000000;
mul.rn.f32 %f5670, %f5414, %f3789;
mov.u32 %r8242, 0;
bra.uni $L__BB0_1042;
$L__BB0_1036:
mov.b32 %r1350, %f5414;
shr.u32 %r5426, %r1350, 23;
and.b32 %r5427, %r5426, 255;
add.s32 %r1351, %r5427, -128;
shl.b32 %r5428, %r1350, 8;
or.b32 %r1352, %r5428, -2147483648;
shr.u32 %r1353, %r1351, 5;
mov.u64 %rd2641, 0;
mov.u32 %r8239, 0;
mov.u64 %rd1693, __cudart_i2opi_f;
mov.u64 %rd2642, %rd2641;
$L__BB0_1037:
.pragma "nounroll";
shl.b64 %rd1692, %rd2641, 2;
add.s64 %rd1694, %rd1693, %rd1692;
ld.global.nc.u32 %r5429, [%rd1694];
mad.wide.u32 %rd1695, %r5429, %r1352, %rd2642;
shr.u64 %rd2642, %rd1695, 32;
add.s64 %rd1696, %rd1, %rd1692;
st.local.u32 [%rd1696], %rd1695;
add.s32 %r8239, %r8239, 1;
cvt.s64.s32 %rd2641, %r8239;
setp.ne.s32 %p885, %r8239, 6;
@%p885 bra $L__BB0_1037;
st.local.u32 [%rd4], %rd2642;
mov.u32 %r5430, 4;
sub.s32 %r1356, %r5430, %r1353;
mov.u32 %r5431, 6;
sub.s32 %r5432, %r5431, %r1353;
mul.wide.s32 %rd1697, %r5432, 4;
add.s64 %rd1698, %rd1, %rd1697;
ld.local.u32 %r8240, [%rd1698];
ld.local.u32 %r8241, [%rd1698+-4];
and.b32 %r1359, %r1351, 31;
setp.eq.s32 %p886, %r1359, 0;
@%p886 bra $L__BB0_1040;
mov.u32 %r5433, 32;
sub.s32 %r5434, %r5433, %r1359;
shr.u32 %r5435, %r8241, %r5434;
shl.b32 %r5436, %r8240, %r1359;
add.s32 %r8240, %r5435, %r5436;
mul.wide.s32 %rd1699, %r1356, 4;
add.s64 %rd1700, %rd1, %rd1699;
ld.local.u32 %r5437, [%rd1700];
shr.u32 %r5438, %r5437, %r5434;
shl.b32 %r5439, %r8241, %r1359;
add.s32 %r8241, %r5438, %r5439;
$L__BB0_1040:
and.b32 %r5440, %r1350, -2147483648;
shr.u32 %r5441, %r8241, 30;
shl.b32 %r5442, %r8240, 2;
or.b32 %r5443, %r5441, %r5442;
shr.u32 %r5444, %r5443, 31;
shr.u32 %r5445, %r8240, 30;
add.s32 %r5446, %r5444, %r5445;
neg.s32 %r5447, %r5446;
setp.eq.s32 %p887, %r5440, 0;
selp.b32 %r8242, %r5446, %r5447, %p887;
setp.ne.s32 %p888, %r5444, 0;
xor.b32 %r5448, %r5440, -2147483648;
selp.b32 %r5449, %r5448, %r5440, %p888;
selp.b32 %r5450, -1, 0, %p888;
xor.b32 %r5451, %r5443, %r5450;
shl.b32 %r5452, %r8241, 2;
xor.b32 %r5453, %r5452, %r5450;
cvt.u64.u32 %rd1701, %r5451;
cvt.u64.u32 %rd1702, %r5453;
bfi.b64 %rd1703, %rd1701, %rd1702, 32, 32;
cvt.rn.f64.s64 %fd137, %rd1703;
mul.f64 %fd138, %fd137, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3787, %fd138;
setp.eq.s32 %p889, %r5449, 0;
neg.f32 %f3788, %f3787;
selp.f32 %f5670, %f3787, %f3788, %p889;
$L__BB0_1042:
and.b32 %r1366, %r8242, 1;
setp.eq.s32 %p890, %r1366, 0;
selp.f32 %f1212, %f5670, 0f3F800000, %p890;
mul.rn.f32 %f1213, %f5670, %f5670;
mov.f32 %f5671, 0fB94D4153;
@%p890 bra $L__BB0_1044;
mov.f32 %f3791, 0fBAB607ED;
mov.f32 %f3792, 0f37CBAC00;
fma.rn.f32 %f5671, %f3792, %f1213, %f3791;
$L__BB0_1044:
selp.f32 %f3793, 0f3C0885E4, 0f3D2AAABB, %p890;
fma.rn.f32 %f3794, %f5671, %f1213, %f3793;
selp.f32 %f3795, 0fBE2AAAA8, 0fBEFFFFFF, %p890;
fma.rn.f32 %f3796, %f3794, %f1213, %f3795;
mov.f32 %f3797, 0f00000000;
fma.rn.f32 %f3798, %f1213, %f1212, %f3797;
fma.rn.f32 %f5281, %f3796, %f3798, %f1212;
and.b32 %r5455, %r8242, 2;
setp.eq.s32 %p892, %r5455, 0;
@%p892 bra $L__BB0_1046;
mov.f32 %f3800, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3800, %f3797;
$L__BB0_1046:
setp.lt.s32 %p18, %r11, %r1348;
@%p882 bra $L__BB0_1059;
mul.f32 %f3801, %f5406, 0f3F22F983;
cvt.rni.s32.f32 %r8246, %f3801;
cvt.rn.f32.s32 %f3802, %r8246;
mov.f32 %f3803, 0fBFC90FDA;
fma.rn.f32 %f3804, %f3802, %f3803, %f5406;
mov.f32 %f3805, 0fB3A22168;
fma.rn.f32 %f3806, %f3802, %f3805, %f3804;
mov.f32 %f3807, 0fA7C234C5;
fma.rn.f32 %f5674, %f3802, %f3807, %f3806;
abs.f32 %f1221, %f5406;
setp.ltu.f32 %p894, %f1221, 0f47CE4780;
@%p894 bra $L__BB0_1055;
setp.eq.f32 %p895, %f1221, 0f7F800000;
@%p895 bra $L__BB0_1054;
bra.uni $L__BB0_1049;
$L__BB0_1054:
mov.f32 %f3810, 0f00000000;
mul.rn.f32 %f5674, %f5406, %f3810;
mov.u32 %r8246, 0;
bra.uni $L__BB0_1055;
$L__BB0_1049:
mov.b32 %r1368, %f5406;
shr.u32 %r5457, %r1368, 23;
and.b32 %r5458, %r5457, 255;
add.s32 %r1369, %r5458, -128;
shl.b32 %r5459, %r1368, 8;
or.b32 %r1370, %r5459, -2147483648;
shr.u32 %r1371, %r1369, 5;
mov.u64 %rd2643, 0;
mov.u32 %r8243, 0;
mov.u64 %rd1707, __cudart_i2opi_f;
mov.u64 %rd2644, %rd2643;
$L__BB0_1050:
.pragma "nounroll";
shl.b64 %rd1706, %rd2643, 2;
add.s64 %rd1708, %rd1707, %rd1706;
ld.global.nc.u32 %r5460, [%rd1708];
mad.wide.u32 %rd1709, %r5460, %r1370, %rd2644;
shr.u64 %rd2644, %rd1709, 32;
add.s64 %rd1710, %rd1, %rd1706;
st.local.u32 [%rd1710], %rd1709;
add.s32 %r8243, %r8243, 1;
cvt.s64.s32 %rd2643, %r8243;
setp.ne.s32 %p896, %r8243, 6;
@%p896 bra $L__BB0_1050;
st.local.u32 [%rd4], %rd2644;
mov.u32 %r5461, 4;
sub.s32 %r1374, %r5461, %r1371;
mov.u32 %r5462, 6;
sub.s32 %r5463, %r5462, %r1371;
mul.wide.s32 %rd1711, %r5463, 4;
add.s64 %rd1712, %rd1, %rd1711;
ld.local.u32 %r8244, [%rd1712];
ld.local.u32 %r8245, [%rd1712+-4];
and.b32 %r1377, %r1369, 31;
setp.eq.s32 %p897, %r1377, 0;
@%p897 bra $L__BB0_1053;
mov.u32 %r5464, 32;
sub.s32 %r5465, %r5464, %r1377;
shr.u32 %r5466, %r8245, %r5465;
shl.b32 %r5467, %r8244, %r1377;
add.s32 %r8244, %r5466, %r5467;
mul.wide.s32 %rd1713, %r1374, 4;
add.s64 %rd1714, %rd1, %rd1713;
ld.local.u32 %r5468, [%rd1714];
shr.u32 %r5469, %r5468, %r5465;
shl.b32 %r5470, %r8245, %r1377;
add.s32 %r8245, %r5469, %r5470;
$L__BB0_1053:
and.b32 %r5471, %r1368, -2147483648;
shr.u32 %r5472, %r8245, 30;
shl.b32 %r5473, %r8244, 2;
or.b32 %r5474, %r5472, %r5473;
shr.u32 %r5475, %r5474, 31;
shr.u32 %r5476, %r8244, 30;
add.s32 %r5477, %r5475, %r5476;
neg.s32 %r5478, %r5477;
setp.eq.s32 %p898, %r5471, 0;
selp.b32 %r8246, %r5477, %r5478, %p898;
setp.ne.s32 %p899, %r5475, 0;
xor.b32 %r5479, %r5471, -2147483648;
selp.b32 %r5480, %r5479, %r5471, %p899;
selp.b32 %r5481, -1, 0, %p899;
xor.b32 %r5482, %r5474, %r5481;
shl.b32 %r5483, %r8245, 2;
xor.b32 %r5484, %r5483, %r5481;
cvt.u64.u32 %rd1715, %r5482;
cvt.u64.u32 %rd1716, %r5484;
bfi.b64 %rd1717, %rd1715, %rd1716, 32, 32;
cvt.rn.f64.s64 %fd139, %rd1717;
mul.f64 %fd140, %fd139, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3808, %fd140;
setp.eq.s32 %p900, %r5480, 0;
neg.f32 %f3809, %f3808;
selp.f32 %f5674, %f3808, %f3809, %p900;
$L__BB0_1055:
add.s32 %r1384, %r8246, 1;
and.b32 %r1385, %r1384, 1;
setp.eq.s32 %p901, %r1385, 0;
selp.f32 %f1225, %f5674, 0f3F800000, %p901;
mul.rn.f32 %f1226, %f5674, %f5674;
mov.f32 %f5675, 0fB94D4153;
@%p901 bra $L__BB0_1057;
mov.f32 %f3812, 0fBAB607ED;
mov.f32 %f3813, 0f37CBAC00;
fma.rn.f32 %f5675, %f3813, %f1226, %f3812;
$L__BB0_1057:
selp.f32 %f3814, 0f3C0885E4, 0f3D2AAABB, %p901;
fma.rn.f32 %f3815, %f5675, %f1226, %f3814;
selp.f32 %f3816, 0fBE2AAAA8, 0fBEFFFFFF, %p901;
fma.rn.f32 %f3817, %f3815, %f1226, %f3816;
mov.f32 %f3818, 0f00000000;
fma.rn.f32 %f3819, %f1226, %f1225, %f3818;
fma.rn.f32 %f5283, %f3817, %f3819, %f1225;
and.b32 %r5486, %r1384, 2;
setp.eq.s32 %p903, %r5486, 0;
@%p903 bra $L__BB0_1059;
mov.f32 %f3821, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3821, %f3818;
$L__BB0_1059:
selp.f32 %f1233, %f5283, %f5284, %p18;
selp.f32 %f1234, %f5281, %f5282, %p18;
@%p882 bra $L__BB0_1061;
add.f32 %f5796, %f1234, %f1233;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1061:
@%p810 bra $L__BB0_1090;
shl.b32 %r5487, %r12, 5;
mov.u32 %r5488, -32;
sub.s32 %r1386, %r5488, %r5487;
setp.ge.s32 %p907, %r11, %r1386;
@%p907 bra $L__BB0_1075;
mul.f32 %f3824, %f5413, 0f3F22F983;
cvt.rni.s32.f32 %r8250, %f3824;
cvt.rn.f32.s32 %f3825, %r8250;
mov.f32 %f3826, 0fBFC90FDA;
fma.rn.f32 %f3827, %f3825, %f3826, %f5413;
mov.f32 %f3828, 0fB3A22168;
fma.rn.f32 %f3829, %f3825, %f3828, %f3827;
mov.f32 %f3830, 0fA7C234C5;
fma.rn.f32 %f5683, %f3825, %f3830, %f3829;
abs.f32 %f1242, %f5413;
setp.ltu.f32 %p908, %f1242, 0f47CE4780;
@%p908 bra $L__BB0_1071;
setp.eq.f32 %p909, %f1242, 0f7F800000;
@%p909 bra $L__BB0_1070;
bra.uni $L__BB0_1065;
$L__BB0_1070:
mov.f32 %f3833, 0f00000000;
mul.rn.f32 %f5683, %f5413, %f3833;
mov.u32 %r8250, 0;
bra.uni $L__BB0_1071;
$L__BB0_1065:
mov.b32 %r1388, %f5413;
shr.u32 %r5490, %r1388, 23;
and.b32 %r5491, %r5490, 255;
add.s32 %r1389, %r5491, -128;
shl.b32 %r5492, %r1388, 8;
or.b32 %r1390, %r5492, -2147483648;
shr.u32 %r1391, %r1389, 5;
mov.u64 %rd2645, 0;
mov.u32 %r8247, 0;
mov.u64 %rd1721, __cudart_i2opi_f;
mov.u64 %rd2646, %rd2645;
$L__BB0_1066:
.pragma "nounroll";
shl.b64 %rd1720, %rd2645, 2;
add.s64 %rd1722, %rd1721, %rd1720;
ld.global.nc.u32 %r5493, [%rd1722];
mad.wide.u32 %rd1723, %r5493, %r1390, %rd2646;
shr.u64 %rd2646, %rd1723, 32;
add.s64 %rd1724, %rd1, %rd1720;
st.local.u32 [%rd1724], %rd1723;
add.s32 %r8247, %r8247, 1;
cvt.s64.s32 %rd2645, %r8247;
setp.ne.s32 %p910, %r8247, 6;
@%p910 bra $L__BB0_1066;
st.local.u32 [%rd4], %rd2646;
mov.u32 %r5494, 4;
sub.s32 %r1394, %r5494, %r1391;
mov.u32 %r5495, 6;
sub.s32 %r5496, %r5495, %r1391;
mul.wide.s32 %rd1725, %r5496, 4;
add.s64 %rd1726, %rd1, %rd1725;
ld.local.u32 %r8248, [%rd1726];
ld.local.u32 %r8249, [%rd1726+-4];
and.b32 %r1397, %r1389, 31;
setp.eq.s32 %p911, %r1397, 0;
@%p911 bra $L__BB0_1069;
mov.u32 %r5497, 32;
sub.s32 %r5498, %r5497, %r1397;
shr.u32 %r5499, %r8249, %r5498;
shl.b32 %r5500, %r8248, %r1397;
add.s32 %r8248, %r5499, %r5500;
mul.wide.s32 %rd1727, %r1394, 4;
add.s64 %rd1728, %rd1, %rd1727;
ld.local.u32 %r5501, [%rd1728];
shr.u32 %r5502, %r5501, %r5498;
shl.b32 %r5503, %r8249, %r1397;
add.s32 %r8249, %r5502, %r5503;
$L__BB0_1069:
and.b32 %r5504, %r1388, -2147483648;
shr.u32 %r5505, %r8249, 30;
shl.b32 %r5506, %r8248, 2;
or.b32 %r5507, %r5505, %r5506;
shr.u32 %r5508, %r5507, 31;
shr.u32 %r5509, %r8248, 30;
add.s32 %r5510, %r5508, %r5509;
neg.s32 %r5511, %r5510;
setp.eq.s32 %p912, %r5504, 0;
selp.b32 %r8250, %r5510, %r5511, %p912;
setp.ne.s32 %p913, %r5508, 0;
xor.b32 %r5512, %r5504, -2147483648;
selp.b32 %r5513, %r5512, %r5504, %p913;
selp.b32 %r5514, -1, 0, %p913;
xor.b32 %r5515, %r5507, %r5514;
shl.b32 %r5516, %r8249, 2;
xor.b32 %r5517, %r5516, %r5514;
cvt.u64.u32 %rd1729, %r5515;
cvt.u64.u32 %rd1730, %r5517;
bfi.b64 %rd1731, %rd1729, %rd1730, 32, 32;
cvt.rn.f64.s64 %fd141, %rd1731;
mul.f64 %fd142, %fd141, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3831, %fd142;
setp.eq.s32 %p914, %r5513, 0;
neg.f32 %f3832, %f3831;
selp.f32 %f5683, %f3831, %f3832, %p914;
$L__BB0_1071:
and.b32 %r1404, %r8250, 1;
setp.eq.s32 %p915, %r1404, 0;
selp.f32 %f1246, %f5683, 0f3F800000, %p915;
mul.rn.f32 %f1247, %f5683, %f5683;
mov.f32 %f5684, 0fB94D4153;
@%p915 bra $L__BB0_1073;
mov.f32 %f3835, 0fBAB607ED;
mov.f32 %f3836, 0f37CBAC00;
fma.rn.f32 %f5684, %f3836, %f1247, %f3835;
$L__BB0_1073:
selp.f32 %f3837, 0f3C0885E4, 0f3D2AAABB, %p915;
fma.rn.f32 %f3838, %f5684, %f1247, %f3837;
selp.f32 %f3839, 0fBE2AAAA8, 0fBEFFFFFF, %p915;
fma.rn.f32 %f3840, %f3838, %f1247, %f3839;
mov.f32 %f3841, 0f00000000;
fma.rn.f32 %f3842, %f1247, %f1246, %f3841;
fma.rn.f32 %f5281, %f3840, %f3842, %f1246;
and.b32 %r5519, %r8250, 2;
setp.eq.s32 %p917, %r5519, 0;
@%p917 bra $L__BB0_1075;
mov.f32 %f3844, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3844, %f3841;
$L__BB0_1075:
setp.lt.s32 %p19, %r11, %r1386;
@%p907 bra $L__BB0_1088;
mul.f32 %f3845, %f5405, 0f3F22F983;
cvt.rni.s32.f32 %r8254, %f3845;
cvt.rn.f32.s32 %f3846, %r8254;
mov.f32 %f3847, 0fBFC90FDA;
fma.rn.f32 %f3848, %f3846, %f3847, %f5405;
mov.f32 %f3849, 0fB3A22168;
fma.rn.f32 %f3850, %f3846, %f3849, %f3848;
mov.f32 %f3851, 0fA7C234C5;
fma.rn.f32 %f5687, %f3846, %f3851, %f3850;
abs.f32 %f1255, %f5405;
setp.ltu.f32 %p919, %f1255, 0f47CE4780;
@%p919 bra $L__BB0_1084;
setp.eq.f32 %p920, %f1255, 0f7F800000;
@%p920 bra $L__BB0_1083;
bra.uni $L__BB0_1078;
$L__BB0_1083:
mov.f32 %f3854, 0f00000000;
mul.rn.f32 %f5687, %f5405, %f3854;
mov.u32 %r8254, 0;
bra.uni $L__BB0_1084;
$L__BB0_1078:
mov.b32 %r1406, %f5405;
shr.u32 %r5521, %r1406, 23;
and.b32 %r5522, %r5521, 255;
add.s32 %r1407, %r5522, -128;
shl.b32 %r5523, %r1406, 8;
or.b32 %r1408, %r5523, -2147483648;
shr.u32 %r1409, %r1407, 5;
mov.u64 %rd2647, 0;
mov.u32 %r8251, 0;
mov.u64 %rd1735, __cudart_i2opi_f;
mov.u64 %rd2648, %rd2647;
$L__BB0_1079:
.pragma "nounroll";
shl.b64 %rd1734, %rd2647, 2;
add.s64 %rd1736, %rd1735, %rd1734;
ld.global.nc.u32 %r5524, [%rd1736];
mad.wide.u32 %rd1737, %r5524, %r1408, %rd2648;
shr.u64 %rd2648, %rd1737, 32;
add.s64 %rd1738, %rd1, %rd1734;
st.local.u32 [%rd1738], %rd1737;
add.s32 %r8251, %r8251, 1;
cvt.s64.s32 %rd2647, %r8251;
setp.ne.s32 %p921, %r8251, 6;
@%p921 bra $L__BB0_1079;
st.local.u32 [%rd4], %rd2648;
mov.u32 %r5525, 4;
sub.s32 %r1412, %r5525, %r1409;
mov.u32 %r5526, 6;
sub.s32 %r5527, %r5526, %r1409;
mul.wide.s32 %rd1739, %r5527, 4;
add.s64 %rd1740, %rd1, %rd1739;
ld.local.u32 %r8252, [%rd1740];
ld.local.u32 %r8253, [%rd1740+-4];
and.b32 %r1415, %r1407, 31;
setp.eq.s32 %p922, %r1415, 0;
@%p922 bra $L__BB0_1082;
mov.u32 %r5528, 32;
sub.s32 %r5529, %r5528, %r1415;
shr.u32 %r5530, %r8253, %r5529;
shl.b32 %r5531, %r8252, %r1415;
add.s32 %r8252, %r5530, %r5531;
mul.wide.s32 %rd1741, %r1412, 4;
add.s64 %rd1742, %rd1, %rd1741;
ld.local.u32 %r5532, [%rd1742];
shr.u32 %r5533, %r5532, %r5529;
shl.b32 %r5534, %r8253, %r1415;
add.s32 %r8253, %r5533, %r5534;
$L__BB0_1082:
and.b32 %r5535, %r1406, -2147483648;
shr.u32 %r5536, %r8253, 30;
shl.b32 %r5537, %r8252, 2;
or.b32 %r5538, %r5536, %r5537;
shr.u32 %r5539, %r5538, 31;
shr.u32 %r5540, %r8252, 30;
add.s32 %r5541, %r5539, %r5540;
neg.s32 %r5542, %r5541;
setp.eq.s32 %p923, %r5535, 0;
selp.b32 %r8254, %r5541, %r5542, %p923;
setp.ne.s32 %p924, %r5539, 0;
xor.b32 %r5543, %r5535, -2147483648;
selp.b32 %r5544, %r5543, %r5535, %p924;
selp.b32 %r5545, -1, 0, %p924;
xor.b32 %r5546, %r5538, %r5545;
shl.b32 %r5547, %r8253, 2;
xor.b32 %r5548, %r5547, %r5545;
cvt.u64.u32 %rd1743, %r5546;
cvt.u64.u32 %rd1744, %r5548;
bfi.b64 %rd1745, %rd1743, %rd1744, 32, 32;
cvt.rn.f64.s64 %fd143, %rd1745;
mul.f64 %fd144, %fd143, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3852, %fd144;
setp.eq.s32 %p925, %r5544, 0;
neg.f32 %f3853, %f3852;
selp.f32 %f5687, %f3852, %f3853, %p925;
$L__BB0_1084:
add.s32 %r1422, %r8254, 1;
and.b32 %r1423, %r1422, 1;
setp.eq.s32 %p926, %r1423, 0;
selp.f32 %f1259, %f5687, 0f3F800000, %p926;
mul.rn.f32 %f1260, %f5687, %f5687;
mov.f32 %f5688, 0fB94D4153;
@%p926 bra $L__BB0_1086;
mov.f32 %f3856, 0fBAB607ED;
mov.f32 %f3857, 0f37CBAC00;
fma.rn.f32 %f5688, %f3857, %f1260, %f3856;
$L__BB0_1086:
selp.f32 %f3858, 0f3C0885E4, 0f3D2AAABB, %p926;
fma.rn.f32 %f3859, %f5688, %f1260, %f3858;
selp.f32 %f3860, 0fBE2AAAA8, 0fBEFFFFFF, %p926;
fma.rn.f32 %f3861, %f3859, %f1260, %f3860;
mov.f32 %f3862, 0f00000000;
fma.rn.f32 %f3863, %f1260, %f1259, %f3862;
fma.rn.f32 %f5283, %f3861, %f3863, %f1259;
and.b32 %r5550, %r1422, 2;
setp.eq.s32 %p928, %r5550, 0;
@%p928 bra $L__BB0_1088;
mov.f32 %f3865, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3865, %f3862;
$L__BB0_1088:
selp.f32 %f1267, %f5283, %f5284, %p19;
selp.f32 %f1268, %f5281, %f5282, %p19;
@%p907 bra $L__BB0_1090;
add.f32 %f5795, %f1268, %f1267;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1090:
@%p813 bra $L__BB0_1119;
shl.b32 %r5551, %r12, 5;
neg.s32 %r1424, %r5551;
setp.ge.s32 %p932, %r11, %r1424;
@%p932 bra $L__BB0_1104;
mul.f32 %f3868, %f5412, 0f3F22F983;
cvt.rni.s32.f32 %r8258, %f3868;
cvt.rn.f32.s32 %f3869, %r8258;
mov.f32 %f3870, 0fBFC90FDA;
fma.rn.f32 %f3871, %f3869, %f3870, %f5412;
mov.f32 %f3872, 0fB3A22168;
fma.rn.f32 %f3873, %f3869, %f3872, %f3871;
mov.f32 %f3874, 0fA7C234C5;
fma.rn.f32 %f5696, %f3869, %f3874, %f3873;
abs.f32 %f1276, %f5412;
setp.ltu.f32 %p933, %f1276, 0f47CE4780;
@%p933 bra $L__BB0_1100;
setp.eq.f32 %p934, %f1276, 0f7F800000;
@%p934 bra $L__BB0_1099;
bra.uni $L__BB0_1094;
$L__BB0_1099:
mov.f32 %f3877, 0f00000000;
mul.rn.f32 %f5696, %f5412, %f3877;
mov.u32 %r8258, 0;
bra.uni $L__BB0_1100;
$L__BB0_1094:
mov.b32 %r1426, %f5412;
shr.u32 %r5553, %r1426, 23;
and.b32 %r5554, %r5553, 255;
add.s32 %r1427, %r5554, -128;
shl.b32 %r5555, %r1426, 8;
or.b32 %r1428, %r5555, -2147483648;
shr.u32 %r1429, %r1427, 5;
mov.u64 %rd2649, 0;
mov.u32 %r8255, 0;
mov.u64 %rd1749, __cudart_i2opi_f;
mov.u64 %rd2650, %rd2649;
$L__BB0_1095:
.pragma "nounroll";
shl.b64 %rd1748, %rd2649, 2;
add.s64 %rd1750, %rd1749, %rd1748;
ld.global.nc.u32 %r5556, [%rd1750];
mad.wide.u32 %rd1751, %r5556, %r1428, %rd2650;
shr.u64 %rd2650, %rd1751, 32;
add.s64 %rd1752, %rd1, %rd1748;
st.local.u32 [%rd1752], %rd1751;
add.s32 %r8255, %r8255, 1;
cvt.s64.s32 %rd2649, %r8255;
setp.ne.s32 %p935, %r8255, 6;
@%p935 bra $L__BB0_1095;
st.local.u32 [%rd4], %rd2650;
mov.u32 %r5557, 4;
sub.s32 %r1432, %r5557, %r1429;
mov.u32 %r5558, 6;
sub.s32 %r5559, %r5558, %r1429;
mul.wide.s32 %rd1753, %r5559, 4;
add.s64 %rd1754, %rd1, %rd1753;
ld.local.u32 %r8256, [%rd1754];
ld.local.u32 %r8257, [%rd1754+-4];
and.b32 %r1435, %r1427, 31;
setp.eq.s32 %p936, %r1435, 0;
@%p936 bra $L__BB0_1098;
mov.u32 %r5560, 32;
sub.s32 %r5561, %r5560, %r1435;
shr.u32 %r5562, %r8257, %r5561;
shl.b32 %r5563, %r8256, %r1435;
add.s32 %r8256, %r5562, %r5563;
mul.wide.s32 %rd1755, %r1432, 4;
add.s64 %rd1756, %rd1, %rd1755;
ld.local.u32 %r5564, [%rd1756];
shr.u32 %r5565, %r5564, %r5561;
shl.b32 %r5566, %r8257, %r1435;
add.s32 %r8257, %r5565, %r5566;
$L__BB0_1098:
and.b32 %r5567, %r1426, -2147483648;
shr.u32 %r5568, %r8257, 30;
shl.b32 %r5569, %r8256, 2;
or.b32 %r5570, %r5568, %r5569;
shr.u32 %r5571, %r5570, 31;
shr.u32 %r5572, %r8256, 30;
add.s32 %r5573, %r5571, %r5572;
neg.s32 %r5574, %r5573;
setp.eq.s32 %p937, %r5567, 0;
selp.b32 %r8258, %r5573, %r5574, %p937;
setp.ne.s32 %p938, %r5571, 0;
xor.b32 %r5575, %r5567, -2147483648;
selp.b32 %r5576, %r5575, %r5567, %p938;
selp.b32 %r5577, -1, 0, %p938;
xor.b32 %r5578, %r5570, %r5577;
shl.b32 %r5579, %r8257, 2;
xor.b32 %r5580, %r5579, %r5577;
cvt.u64.u32 %rd1757, %r5578;
cvt.u64.u32 %rd1758, %r5580;
bfi.b64 %rd1759, %rd1757, %rd1758, 32, 32;
cvt.rn.f64.s64 %fd145, %rd1759;
mul.f64 %fd146, %fd145, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3875, %fd146;
setp.eq.s32 %p939, %r5576, 0;
neg.f32 %f3876, %f3875;
selp.f32 %f5696, %f3875, %f3876, %p939;
$L__BB0_1100:
and.b32 %r1442, %r8258, 1;
setp.eq.s32 %p940, %r1442, 0;
selp.f32 %f1280, %f5696, 0f3F800000, %p940;
mul.rn.f32 %f1281, %f5696, %f5696;
mov.f32 %f5697, 0fB94D4153;
@%p940 bra $L__BB0_1102;
mov.f32 %f3879, 0fBAB607ED;
mov.f32 %f3880, 0f37CBAC00;
fma.rn.f32 %f5697, %f3880, %f1281, %f3879;
$L__BB0_1102:
selp.f32 %f3881, 0f3C0885E4, 0f3D2AAABB, %p940;
fma.rn.f32 %f3882, %f5697, %f1281, %f3881;
selp.f32 %f3883, 0fBE2AAAA8, 0fBEFFFFFF, %p940;
fma.rn.f32 %f3884, %f3882, %f1281, %f3883;
mov.f32 %f3885, 0f00000000;
fma.rn.f32 %f3886, %f1281, %f1280, %f3885;
fma.rn.f32 %f5281, %f3884, %f3886, %f1280;
and.b32 %r5582, %r8258, 2;
setp.eq.s32 %p942, %r5582, 0;
@%p942 bra $L__BB0_1104;
mov.f32 %f3888, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3888, %f3885;
$L__BB0_1104:
setp.lt.s32 %p20, %r11, %r1424;
@%p932 bra $L__BB0_1117;
mul.f32 %f3889, %f5404, 0f3F22F983;
cvt.rni.s32.f32 %r8262, %f3889;
cvt.rn.f32.s32 %f3890, %r8262;
mov.f32 %f3891, 0fBFC90FDA;
fma.rn.f32 %f3892, %f3890, %f3891, %f5404;
mov.f32 %f3893, 0fB3A22168;
fma.rn.f32 %f3894, %f3890, %f3893, %f3892;
mov.f32 %f3895, 0fA7C234C5;
fma.rn.f32 %f5700, %f3890, %f3895, %f3894;
abs.f32 %f1289, %f5404;
setp.ltu.f32 %p944, %f1289, 0f47CE4780;
@%p944 bra $L__BB0_1113;
setp.eq.f32 %p945, %f1289, 0f7F800000;
@%p945 bra $L__BB0_1112;
bra.uni $L__BB0_1107;
$L__BB0_1112:
mov.f32 %f3898, 0f00000000;
mul.rn.f32 %f5700, %f5404, %f3898;
mov.u32 %r8262, 0;
bra.uni $L__BB0_1113;
$L__BB0_1107:
mov.b32 %r1444, %f5404;
shr.u32 %r5584, %r1444, 23;
and.b32 %r5585, %r5584, 255;
add.s32 %r1445, %r5585, -128;
shl.b32 %r5586, %r1444, 8;
or.b32 %r1446, %r5586, -2147483648;
shr.u32 %r1447, %r1445, 5;
mov.u64 %rd2651, 0;
mov.u32 %r8259, 0;
mov.u64 %rd1763, __cudart_i2opi_f;
mov.u64 %rd2652, %rd2651;
$L__BB0_1108:
.pragma "nounroll";
shl.b64 %rd1762, %rd2651, 2;
add.s64 %rd1764, %rd1763, %rd1762;
ld.global.nc.u32 %r5587, [%rd1764];
mad.wide.u32 %rd1765, %r5587, %r1446, %rd2652;
shr.u64 %rd2652, %rd1765, 32;
add.s64 %rd1766, %rd1, %rd1762;
st.local.u32 [%rd1766], %rd1765;
add.s32 %r8259, %r8259, 1;
cvt.s64.s32 %rd2651, %r8259;
setp.ne.s32 %p946, %r8259, 6;
@%p946 bra $L__BB0_1108;
st.local.u32 [%rd4], %rd2652;
mov.u32 %r5588, 4;
sub.s32 %r1450, %r5588, %r1447;
mov.u32 %r5589, 6;
sub.s32 %r5590, %r5589, %r1447;
mul.wide.s32 %rd1767, %r5590, 4;
add.s64 %rd1768, %rd1, %rd1767;
ld.local.u32 %r8260, [%rd1768];
ld.local.u32 %r8261, [%rd1768+-4];
and.b32 %r1453, %r1445, 31;
setp.eq.s32 %p947, %r1453, 0;
@%p947 bra $L__BB0_1111;
mov.u32 %r5591, 32;
sub.s32 %r5592, %r5591, %r1453;
shr.u32 %r5593, %r8261, %r5592;
shl.b32 %r5594, %r8260, %r1453;
add.s32 %r8260, %r5593, %r5594;
mul.wide.s32 %rd1769, %r1450, 4;
add.s64 %rd1770, %rd1, %rd1769;
ld.local.u32 %r5595, [%rd1770];
shr.u32 %r5596, %r5595, %r5592;
shl.b32 %r5597, %r8261, %r1453;
add.s32 %r8261, %r5596, %r5597;
$L__BB0_1111:
and.b32 %r5598, %r1444, -2147483648;
shr.u32 %r5599, %r8261, 30;
shl.b32 %r5600, %r8260, 2;
or.b32 %r5601, %r5599, %r5600;
shr.u32 %r5602, %r5601, 31;
shr.u32 %r5603, %r8260, 30;
add.s32 %r5604, %r5602, %r5603;
neg.s32 %r5605, %r5604;
setp.eq.s32 %p948, %r5598, 0;
selp.b32 %r8262, %r5604, %r5605, %p948;
setp.ne.s32 %p949, %r5602, 0;
xor.b32 %r5606, %r5598, -2147483648;
selp.b32 %r5607, %r5606, %r5598, %p949;
selp.b32 %r5608, -1, 0, %p949;
xor.b32 %r5609, %r5601, %r5608;
shl.b32 %r5610, %r8261, 2;
xor.b32 %r5611, %r5610, %r5608;
cvt.u64.u32 %rd1771, %r5609;
cvt.u64.u32 %rd1772, %r5611;
bfi.b64 %rd1773, %rd1771, %rd1772, 32, 32;
cvt.rn.f64.s64 %fd147, %rd1773;
mul.f64 %fd148, %fd147, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3896, %fd148;
setp.eq.s32 %p950, %r5607, 0;
neg.f32 %f3897, %f3896;
selp.f32 %f5700, %f3896, %f3897, %p950;
$L__BB0_1113:
add.s32 %r1460, %r8262, 1;
and.b32 %r1461, %r1460, 1;
setp.eq.s32 %p951, %r1461, 0;
selp.f32 %f1293, %f5700, 0f3F800000, %p951;
mul.rn.f32 %f1294, %f5700, %f5700;
mov.f32 %f5701, 0fB94D4153;
@%p951 bra $L__BB0_1115;
mov.f32 %f3900, 0fBAB607ED;
mov.f32 %f3901, 0f37CBAC00;
fma.rn.f32 %f5701, %f3901, %f1294, %f3900;
$L__BB0_1115:
selp.f32 %f3902, 0f3C0885E4, 0f3D2AAABB, %p951;
fma.rn.f32 %f3903, %f5701, %f1294, %f3902;
selp.f32 %f3904, 0fBE2AAAA8, 0fBEFFFFFF, %p951;
fma.rn.f32 %f3905, %f3903, %f1294, %f3904;
mov.f32 %f3906, 0f00000000;
fma.rn.f32 %f3907, %f1294, %f1293, %f3906;
fma.rn.f32 %f5283, %f3905, %f3907, %f1293;
and.b32 %r5613, %r1460, 2;
setp.eq.s32 %p953, %r5613, 0;
@%p953 bra $L__BB0_1117;
mov.f32 %f3909, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3909, %f3906;
$L__BB0_1117:
selp.f32 %f1301, %f5283, %f5284, %p20;
selp.f32 %f1302, %f5281, %f5282, %p20;
@%p932 bra $L__BB0_1119;
add.f32 %f5794, %f1302, %f1301;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1119:
@%p813 bra $L__BB0_1148;
shl.b32 %r5614, %r12, 5;
mov.u32 %r5615, -32;
sub.s32 %r1462, %r5615, %r5614;
setp.ge.s32 %p957, %r11, %r1462;
@%p957 bra $L__BB0_1133;
mul.f32 %f3912, %f5411, 0f3F22F983;
cvt.rni.s32.f32 %r8266, %f3912;
cvt.rn.f32.s32 %f3913, %r8266;
mov.f32 %f3914, 0fBFC90FDA;
fma.rn.f32 %f3915, %f3913, %f3914, %f5411;
mov.f32 %f3916, 0fB3A22168;
fma.rn.f32 %f3917, %f3913, %f3916, %f3915;
mov.f32 %f3918, 0fA7C234C5;
fma.rn.f32 %f5709, %f3913, %f3918, %f3917;
abs.f32 %f1310, %f5411;
setp.ltu.f32 %p958, %f1310, 0f47CE4780;
@%p958 bra $L__BB0_1129;
setp.eq.f32 %p959, %f1310, 0f7F800000;
@%p959 bra $L__BB0_1128;
bra.uni $L__BB0_1123;
$L__BB0_1128:
mov.f32 %f3921, 0f00000000;
mul.rn.f32 %f5709, %f5411, %f3921;
mov.u32 %r8266, 0;
bra.uni $L__BB0_1129;
$L__BB0_1123:
mov.b32 %r1464, %f5411;
shr.u32 %r5617, %r1464, 23;
and.b32 %r5618, %r5617, 255;
add.s32 %r1465, %r5618, -128;
shl.b32 %r5619, %r1464, 8;
or.b32 %r1466, %r5619, -2147483648;
shr.u32 %r1467, %r1465, 5;
mov.u64 %rd2653, 0;
mov.u32 %r8263, 0;
mov.u64 %rd1777, __cudart_i2opi_f;
mov.u64 %rd2654, %rd2653;
$L__BB0_1124:
.pragma "nounroll";
shl.b64 %rd1776, %rd2653, 2;
add.s64 %rd1778, %rd1777, %rd1776;
ld.global.nc.u32 %r5620, [%rd1778];
mad.wide.u32 %rd1779, %r5620, %r1466, %rd2654;
shr.u64 %rd2654, %rd1779, 32;
add.s64 %rd1780, %rd1, %rd1776;
st.local.u32 [%rd1780], %rd1779;
add.s32 %r8263, %r8263, 1;
cvt.s64.s32 %rd2653, %r8263;
setp.ne.s32 %p960, %r8263, 6;
@%p960 bra $L__BB0_1124;
st.local.u32 [%rd4], %rd2654;
mov.u32 %r5621, 4;
sub.s32 %r1470, %r5621, %r1467;
mov.u32 %r5622, 6;
sub.s32 %r5623, %r5622, %r1467;
mul.wide.s32 %rd1781, %r5623, 4;
add.s64 %rd1782, %rd1, %rd1781;
ld.local.u32 %r8264, [%rd1782];
ld.local.u32 %r8265, [%rd1782+-4];
and.b32 %r1473, %r1465, 31;
setp.eq.s32 %p961, %r1473, 0;
@%p961 bra $L__BB0_1127;
mov.u32 %r5624, 32;
sub.s32 %r5625, %r5624, %r1473;
shr.u32 %r5626, %r8265, %r5625;
shl.b32 %r5627, %r8264, %r1473;
add.s32 %r8264, %r5626, %r5627;
mul.wide.s32 %rd1783, %r1470, 4;
add.s64 %rd1784, %rd1, %rd1783;
ld.local.u32 %r5628, [%rd1784];
shr.u32 %r5629, %r5628, %r5625;
shl.b32 %r5630, %r8265, %r1473;
add.s32 %r8265, %r5629, %r5630;
$L__BB0_1127:
and.b32 %r5631, %r1464, -2147483648;
shr.u32 %r5632, %r8265, 30;
shl.b32 %r5633, %r8264, 2;
or.b32 %r5634, %r5632, %r5633;
shr.u32 %r5635, %r5634, 31;
shr.u32 %r5636, %r8264, 30;
add.s32 %r5637, %r5635, %r5636;
neg.s32 %r5638, %r5637;
setp.eq.s32 %p962, %r5631, 0;
selp.b32 %r8266, %r5637, %r5638, %p962;
setp.ne.s32 %p963, %r5635, 0;
xor.b32 %r5639, %r5631, -2147483648;
selp.b32 %r5640, %r5639, %r5631, %p963;
selp.b32 %r5641, -1, 0, %p963;
xor.b32 %r5642, %r5634, %r5641;
shl.b32 %r5643, %r8265, 2;
xor.b32 %r5644, %r5643, %r5641;
cvt.u64.u32 %rd1785, %r5642;
cvt.u64.u32 %rd1786, %r5644;
bfi.b64 %rd1787, %rd1785, %rd1786, 32, 32;
cvt.rn.f64.s64 %fd149, %rd1787;
mul.f64 %fd150, %fd149, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3919, %fd150;
setp.eq.s32 %p964, %r5640, 0;
neg.f32 %f3920, %f3919;
selp.f32 %f5709, %f3919, %f3920, %p964;
$L__BB0_1129:
and.b32 %r1480, %r8266, 1;
setp.eq.s32 %p965, %r1480, 0;
selp.f32 %f1314, %f5709, 0f3F800000, %p965;
mul.rn.f32 %f1315, %f5709, %f5709;
mov.f32 %f5710, 0fB94D4153;
@%p965 bra $L__BB0_1131;
mov.f32 %f3923, 0fBAB607ED;
mov.f32 %f3924, 0f37CBAC00;
fma.rn.f32 %f5710, %f3924, %f1315, %f3923;
$L__BB0_1131:
selp.f32 %f3925, 0f3C0885E4, 0f3D2AAABB, %p965;
fma.rn.f32 %f3926, %f5710, %f1315, %f3925;
selp.f32 %f3927, 0fBE2AAAA8, 0fBEFFFFFF, %p965;
fma.rn.f32 %f3928, %f3926, %f1315, %f3927;
mov.f32 %f3929, 0f00000000;
fma.rn.f32 %f3930, %f1315, %f1314, %f3929;
fma.rn.f32 %f5281, %f3928, %f3930, %f1314;
and.b32 %r5646, %r8266, 2;
setp.eq.s32 %p967, %r5646, 0;
@%p967 bra $L__BB0_1133;
mov.f32 %f3932, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3932, %f3929;
$L__BB0_1133:
setp.lt.s32 %p21, %r11, %r1462;
@%p957 bra $L__BB0_1146;
mul.f32 %f3933, %f5403, 0f3F22F983;
cvt.rni.s32.f32 %r8270, %f3933;
cvt.rn.f32.s32 %f3934, %r8270;
mov.f32 %f3935, 0fBFC90FDA;
fma.rn.f32 %f3936, %f3934, %f3935, %f5403;
mov.f32 %f3937, 0fB3A22168;
fma.rn.f32 %f3938, %f3934, %f3937, %f3936;
mov.f32 %f3939, 0fA7C234C5;
fma.rn.f32 %f5713, %f3934, %f3939, %f3938;
abs.f32 %f1323, %f5403;
setp.ltu.f32 %p969, %f1323, 0f47CE4780;
@%p969 bra $L__BB0_1142;
setp.eq.f32 %p970, %f1323, 0f7F800000;
@%p970 bra $L__BB0_1141;
bra.uni $L__BB0_1136;
$L__BB0_1141:
mov.f32 %f3942, 0f00000000;
mul.rn.f32 %f5713, %f5403, %f3942;
mov.u32 %r8270, 0;
bra.uni $L__BB0_1142;
$L__BB0_1136:
mov.b32 %r1482, %f5403;
shr.u32 %r5648, %r1482, 23;
and.b32 %r5649, %r5648, 255;
add.s32 %r1483, %r5649, -128;
shl.b32 %r5650, %r1482, 8;
or.b32 %r1484, %r5650, -2147483648;
shr.u32 %r1485, %r1483, 5;
mov.u64 %rd2655, 0;
mov.u32 %r8267, 0;
mov.u64 %rd1791, __cudart_i2opi_f;
mov.u64 %rd2656, %rd2655;
$L__BB0_1137:
.pragma "nounroll";
shl.b64 %rd1790, %rd2655, 2;
add.s64 %rd1792, %rd1791, %rd1790;
ld.global.nc.u32 %r5651, [%rd1792];
mad.wide.u32 %rd1793, %r5651, %r1484, %rd2656;
shr.u64 %rd2656, %rd1793, 32;
add.s64 %rd1794, %rd1, %rd1790;
st.local.u32 [%rd1794], %rd1793;
add.s32 %r8267, %r8267, 1;
cvt.s64.s32 %rd2655, %r8267;
setp.ne.s32 %p971, %r8267, 6;
@%p971 bra $L__BB0_1137;
st.local.u32 [%rd4], %rd2656;
mov.u32 %r5652, 4;
sub.s32 %r1488, %r5652, %r1485;
mov.u32 %r5653, 6;
sub.s32 %r5654, %r5653, %r1485;
mul.wide.s32 %rd1795, %r5654, 4;
add.s64 %rd1796, %rd1, %rd1795;
ld.local.u32 %r8268, [%rd1796];
ld.local.u32 %r8269, [%rd1796+-4];
and.b32 %r1491, %r1483, 31;
setp.eq.s32 %p972, %r1491, 0;
@%p972 bra $L__BB0_1140;
mov.u32 %r5655, 32;
sub.s32 %r5656, %r5655, %r1491;
shr.u32 %r5657, %r8269, %r5656;
shl.b32 %r5658, %r8268, %r1491;
add.s32 %r8268, %r5657, %r5658;
mul.wide.s32 %rd1797, %r1488, 4;
add.s64 %rd1798, %rd1, %rd1797;
ld.local.u32 %r5659, [%rd1798];
shr.u32 %r5660, %r5659, %r5656;
shl.b32 %r5661, %r8269, %r1491;
add.s32 %r8269, %r5660, %r5661;
$L__BB0_1140:
and.b32 %r5662, %r1482, -2147483648;
shr.u32 %r5663, %r8269, 30;
shl.b32 %r5664, %r8268, 2;
or.b32 %r5665, %r5663, %r5664;
shr.u32 %r5666, %r5665, 31;
shr.u32 %r5667, %r8268, 30;
add.s32 %r5668, %r5666, %r5667;
neg.s32 %r5669, %r5668;
setp.eq.s32 %p973, %r5662, 0;
selp.b32 %r8270, %r5668, %r5669, %p973;
setp.ne.s32 %p974, %r5666, 0;
xor.b32 %r5670, %r5662, -2147483648;
selp.b32 %r5671, %r5670, %r5662, %p974;
selp.b32 %r5672, -1, 0, %p974;
xor.b32 %r5673, %r5665, %r5672;
shl.b32 %r5674, %r8269, 2;
xor.b32 %r5675, %r5674, %r5672;
cvt.u64.u32 %rd1799, %r5673;
cvt.u64.u32 %rd1800, %r5675;
bfi.b64 %rd1801, %rd1799, %rd1800, 32, 32;
cvt.rn.f64.s64 %fd151, %rd1801;
mul.f64 %fd152, %fd151, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3940, %fd152;
setp.eq.s32 %p975, %r5671, 0;
neg.f32 %f3941, %f3940;
selp.f32 %f5713, %f3940, %f3941, %p975;
$L__BB0_1142:
add.s32 %r1498, %r8270, 1;
and.b32 %r1499, %r1498, 1;
setp.eq.s32 %p976, %r1499, 0;
selp.f32 %f1327, %f5713, 0f3F800000, %p976;
mul.rn.f32 %f1328, %f5713, %f5713;
mov.f32 %f5714, 0fB94D4153;
@%p976 bra $L__BB0_1144;
mov.f32 %f3944, 0fBAB607ED;
mov.f32 %f3945, 0f37CBAC00;
fma.rn.f32 %f5714, %f3945, %f1328, %f3944;
$L__BB0_1144:
selp.f32 %f3946, 0f3C0885E4, 0f3D2AAABB, %p976;
fma.rn.f32 %f3947, %f5714, %f1328, %f3946;
selp.f32 %f3948, 0fBE2AAAA8, 0fBEFFFFFF, %p976;
fma.rn.f32 %f3949, %f3947, %f1328, %f3948;
mov.f32 %f3950, 0f00000000;
fma.rn.f32 %f3951, %f1328, %f1327, %f3950;
fma.rn.f32 %f5283, %f3949, %f3951, %f1327;
and.b32 %r5677, %r1498, 2;
setp.eq.s32 %p978, %r5677, 0;
@%p978 bra $L__BB0_1146;
mov.f32 %f3953, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3953, %f3950;
$L__BB0_1146:
selp.f32 %f1335, %f5283, %f5284, %p21;
selp.f32 %f1336, %f5281, %f5282, %p21;
@%p957 bra $L__BB0_1148;
add.f32 %f5793, %f1336, %f1335;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1148:
@%p816 bra $L__BB0_1177;
shl.b32 %r5678, %r12, 5;
neg.s32 %r1500, %r5678;
setp.ge.s32 %p982, %r11, %r1500;
@%p982 bra $L__BB0_1162;
mul.f32 %f3956, %f5410, 0f3F22F983;
cvt.rni.s32.f32 %r8274, %f3956;
cvt.rn.f32.s32 %f3957, %r8274;
mov.f32 %f3958, 0fBFC90FDA;
fma.rn.f32 %f3959, %f3957, %f3958, %f5410;
mov.f32 %f3960, 0fB3A22168;
fma.rn.f32 %f3961, %f3957, %f3960, %f3959;
mov.f32 %f3962, 0fA7C234C5;
fma.rn.f32 %f5722, %f3957, %f3962, %f3961;
abs.f32 %f1344, %f5410;
setp.ltu.f32 %p983, %f1344, 0f47CE4780;
@%p983 bra $L__BB0_1158;
setp.eq.f32 %p984, %f1344, 0f7F800000;
@%p984 bra $L__BB0_1157;
bra.uni $L__BB0_1152;
$L__BB0_1157:
mov.f32 %f3965, 0f00000000;
mul.rn.f32 %f5722, %f5410, %f3965;
mov.u32 %r8274, 0;
bra.uni $L__BB0_1158;
$L__BB0_1152:
mov.b32 %r1502, %f5410;
shr.u32 %r5680, %r1502, 23;
and.b32 %r5681, %r5680, 255;
add.s32 %r1503, %r5681, -128;
shl.b32 %r5682, %r1502, 8;
or.b32 %r1504, %r5682, -2147483648;
shr.u32 %r1505, %r1503, 5;
mov.u64 %rd2657, 0;
mov.u32 %r8271, 0;
mov.u64 %rd1805, __cudart_i2opi_f;
mov.u64 %rd2658, %rd2657;
$L__BB0_1153:
.pragma "nounroll";
shl.b64 %rd1804, %rd2657, 2;
add.s64 %rd1806, %rd1805, %rd1804;
ld.global.nc.u32 %r5683, [%rd1806];
mad.wide.u32 %rd1807, %r5683, %r1504, %rd2658;
shr.u64 %rd2658, %rd1807, 32;
add.s64 %rd1808, %rd1, %rd1804;
st.local.u32 [%rd1808], %rd1807;
add.s32 %r8271, %r8271, 1;
cvt.s64.s32 %rd2657, %r8271;
setp.ne.s32 %p985, %r8271, 6;
@%p985 bra $L__BB0_1153;
st.local.u32 [%rd4], %rd2658;
mov.u32 %r5684, 4;
sub.s32 %r1508, %r5684, %r1505;
mov.u32 %r5685, 6;
sub.s32 %r5686, %r5685, %r1505;
mul.wide.s32 %rd1809, %r5686, 4;
add.s64 %rd1810, %rd1, %rd1809;
ld.local.u32 %r8272, [%rd1810];
ld.local.u32 %r8273, [%rd1810+-4];
and.b32 %r1511, %r1503, 31;
setp.eq.s32 %p986, %r1511, 0;
@%p986 bra $L__BB0_1156;
mov.u32 %r5687, 32;
sub.s32 %r5688, %r5687, %r1511;
shr.u32 %r5689, %r8273, %r5688;
shl.b32 %r5690, %r8272, %r1511;
add.s32 %r8272, %r5689, %r5690;
mul.wide.s32 %rd1811, %r1508, 4;
add.s64 %rd1812, %rd1, %rd1811;
ld.local.u32 %r5691, [%rd1812];
shr.u32 %r5692, %r5691, %r5688;
shl.b32 %r5693, %r8273, %r1511;
add.s32 %r8273, %r5692, %r5693;
$L__BB0_1156:
and.b32 %r5694, %r1502, -2147483648;
shr.u32 %r5695, %r8273, 30;
shl.b32 %r5696, %r8272, 2;
or.b32 %r5697, %r5695, %r5696;
shr.u32 %r5698, %r5697, 31;
shr.u32 %r5699, %r8272, 30;
add.s32 %r5700, %r5698, %r5699;
neg.s32 %r5701, %r5700;
setp.eq.s32 %p987, %r5694, 0;
selp.b32 %r8274, %r5700, %r5701, %p987;
setp.ne.s32 %p988, %r5698, 0;
xor.b32 %r5702, %r5694, -2147483648;
selp.b32 %r5703, %r5702, %r5694, %p988;
selp.b32 %r5704, -1, 0, %p988;
xor.b32 %r5705, %r5697, %r5704;
shl.b32 %r5706, %r8273, 2;
xor.b32 %r5707, %r5706, %r5704;
cvt.u64.u32 %rd1813, %r5705;
cvt.u64.u32 %rd1814, %r5707;
bfi.b64 %rd1815, %rd1813, %rd1814, 32, 32;
cvt.rn.f64.s64 %fd153, %rd1815;
mul.f64 %fd154, %fd153, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3963, %fd154;
setp.eq.s32 %p989, %r5703, 0;
neg.f32 %f3964, %f3963;
selp.f32 %f5722, %f3963, %f3964, %p989;
$L__BB0_1158:
and.b32 %r1518, %r8274, 1;
setp.eq.s32 %p990, %r1518, 0;
selp.f32 %f1348, %f5722, 0f3F800000, %p990;
mul.rn.f32 %f1349, %f5722, %f5722;
mov.f32 %f5723, 0fB94D4153;
@%p990 bra $L__BB0_1160;
mov.f32 %f3967, 0fBAB607ED;
mov.f32 %f3968, 0f37CBAC00;
fma.rn.f32 %f5723, %f3968, %f1349, %f3967;
$L__BB0_1160:
selp.f32 %f3969, 0f3C0885E4, 0f3D2AAABB, %p990;
fma.rn.f32 %f3970, %f5723, %f1349, %f3969;
selp.f32 %f3971, 0fBE2AAAA8, 0fBEFFFFFF, %p990;
fma.rn.f32 %f3972, %f3970, %f1349, %f3971;
mov.f32 %f3973, 0f00000000;
fma.rn.f32 %f3974, %f1349, %f1348, %f3973;
fma.rn.f32 %f5281, %f3972, %f3974, %f1348;
and.b32 %r5709, %r8274, 2;
setp.eq.s32 %p992, %r5709, 0;
@%p992 bra $L__BB0_1162;
mov.f32 %f3976, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f3976, %f3973;
$L__BB0_1162:
setp.lt.s32 %p22, %r11, %r1500;
@%p982 bra $L__BB0_1175;
mul.f32 %f3977, %f5402, 0f3F22F983;
cvt.rni.s32.f32 %r8278, %f3977;
cvt.rn.f32.s32 %f3978, %r8278;
mov.f32 %f3979, 0fBFC90FDA;
fma.rn.f32 %f3980, %f3978, %f3979, %f5402;
mov.f32 %f3981, 0fB3A22168;
fma.rn.f32 %f3982, %f3978, %f3981, %f3980;
mov.f32 %f3983, 0fA7C234C5;
fma.rn.f32 %f5726, %f3978, %f3983, %f3982;
abs.f32 %f1357, %f5402;
setp.ltu.f32 %p994, %f1357, 0f47CE4780;
@%p994 bra $L__BB0_1171;
setp.eq.f32 %p995, %f1357, 0f7F800000;
@%p995 bra $L__BB0_1170;
bra.uni $L__BB0_1165;
$L__BB0_1170:
mov.f32 %f3986, 0f00000000;
mul.rn.f32 %f5726, %f5402, %f3986;
mov.u32 %r8278, 0;
bra.uni $L__BB0_1171;
$L__BB0_1165:
mov.b32 %r1520, %f5402;
shr.u32 %r5711, %r1520, 23;
and.b32 %r5712, %r5711, 255;
add.s32 %r1521, %r5712, -128;
shl.b32 %r5713, %r1520, 8;
or.b32 %r1522, %r5713, -2147483648;
shr.u32 %r1523, %r1521, 5;
mov.u64 %rd2659, 0;
mov.u32 %r8275, 0;
mov.u64 %rd1819, __cudart_i2opi_f;
mov.u64 %rd2660, %rd2659;
$L__BB0_1166:
.pragma "nounroll";
shl.b64 %rd1818, %rd2659, 2;
add.s64 %rd1820, %rd1819, %rd1818;
ld.global.nc.u32 %r5714, [%rd1820];
mad.wide.u32 %rd1821, %r5714, %r1522, %rd2660;
shr.u64 %rd2660, %rd1821, 32;
add.s64 %rd1822, %rd1, %rd1818;
st.local.u32 [%rd1822], %rd1821;
add.s32 %r8275, %r8275, 1;
cvt.s64.s32 %rd2659, %r8275;
setp.ne.s32 %p996, %r8275, 6;
@%p996 bra $L__BB0_1166;
st.local.u32 [%rd4], %rd2660;
mov.u32 %r5715, 4;
sub.s32 %r1526, %r5715, %r1523;
mov.u32 %r5716, 6;
sub.s32 %r5717, %r5716, %r1523;
mul.wide.s32 %rd1823, %r5717, 4;
add.s64 %rd1824, %rd1, %rd1823;
ld.local.u32 %r8276, [%rd1824];
ld.local.u32 %r8277, [%rd1824+-4];
and.b32 %r1529, %r1521, 31;
setp.eq.s32 %p997, %r1529, 0;
@%p997 bra $L__BB0_1169;
mov.u32 %r5718, 32;
sub.s32 %r5719, %r5718, %r1529;
shr.u32 %r5720, %r8277, %r5719;
shl.b32 %r5721, %r8276, %r1529;
add.s32 %r8276, %r5720, %r5721;
mul.wide.s32 %rd1825, %r1526, 4;
add.s64 %rd1826, %rd1, %rd1825;
ld.local.u32 %r5722, [%rd1826];
shr.u32 %r5723, %r5722, %r5719;
shl.b32 %r5724, %r8277, %r1529;
add.s32 %r8277, %r5723, %r5724;
$L__BB0_1169:
and.b32 %r5725, %r1520, -2147483648;
shr.u32 %r5726, %r8277, 30;
shl.b32 %r5727, %r8276, 2;
or.b32 %r5728, %r5726, %r5727;
shr.u32 %r5729, %r5728, 31;
shr.u32 %r5730, %r8276, 30;
add.s32 %r5731, %r5729, %r5730;
neg.s32 %r5732, %r5731;
setp.eq.s32 %p998, %r5725, 0;
selp.b32 %r8278, %r5731, %r5732, %p998;
setp.ne.s32 %p999, %r5729, 0;
xor.b32 %r5733, %r5725, -2147483648;
selp.b32 %r5734, %r5733, %r5725, %p999;
selp.b32 %r5735, -1, 0, %p999;
xor.b32 %r5736, %r5728, %r5735;
shl.b32 %r5737, %r8277, 2;
xor.b32 %r5738, %r5737, %r5735;
cvt.u64.u32 %rd1827, %r5736;
cvt.u64.u32 %rd1828, %r5738;
bfi.b64 %rd1829, %rd1827, %rd1828, 32, 32;
cvt.rn.f64.s64 %fd155, %rd1829;
mul.f64 %fd156, %fd155, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f3984, %fd156;
setp.eq.s32 %p1000, %r5734, 0;
neg.f32 %f3985, %f3984;
selp.f32 %f5726, %f3984, %f3985, %p1000;
$L__BB0_1171:
add.s32 %r1536, %r8278, 1;
and.b32 %r1537, %r1536, 1;
setp.eq.s32 %p1001, %r1537, 0;
selp.f32 %f1361, %f5726, 0f3F800000, %p1001;
mul.rn.f32 %f1362, %f5726, %f5726;
mov.f32 %f5727, 0fB94D4153;
@%p1001 bra $L__BB0_1173;
mov.f32 %f3988, 0fBAB607ED;
mov.f32 %f3989, 0f37CBAC00;
fma.rn.f32 %f5727, %f3989, %f1362, %f3988;
$L__BB0_1173:
selp.f32 %f3990, 0f3C0885E4, 0f3D2AAABB, %p1001;
fma.rn.f32 %f3991, %f5727, %f1362, %f3990;
selp.f32 %f3992, 0fBE2AAAA8, 0fBEFFFFFF, %p1001;
fma.rn.f32 %f3993, %f3991, %f1362, %f3992;
mov.f32 %f3994, 0f00000000;
fma.rn.f32 %f3995, %f1362, %f1361, %f3994;
fma.rn.f32 %f5283, %f3993, %f3995, %f1361;
and.b32 %r5740, %r1536, 2;
setp.eq.s32 %p1003, %r5740, 0;
@%p1003 bra $L__BB0_1175;
mov.f32 %f3997, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f3997, %f3994;
$L__BB0_1175:
selp.f32 %f1369, %f5283, %f5284, %p22;
selp.f32 %f1370, %f5281, %f5282, %p22;
@%p982 bra $L__BB0_1177;
add.f32 %f5792, %f1370, %f1369;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1177:
@%p816 bra $L__BB0_1399;
shl.b32 %r5741, %r12, 5;
mov.u32 %r5742, -32;
sub.s32 %r1538, %r5742, %r5741;
setp.ge.s32 %p1007, %r11, %r1538;
@%p1007 bra $L__BB0_1191;
mul.f32 %f4000, %f5409, 0f3F22F983;
cvt.rni.s32.f32 %r8282, %f4000;
cvt.rn.f32.s32 %f4001, %r8282;
mov.f32 %f4002, 0fBFC90FDA;
fma.rn.f32 %f4003, %f4001, %f4002, %f5409;
mov.f32 %f4004, 0fB3A22168;
fma.rn.f32 %f4005, %f4001, %f4004, %f4003;
mov.f32 %f4006, 0fA7C234C5;
fma.rn.f32 %f5735, %f4001, %f4006, %f4005;
abs.f32 %f1378, %f5409;
setp.ltu.f32 %p1008, %f1378, 0f47CE4780;
@%p1008 bra $L__BB0_1187;
setp.eq.f32 %p1009, %f1378, 0f7F800000;
@%p1009 bra $L__BB0_1186;
bra.uni $L__BB0_1181;
$L__BB0_1186:
mov.f32 %f4009, 0f00000000;
mul.rn.f32 %f5735, %f5409, %f4009;
mov.u32 %r8282, 0;
bra.uni $L__BB0_1187;
$L__BB0_1181:
mov.b32 %r1540, %f5409;
shr.u32 %r5744, %r1540, 23;
and.b32 %r5745, %r5744, 255;
add.s32 %r1541, %r5745, -128;
shl.b32 %r5746, %r1540, 8;
or.b32 %r1542, %r5746, -2147483648;
shr.u32 %r1543, %r1541, 5;
mov.u64 %rd2661, 0;
mov.u32 %r8279, 0;
mov.u64 %rd1833, __cudart_i2opi_f;
mov.u64 %rd2662, %rd2661;
$L__BB0_1182:
.pragma "nounroll";
shl.b64 %rd1832, %rd2661, 2;
add.s64 %rd1834, %rd1833, %rd1832;
ld.global.nc.u32 %r5747, [%rd1834];
mad.wide.u32 %rd1835, %r5747, %r1542, %rd2662;
shr.u64 %rd2662, %rd1835, 32;
add.s64 %rd1836, %rd1, %rd1832;
st.local.u32 [%rd1836], %rd1835;
add.s32 %r8279, %r8279, 1;
cvt.s64.s32 %rd2661, %r8279;
setp.ne.s32 %p1010, %r8279, 6;
@%p1010 bra $L__BB0_1182;
st.local.u32 [%rd4], %rd2662;
mov.u32 %r5748, 4;
sub.s32 %r1546, %r5748, %r1543;
mov.u32 %r5749, 6;
sub.s32 %r5750, %r5749, %r1543;
mul.wide.s32 %rd1837, %r5750, 4;
add.s64 %rd1838, %rd1, %rd1837;
ld.local.u32 %r8280, [%rd1838];
ld.local.u32 %r8281, [%rd1838+-4];
and.b32 %r1549, %r1541, 31;
setp.eq.s32 %p1011, %r1549, 0;
@%p1011 bra $L__BB0_1185;
mov.u32 %r5751, 32;
sub.s32 %r5752, %r5751, %r1549;
shr.u32 %r5753, %r8281, %r5752;
shl.b32 %r5754, %r8280, %r1549;
add.s32 %r8280, %r5753, %r5754;
mul.wide.s32 %rd1839, %r1546, 4;
add.s64 %rd1840, %rd1, %rd1839;
ld.local.u32 %r5755, [%rd1840];
shr.u32 %r5756, %r5755, %r5752;
shl.b32 %r5757, %r8281, %r1549;
add.s32 %r8281, %r5756, %r5757;
$L__BB0_1185:
and.b32 %r5758, %r1540, -2147483648;
shr.u32 %r5759, %r8281, 30;
shl.b32 %r5760, %r8280, 2;
or.b32 %r5761, %r5759, %r5760;
shr.u32 %r5762, %r5761, 31;
shr.u32 %r5763, %r8280, 30;
add.s32 %r5764, %r5762, %r5763;
neg.s32 %r5765, %r5764;
setp.eq.s32 %p1012, %r5758, 0;
selp.b32 %r8282, %r5764, %r5765, %p1012;
setp.ne.s32 %p1013, %r5762, 0;
xor.b32 %r5766, %r5758, -2147483648;
selp.b32 %r5767, %r5766, %r5758, %p1013;
selp.b32 %r5768, -1, 0, %p1013;
xor.b32 %r5769, %r5761, %r5768;
shl.b32 %r5770, %r8281, 2;
xor.b32 %r5771, %r5770, %r5768;
cvt.u64.u32 %rd1841, %r5769;
cvt.u64.u32 %rd1842, %r5771;
bfi.b64 %rd1843, %rd1841, %rd1842, 32, 32;
cvt.rn.f64.s64 %fd157, %rd1843;
mul.f64 %fd158, %fd157, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4007, %fd158;
setp.eq.s32 %p1014, %r5767, 0;
neg.f32 %f4008, %f4007;
selp.f32 %f5735, %f4007, %f4008, %p1014;
$L__BB0_1187:
and.b32 %r1556, %r8282, 1;
setp.eq.s32 %p1015, %r1556, 0;
selp.f32 %f1382, %f5735, 0f3F800000, %p1015;
mul.rn.f32 %f1383, %f5735, %f5735;
mov.f32 %f5736, 0fB94D4153;
@%p1015 bra $L__BB0_1189;
mov.f32 %f4011, 0fBAB607ED;
mov.f32 %f4012, 0f37CBAC00;
fma.rn.f32 %f5736, %f4012, %f1383, %f4011;
$L__BB0_1189:
selp.f32 %f4013, 0f3C0885E4, 0f3D2AAABB, %p1015;
fma.rn.f32 %f4014, %f5736, %f1383, %f4013;
selp.f32 %f4015, 0fBE2AAAA8, 0fBEFFFFFF, %p1015;
fma.rn.f32 %f4016, %f4014, %f1383, %f4015;
mov.f32 %f4017, 0f00000000;
fma.rn.f32 %f4018, %f1383, %f1382, %f4017;
fma.rn.f32 %f5281, %f4016, %f4018, %f1382;
and.b32 %r5773, %r8282, 2;
setp.eq.s32 %p1017, %r5773, 0;
@%p1017 bra $L__BB0_1191;
mov.f32 %f4020, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f4020, %f4017;
$L__BB0_1191:
setp.lt.s32 %p23, %r11, %r1538;
@%p1007 bra $L__BB0_1204;
mul.f32 %f4021, %f5401, 0f3F22F983;
cvt.rni.s32.f32 %r8286, %f4021;
cvt.rn.f32.s32 %f4022, %r8286;
mov.f32 %f4023, 0fBFC90FDA;
fma.rn.f32 %f4024, %f4022, %f4023, %f5401;
mov.f32 %f4025, 0fB3A22168;
fma.rn.f32 %f4026, %f4022, %f4025, %f4024;
mov.f32 %f4027, 0fA7C234C5;
fma.rn.f32 %f5739, %f4022, %f4027, %f4026;
abs.f32 %f1391, %f5401;
setp.ltu.f32 %p1019, %f1391, 0f47CE4780;
@%p1019 bra $L__BB0_1200;
setp.eq.f32 %p1020, %f1391, 0f7F800000;
@%p1020 bra $L__BB0_1199;
bra.uni $L__BB0_1194;
$L__BB0_1199:
mov.f32 %f4030, 0f00000000;
mul.rn.f32 %f5739, %f5401, %f4030;
mov.u32 %r8286, 0;
bra.uni $L__BB0_1200;
$L__BB0_1194:
mov.b32 %r1558, %f5401;
shr.u32 %r5775, %r1558, 23;
and.b32 %r5776, %r5775, 255;
add.s32 %r1559, %r5776, -128;
shl.b32 %r5777, %r1558, 8;
or.b32 %r1560, %r5777, -2147483648;
shr.u32 %r1561, %r1559, 5;
mov.u64 %rd2663, 0;
mov.u32 %r8283, 0;
mov.u64 %rd1847, __cudart_i2opi_f;
mov.u64 %rd2664, %rd2663;
$L__BB0_1195:
.pragma "nounroll";
shl.b64 %rd1846, %rd2663, 2;
add.s64 %rd1848, %rd1847, %rd1846;
ld.global.nc.u32 %r5778, [%rd1848];
mad.wide.u32 %rd1849, %r5778, %r1560, %rd2664;
shr.u64 %rd2664, %rd1849, 32;
add.s64 %rd1850, %rd1, %rd1846;
st.local.u32 [%rd1850], %rd1849;
add.s32 %r8283, %r8283, 1;
cvt.s64.s32 %rd2663, %r8283;
setp.ne.s32 %p1021, %r8283, 6;
@%p1021 bra $L__BB0_1195;
st.local.u32 [%rd4], %rd2664;
mov.u32 %r5779, 4;
sub.s32 %r1564, %r5779, %r1561;
mov.u32 %r5780, 6;
sub.s32 %r5781, %r5780, %r1561;
mul.wide.s32 %rd1851, %r5781, 4;
add.s64 %rd1852, %rd1, %rd1851;
ld.local.u32 %r8284, [%rd1852];
ld.local.u32 %r8285, [%rd1852+-4];
and.b32 %r1567, %r1559, 31;
setp.eq.s32 %p1022, %r1567, 0;
@%p1022 bra $L__BB0_1198;
mov.u32 %r5782, 32;
sub.s32 %r5783, %r5782, %r1567;
shr.u32 %r5784, %r8285, %r5783;
shl.b32 %r5785, %r8284, %r1567;
add.s32 %r8284, %r5784, %r5785;
mul.wide.s32 %rd1853, %r1564, 4;
add.s64 %rd1854, %rd1, %rd1853;
ld.local.u32 %r5786, [%rd1854];
shr.u32 %r5787, %r5786, %r5783;
shl.b32 %r5788, %r8285, %r1567;
add.s32 %r8285, %r5787, %r5788;
$L__BB0_1198:
and.b32 %r5789, %r1558, -2147483648;
shr.u32 %r5790, %r8285, 30;
shl.b32 %r5791, %r8284, 2;
or.b32 %r5792, %r5790, %r5791;
shr.u32 %r5793, %r5792, 31;
shr.u32 %r5794, %r8284, 30;
add.s32 %r5795, %r5793, %r5794;
neg.s32 %r5796, %r5795;
setp.eq.s32 %p1023, %r5789, 0;
selp.b32 %r8286, %r5795, %r5796, %p1023;
setp.ne.s32 %p1024, %r5793, 0;
xor.b32 %r5797, %r5789, -2147483648;
selp.b32 %r5798, %r5797, %r5789, %p1024;
selp.b32 %r5799, -1, 0, %p1024;
xor.b32 %r5800, %r5792, %r5799;
shl.b32 %r5801, %r8285, 2;
xor.b32 %r5802, %r5801, %r5799;
cvt.u64.u32 %rd1855, %r5800;
cvt.u64.u32 %rd1856, %r5802;
bfi.b64 %rd1857, %rd1855, %rd1856, 32, 32;
cvt.rn.f64.s64 %fd159, %rd1857;
mul.f64 %fd160, %fd159, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4028, %fd160;
setp.eq.s32 %p1025, %r5798, 0;
neg.f32 %f4029, %f4028;
selp.f32 %f5739, %f4028, %f4029, %p1025;
$L__BB0_1200:
add.s32 %r1574, %r8286, 1;
and.b32 %r1575, %r1574, 1;
setp.eq.s32 %p1026, %r1575, 0;
selp.f32 %f1395, %f5739, 0f3F800000, %p1026;
mul.rn.f32 %f1396, %f5739, %f5739;
mov.f32 %f5740, 0fB94D4153;
@%p1026 bra $L__BB0_1202;
mov.f32 %f4032, 0fBAB607ED;
mov.f32 %f4033, 0f37CBAC00;
fma.rn.f32 %f5740, %f4033, %f1396, %f4032;
$L__BB0_1202:
selp.f32 %f4034, 0f3C0885E4, 0f3D2AAABB, %p1026;
fma.rn.f32 %f4035, %f5740, %f1396, %f4034;
selp.f32 %f4036, 0fBE2AAAA8, 0fBEFFFFFF, %p1026;
fma.rn.f32 %f4037, %f4035, %f1396, %f4036;
mov.f32 %f4038, 0f00000000;
fma.rn.f32 %f4039, %f1396, %f1395, %f4038;
fma.rn.f32 %f5283, %f4037, %f4039, %f1395;
and.b32 %r5804, %r1574, 2;
setp.eq.s32 %p1028, %r5804, 0;
@%p1028 bra $L__BB0_1204;
mov.f32 %f4041, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f4041, %f4038;
$L__BB0_1204:
selp.f32 %f1403, %f5283, %f5284, %p23;
selp.f32 %f1404, %f5281, %f5282, %p23;
@%p1007 bra $L__BB0_1399;
add.f32 %f5791, %f1404, %f1403;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1399:
setp.lt.s32 %p1191, %r12, 0;
and.pred %p1193, %p33, %p1191;
@%p1193 bra $L__BB0_1672;
bra.uni $L__BB0_1400;
$L__BB0_1672:
mov.u32 %r7785, %ctaid.x;
shl.b32 %r7070, %r12, 5;
add.s32 %r7071, %r7070, %r1;
mul.hi.s32 %r7072, %r7071, -1840700269;
add.s32 %r7073, %r7072, %r7071;
shr.u32 %r7074, %r7073, 31;
shr.s32 %r7075, %r7073, 2;
add.s32 %r7076, %r7075, %r7074;
mul.lo.s32 %r7077, %r7076, %r2589;
shl.b32 %r7078, %r2586, 1;
add.s32 %r7079, %r7078, %r14;
shl.b32 %r7080, %r2587, 1;
add.s32 %r7081, %r7079, %r7080;
add.s32 %r7082, %r7081, %r7077;
mul.lo.s32 %r7083, %r7076, 7;
sub.s32 %r7084, %r7071, %r7083;
mul.lo.s32 %r7085, %r7084, %r2590;
add.s32 %r7086, %r7082, %r7085;
mul.wide.s32 %rd2303, %r7086, 4;
add.s64 %rd2304, %rd3, %rd2303;
ld.global.f32 %f1957, [%rd2304];
add.s32 %r7087, %r7071, 32;
mul.hi.s32 %r7088, %r7087, -1840700269;
add.s32 %r7089, %r7088, %r7087;
shr.u32 %r7090, %r7089, 31;
shr.s32 %r7091, %r7089, 2;
add.s32 %r7092, %r7091, %r7090;
mul.lo.s32 %r7093, %r7092, %r2589;
add.s32 %r7094, %r7081, %r7093;
mul.lo.s32 %r7095, %r7092, 7;
sub.s32 %r7096, %r7087, %r7095;
mul.lo.s32 %r7097, %r7096, %r2590;
add.s32 %r7098, %r7094, %r7097;
mul.wide.s32 %rd2305, %r7098, 4;
add.s64 %rd2306, %rd3, %rd2305;
ld.global.f32 %f1958, [%rd2306];
mul.wide.s32 %rd2307, %r2587, 4;
add.s64 %rd2308, %rd2304, %rd2307;
ld.global.f32 %f1959, [%rd2308];
add.s64 %rd2309, %rd2306, %rd2307;
ld.global.f32 %f1960, [%rd2309];
add.s64 %rd2310, %rd2308, %rd2307;
ld.global.f32 %f1961, [%rd2310];
add.s64 %rd2311, %rd2309, %rd2307;
ld.global.f32 %f1962, [%rd2311];
mad.lo.s32 %r7099, %r2586, 3, %r14;
add.s32 %r7100, %r7099, %r7077;
add.s32 %r7101, %r7100, %r7085;
mul.wide.s32 %rd2312, %r7101, 4;
add.s64 %rd2313, %rd3, %rd2312;
ld.global.f32 %f1963, [%rd2313];
add.s32 %r7102, %r7099, %r7093;
add.s32 %r7103, %r7102, %r7097;
mul.wide.s32 %rd2314, %r7103, 4;
add.s64 %rd2315, %rd3, %rd2314;
ld.global.f32 %f1964, [%rd2315];
mul.hi.s32 %r7105, %r7071, 954437177;
shr.u32 %r7106, %r7105, 31;
shr.s32 %r7107, %r7105, 1;
add.s32 %r7108, %r7107, %r7106;
mul.lo.s32 %r7109, %r7108, %r2579;
shl.b32 %r7110, %r2576, 2;
mad.lo.s32 %r7111, %r2578, %r7785, %r7110;
add.s32 %r7112, %r7111, %r7109;
mul.lo.s32 %r7113, %r7108, 9;
sub.s32 %r7114, %r7071, %r7113;
mul.lo.s32 %r7115, %r7114, %r2580;
add.s32 %r7116, %r7112, %r7115;
mul.wide.s32 %rd2316, %r7116, 4;
add.s64 %rd2317, %rd2, %rd2316;
ld.global.f32 %f1965, [%rd2317];
mul.hi.s32 %r7117, %r7087, 954437177;
shr.u32 %r7118, %r7117, 31;
shr.s32 %r7119, %r7117, 1;
add.s32 %r7120, %r7119, %r7118;
mul.lo.s32 %r7121, %r7120, %r2579;
add.s32 %r7122, %r7111, %r7121;
mul.lo.s32 %r7123, %r7120, 9;
sub.s32 %r7124, %r7087, %r7123;
mul.lo.s32 %r7125, %r7124, %r2580;
add.s32 %r7126, %r7122, %r7125;
mul.wide.s32 %rd2318, %r7126, 4;
add.s64 %rd2319, %rd2, %rd2318;
ld.global.f32 %f1966, [%rd2319];
mul.wide.s32 %rd2320, %r2577, 4;
add.s64 %rd2321, %rd2317, %rd2320;
ld.global.f32 %f1967, [%rd2321];
add.s64 %rd2322, %rd2319, %rd2320;
ld.global.f32 %f1968, [%rd2322];
add.s64 %rd2323, %rd2321, %rd2320;
ld.global.f32 %f1969, [%rd2323];
add.s64 %rd2324, %rd2322, %rd2320;
ld.global.f32 %f1970, [%rd2324];
add.s32 %r7127, %r7111, %r2576;
add.s32 %r7128, %r7127, %r7109;
add.s32 %r7129, %r7128, %r7115;
mul.wide.s32 %rd2325, %r7129, 4;
add.s64 %rd2326, %rd2, %rd2325;
ld.global.f32 %f1971, [%rd2326];
add.s32 %r7130, %r7127, %r7121;
add.s32 %r7131, %r7130, %r7125;
mul.wide.s32 %rd2327, %r7131, 4;
add.s64 %rd2328, %rd2, %rd2327;
ld.global.f32 %f1972, [%rd2328];
mul.f32 %f4731, %f1965, 0f3F22F983;
cvt.rni.s32.f32 %r8418, %f4731;
cvt.rn.f32.s32 %f4732, %r8418;
mov.f32 %f4733, 0fBFC90FDA;
fma.rn.f32 %f4734, %f4732, %f4733, %f1965;
mov.f32 %f4735, 0fB3A22168;
fma.rn.f32 %f4736, %f4732, %f4735, %f4734;
mov.f32 %f4737, 0fA7C234C5;
fma.rn.f32 %f5942, %f4732, %f4737, %f4736;
abs.f32 %f1974, %f1965;
setp.ltu.f32 %p1419, %f1974, 0f47CE4780;
@%p1419 bra $L__BB0_1680;
setp.eq.f32 %p1420, %f1974, 0f7F800000;
@%p1420 bra $L__BB0_1679;
bra.uni $L__BB0_1674;
$L__BB0_1679:
mov.f32 %f4740, 0f00000000;
mul.rn.f32 %f5942, %f1965, %f4740;
mov.u32 %r8418, 0;
bra.uni $L__BB0_1680;
$L__BB0_1400:
add.s32 %r1872, %r12, 12;
setp.gt.s32 %p1194, %r1872, 14;
shl.b32 %r6373, %r2586, 1;
add.s32 %r6374, %r6373, %r14;
shl.b32 %r6375, %r2587, 1;
add.s32 %r1873, %r6374, %r6375;
@%p1194 bra $L__BB0_1405;
shl.b32 %r1874, %r12, 5;
neg.s32 %r6376, %r1874;
setp.ge.s32 %p1195, %r11, %r6376;
@%p1195 bra $L__BB0_1403;
add.s32 %r6377, %r1874, %r1;
mul.hi.s32 %r6378, %r6377, -1840700269;
add.s32 %r6379, %r6378, %r6377;
shr.u32 %r6380, %r6379, 31;
shr.s32 %r6381, %r6379, 2;
add.s32 %r6382, %r6381, %r6380;
mad.lo.s32 %r6383, %r6382, %r2589, %r1873;
mul.lo.s32 %r6384, %r6382, 7;
sub.s32 %r6385, %r6377, %r6384;
mad.lo.s32 %r6386, %r6385, %r2590, %r6383;
mul.wide.s32 %rd2111, %r6386, 4;
add.s64 %rd2112, %rd3, %rd2111;
ld.global.f32 %f5607, [%rd2112];
$L__BB0_1403:
mov.u32 %r6387, -32;
sub.s32 %r6388, %r6387, %r1874;
setp.ge.s32 %p1196, %r11, %r6388;
@%p1196 bra $L__BB0_1405;
add.s32 %r6389, %r1874, %r1;
add.s32 %r6390, %r6389, 32;
mul.hi.s32 %r6391, %r6390, -1840700269;
add.s32 %r6392, %r6391, %r6390;
shr.u32 %r6393, %r6392, 31;
shr.s32 %r6394, %r6392, 2;
add.s32 %r6395, %r6394, %r6393;
mad.lo.s32 %r6396, %r6395, %r2589, %r1873;
mul.lo.s32 %r6397, %r6395, 7;
sub.s32 %r6398, %r6390, %r6397;
mad.lo.s32 %r6399, %r6398, %r2590, %r6396;
mul.wide.s32 %rd2113, %r6399, 4;
add.s64 %rd2114, %rd3, %rd2113;
ld.global.f32 %f5606, [%rd2114];
$L__BB0_1405:
add.s32 %r1875, %r12, 13;
setp.gt.s32 %p1197, %r1875, 14;
add.s32 %r1876, %r1873, %r2587;
@%p1197 bra $L__BB0_1410;
shl.b32 %r1877, %r12, 5;
neg.s32 %r6400, %r1877;
setp.ge.s32 %p1198, %r11, %r6400;
@%p1198 bra $L__BB0_1408;
add.s32 %r6401, %r1877, %r1;
mul.hi.s32 %r6402, %r6401, -1840700269;
add.s32 %r6403, %r6402, %r6401;
shr.u32 %r6404, %r6403, 31;
shr.s32 %r6405, %r6403, 2;
add.s32 %r6406, %r6405, %r6404;
mad.lo.s32 %r6407, %r6406, %r2589, %r1876;
mul.lo.s32 %r6408, %r6406, 7;
sub.s32 %r6409, %r6401, %r6408;
mad.lo.s32 %r6410, %r6409, %r2590, %r6407;
mul.wide.s32 %rd2115, %r6410, 4;
add.s64 %rd2116, %rd3, %rd2115;
ld.global.f32 %f5406, [%rd2116];
$L__BB0_1408:
mov.u32 %r6411, -32;
sub.s32 %r6412, %r6411, %r1877;
setp.ge.s32 %p1199, %r11, %r6412;
@%p1199 bra $L__BB0_1410;
add.s32 %r6413, %r1877, %r1;
add.s32 %r6414, %r6413, 32;
mul.hi.s32 %r6415, %r6414, -1840700269;
add.s32 %r6416, %r6415, %r6414;
shr.u32 %r6417, %r6416, 31;
shr.s32 %r6418, %r6416, 2;
add.s32 %r6419, %r6418, %r6417;
mad.lo.s32 %r6420, %r6419, %r2589, %r1876;
mul.lo.s32 %r6421, %r6419, 7;
sub.s32 %r6422, %r6414, %r6421;
mad.lo.s32 %r6423, %r6422, %r2590, %r6420;
mul.wide.s32 %rd2117, %r6423, 4;
add.s64 %rd2118, %rd3, %rd2117;
ld.global.f32 %f5405, [%rd2118];
$L__BB0_1410:
add.s32 %r1878, %r12, 14;
setp.gt.s32 %p1200, %r1878, 14;
add.s32 %r1879, %r1876, %r2587;
@%p1200 bra $L__BB0_1415;
shl.b32 %r1880, %r12, 5;
neg.s32 %r6424, %r1880;
setp.ge.s32 %p1201, %r11, %r6424;
@%p1201 bra $L__BB0_1413;
add.s32 %r6425, %r1880, %r1;
mul.hi.s32 %r6426, %r6425, -1840700269;
add.s32 %r6427, %r6426, %r6425;
shr.u32 %r6428, %r6427, 31;
shr.s32 %r6429, %r6427, 2;
add.s32 %r6430, %r6429, %r6428;
mad.lo.s32 %r6431, %r6430, %r2589, %r1879;
mul.lo.s32 %r6432, %r6430, 7;
sub.s32 %r6433, %r6425, %r6432;
mad.lo.s32 %r6434, %r6433, %r2590, %r6431;
mul.wide.s32 %rd2119, %r6434, 4;
add.s64 %rd2120, %rd3, %rd2119;
ld.global.f32 %f5404, [%rd2120];
$L__BB0_1413:
mov.u32 %r6435, -32;
sub.s32 %r6436, %r6435, %r1880;
setp.ge.s32 %p1202, %r11, %r6436;
@%p1202 bra $L__BB0_1415;
add.s32 %r6437, %r1880, %r1;
add.s32 %r6438, %r6437, 32;
mul.hi.s32 %r6439, %r6438, -1840700269;
add.s32 %r6440, %r6439, %r6438;
shr.u32 %r6441, %r6440, 31;
shr.s32 %r6442, %r6440, 2;
add.s32 %r6443, %r6442, %r6441;
mad.lo.s32 %r6444, %r6443, %r2589, %r1879;
mul.lo.s32 %r6445, %r6443, 7;
sub.s32 %r6446, %r6438, %r6445;
mad.lo.s32 %r6447, %r6446, %r2590, %r6444;
mul.wide.s32 %rd2121, %r6447, 4;
add.s64 %rd2122, %rd3, %rd2121;
ld.global.f32 %f5403, [%rd2122];
$L__BB0_1415:
add.s32 %r1881, %r12, 15;
setp.gt.s32 %p1203, %r1881, 14;
mad.lo.s32 %r1882, %r2586, 3, %r14;
@%p1203 bra $L__BB0_1420;
shl.b32 %r1883, %r12, 5;
neg.s32 %r6448, %r1883;
setp.ge.s32 %p1204, %r11, %r6448;
@%p1204 bra $L__BB0_1418;
add.s32 %r6449, %r1883, %r1;
mul.hi.s32 %r6450, %r6449, -1840700269;
add.s32 %r6451, %r6450, %r6449;
shr.u32 %r6452, %r6451, 31;
shr.s32 %r6453, %r6451, 2;
add.s32 %r6454, %r6453, %r6452;
mad.lo.s32 %r6455, %r6454, %r2589, %r1882;
mul.lo.s32 %r6456, %r6454, 7;
sub.s32 %r6457, %r6449, %r6456;
mad.lo.s32 %r6458, %r6457, %r2590, %r6455;
mul.wide.s32 %rd2123, %r6458, 4;
add.s64 %rd2124, %rd3, %rd2123;
ld.global.f32 %f5402, [%rd2124];
$L__BB0_1418:
mov.u32 %r6459, -32;
sub.s32 %r6460, %r6459, %r1883;
setp.ge.s32 %p1205, %r11, %r6460;
@%p1205 bra $L__BB0_1420;
add.s32 %r6461, %r1883, %r1;
add.s32 %r6462, %r6461, 32;
mul.hi.s32 %r6463, %r6462, -1840700269;
add.s32 %r6464, %r6463, %r6462;
shr.u32 %r6465, %r6464, 31;
shr.s32 %r6466, %r6464, 2;
add.s32 %r6467, %r6466, %r6465;
mad.lo.s32 %r6468, %r6467, %r2589, %r1882;
mul.lo.s32 %r6469, %r6467, 7;
sub.s32 %r6470, %r6462, %r6469;
mad.lo.s32 %r6471, %r6470, %r2590, %r6468;
mul.wide.s32 %rd2125, %r6471, 4;
add.s64 %rd2126, %rd3, %rd2125;
ld.global.f32 %f5401, [%rd2126];
$L__BB0_1420:
mov.u32 %r7783, %ctaid.x;
mul.lo.s32 %r1884, %r2578, %r7783;
shl.b32 %r6473, %r2576, 2;
add.s32 %r1885, %r6473, %r1884;
@%p1194 bra $L__BB0_1425;
shl.b32 %r1886, %r12, 5;
neg.s32 %r6474, %r1886;
setp.ge.s32 %p1207, %r11, %r6474;
@%p1207 bra $L__BB0_1423;
add.s32 %r6475, %r1886, %r1;
mul.hi.s32 %r6476, %r6475, 954437177;
shr.u32 %r6477, %r6476, 31;
shr.s32 %r6478, %r6476, 1;
add.s32 %r6479, %r6478, %r6477;
mad.lo.s32 %r6480, %r6479, %r2579, %r1885;
mul.lo.s32 %r6481, %r6479, 9;
sub.s32 %r6482, %r6475, %r6481;
mad.lo.s32 %r6483, %r6482, %r2580, %r6480;
mul.wide.s32 %rd2127, %r6483, 4;
add.s64 %rd2128, %rd2, %rd2127;
ld.global.f32 %f5416, [%rd2128];
$L__BB0_1423:
mov.u32 %r6484, -32;
sub.s32 %r6485, %r6484, %r1886;
setp.ge.s32 %p1208, %r11, %r6485;
@%p1208 bra $L__BB0_1425;
add.s32 %r6486, %r1886, %r1;
add.s32 %r6487, %r6486, 32;
mul.hi.s32 %r6488, %r6487, 954437177;
shr.u32 %r6489, %r6488, 31;
shr.s32 %r6490, %r6488, 1;
add.s32 %r6491, %r6490, %r6489;
mad.lo.s32 %r6492, %r6491, %r2579, %r1885;
mul.lo.s32 %r6493, %r6491, 9;
sub.s32 %r6494, %r6487, %r6493;
mad.lo.s32 %r6495, %r6494, %r2580, %r6492;
mul.wide.s32 %rd2129, %r6495, 4;
add.s64 %rd2130, %rd2, %rd2129;
ld.global.f32 %f5415, [%rd2130];
$L__BB0_1425:
add.s32 %r1887, %r1885, %r2577;
@%p1197 bra $L__BB0_1430;
shl.b32 %r1888, %r12, 5;
neg.s32 %r6496, %r1888;
setp.ge.s32 %p1210, %r11, %r6496;
@%p1210 bra $L__BB0_1428;
add.s32 %r6497, %r1888, %r1;
mul.hi.s32 %r6498, %r6497, 954437177;
shr.u32 %r6499, %r6498, 31;
shr.s32 %r6500, %r6498, 1;
add.s32 %r6501, %r6500, %r6499;
mad.lo.s32 %r6502, %r6501, %r2579, %r1887;
mul.lo.s32 %r6503, %r6501, 9;
sub.s32 %r6504, %r6497, %r6503;
mad.lo.s32 %r6505, %r6504, %r2580, %r6502;
mul.wide.s32 %rd2131, %r6505, 4;
add.s64 %rd2132, %rd2, %rd2131;
ld.global.f32 %f5414, [%rd2132];
$L__BB0_1428:
mov.u32 %r6506, -32;
sub.s32 %r6507, %r6506, %r1888;
setp.ge.s32 %p1211, %r11, %r6507;
@%p1211 bra $L__BB0_1430;
add.s32 %r6508, %r1888, %r1;
add.s32 %r6509, %r6508, 32;
mul.hi.s32 %r6510, %r6509, 954437177;
shr.u32 %r6511, %r6510, 31;
shr.s32 %r6512, %r6510, 1;
add.s32 %r6513, %r6512, %r6511;
mad.lo.s32 %r6514, %r6513, %r2579, %r1887;
mul.lo.s32 %r6515, %r6513, 9;
sub.s32 %r6516, %r6509, %r6515;
mad.lo.s32 %r6517, %r6516, %r2580, %r6514;
mul.wide.s32 %rd2133, %r6517, 4;
add.s64 %rd2134, %rd2, %rd2133;
ld.global.f32 %f5413, [%rd2134];
$L__BB0_1430:
add.s32 %r1889, %r1887, %r2577;
@%p1200 bra $L__BB0_1435;
shl.b32 %r1890, %r12, 5;
neg.s32 %r6518, %r1890;
setp.ge.s32 %p1213, %r11, %r6518;
@%p1213 bra $L__BB0_1433;
add.s32 %r6519, %r1890, %r1;
mul.hi.s32 %r6520, %r6519, 954437177;
shr.u32 %r6521, %r6520, 31;
shr.s32 %r6522, %r6520, 1;
add.s32 %r6523, %r6522, %r6521;
mad.lo.s32 %r6524, %r6523, %r2579, %r1889;
mul.lo.s32 %r6525, %r6523, 9;
sub.s32 %r6526, %r6519, %r6525;
mad.lo.s32 %r6527, %r6526, %r2580, %r6524;
mul.wide.s32 %rd2135, %r6527, 4;
add.s64 %rd2136, %rd2, %rd2135;
ld.global.f32 %f5412, [%rd2136];
$L__BB0_1433:
mov.u32 %r6528, -32;
sub.s32 %r6529, %r6528, %r1890;
setp.ge.s32 %p1214, %r11, %r6529;
@%p1214 bra $L__BB0_1435;
add.s32 %r6530, %r1890, %r1;
add.s32 %r6531, %r6530, 32;
mul.hi.s32 %r6532, %r6531, 954437177;
shr.u32 %r6533, %r6532, 31;
shr.s32 %r6534, %r6532, 1;
add.s32 %r6535, %r6534, %r6533;
mad.lo.s32 %r6536, %r6535, %r2579, %r1889;
mul.lo.s32 %r6537, %r6535, 9;
sub.s32 %r6538, %r6531, %r6537;
mad.lo.s32 %r6539, %r6538, %r2580, %r6536;
mul.wide.s32 %rd2137, %r6539, 4;
add.s64 %rd2138, %rd2, %rd2137;
ld.global.f32 %f5411, [%rd2138];
$L__BB0_1435:
mad.lo.s32 %r1891, %r2576, 5, %r1884;
@%p1203 bra $L__BB0_1440;
shl.b32 %r1892, %r12, 5;
neg.s32 %r6540, %r1892;
setp.ge.s32 %p1216, %r11, %r6540;
@%p1216 bra $L__BB0_1438;
add.s32 %r6541, %r1892, %r1;
mul.hi.s32 %r6542, %r6541, 954437177;
shr.u32 %r6543, %r6542, 31;
shr.s32 %r6544, %r6542, 1;
add.s32 %r6545, %r6544, %r6543;
mad.lo.s32 %r6546, %r6545, %r2579, %r1891;
mul.lo.s32 %r6547, %r6545, 9;
sub.s32 %r6548, %r6541, %r6547;
mad.lo.s32 %r6549, %r6548, %r2580, %r6546;
mul.wide.s32 %rd2139, %r6549, 4;
add.s64 %rd2140, %rd2, %rd2139;
ld.global.f32 %f5410, [%rd2140];
$L__BB0_1438:
mov.u32 %r6550, -32;
sub.s32 %r6551, %r6550, %r1892;
setp.ge.s32 %p1217, %r11, %r6551;
@%p1217 bra $L__BB0_1440;
add.s32 %r6552, %r1892, %r1;
add.s32 %r6553, %r6552, 32;
mul.hi.s32 %r6554, %r6553, 954437177;
shr.u32 %r6555, %r6554, 31;
shr.s32 %r6556, %r6554, 1;
add.s32 %r6557, %r6556, %r6555;
mad.lo.s32 %r6558, %r6557, %r2579, %r1891;
mul.lo.s32 %r6559, %r6557, 9;
sub.s32 %r6560, %r6553, %r6559;
mad.lo.s32 %r6561, %r6560, %r2580, %r6558;
mul.wide.s32 %rd2141, %r6561, 4;
add.s64 %rd2142, %rd2, %rd2141;
ld.global.f32 %f5409, [%rd2142];
$L__BB0_1440:
@%p1194 bra $L__BB0_1469;
shl.b32 %r6562, %r12, 5;
neg.s32 %r1893, %r6562;
setp.ge.s32 %p1219, %r11, %r1893;
@%p1219 bra $L__BB0_1454;
mul.f32 %f4380, %f5416, 0f3F22F983;
cvt.rni.s32.f32 %r8354, %f4380;
cvt.rn.f32.s32 %f4381, %r8354;
mov.f32 %f4382, 0fBFC90FDA;
fma.rn.f32 %f4383, %f4381, %f4382, %f5416;
mov.f32 %f4384, 0fB3A22168;
fma.rn.f32 %f4385, %f4381, %f4384, %f4383;
mov.f32 %f4386, 0fA7C234C5;
fma.rn.f32 %f5843, %f4381, %f4386, %f4385;
abs.f32 %f1691, %f5416;
setp.ltu.f32 %p1220, %f1691, 0f47CE4780;
@%p1220 bra $L__BB0_1450;
setp.eq.f32 %p1221, %f1691, 0f7F800000;
@%p1221 bra $L__BB0_1449;
bra.uni $L__BB0_1444;
$L__BB0_1449:
mov.f32 %f4389, 0f00000000;
mul.rn.f32 %f5843, %f5416, %f4389;
mov.u32 %r8354, 0;
bra.uni $L__BB0_1450;
$L__BB0_1674:
mov.b32 %r2198, %f1965;
shr.u32 %r7133, %r2198, 23;
and.b32 %r7134, %r7133, 255;
add.s32 %r2199, %r7134, -128;
shl.b32 %r7135, %r2198, 8;
or.b32 %r2200, %r7135, -2147483648;
shr.u32 %r2201, %r2199, 5;
mov.u64 %rd2747, 0;
mov.u32 %r8415, 0;
mov.u64 %rd2745, __cudart_i2opi_f;
mov.u64 %rd2746, %rd1;
$L__BB0_1675:
.pragma "nounroll";
ld.global.nc.u32 %r7136, [%rd2745];
mad.wide.u32 %rd2331, %r7136, %r2200, %rd2747;
shr.u64 %rd2747, %rd2331, 32;
st.local.u32 [%rd2746], %rd2331;
add.s64 %rd2746, %rd2746, 4;
add.s64 %rd2745, %rd2745, 4;
add.s32 %r8415, %r8415, 1;
setp.ne.s32 %p1421, %r8415, 6;
@%p1421 bra $L__BB0_1675;
st.local.u32 [%rd4], %rd2747;
mov.u32 %r7137, 4;
sub.s32 %r2204, %r7137, %r2201;
mov.u32 %r7138, 6;
sub.s32 %r7139, %r7138, %r2201;
mul.wide.s32 %rd2332, %r7139, 4;
add.s64 %rd2333, %rd1, %rd2332;
ld.local.u32 %r8416, [%rd2333];
ld.local.u32 %r8417, [%rd2333+-4];
and.b32 %r2207, %r2199, 31;
setp.eq.s32 %p1422, %r2207, 0;
@%p1422 bra $L__BB0_1678;
mov.u32 %r7140, 32;
sub.s32 %r7141, %r7140, %r2207;
shr.u32 %r7142, %r8417, %r7141;
shl.b32 %r7143, %r8416, %r2207;
add.s32 %r8416, %r7142, %r7143;
mul.wide.s32 %rd2334, %r2204, 4;
add.s64 %rd2335, %rd1, %rd2334;
ld.local.u32 %r7144, [%rd2335];
shr.u32 %r7145, %r7144, %r7141;
shl.b32 %r7146, %r8417, %r2207;
add.s32 %r8417, %r7145, %r7146;
$L__BB0_1678:
and.b32 %r7147, %r2198, -2147483648;
shr.u32 %r7148, %r8417, 30;
shl.b32 %r7149, %r8416, 2;
or.b32 %r7150, %r7148, %r7149;
shr.u32 %r7151, %r7150, 31;
shr.u32 %r7152, %r8416, 30;
add.s32 %r7153, %r7151, %r7152;
neg.s32 %r7154, %r7153;
setp.eq.s32 %p1423, %r7147, 0;
selp.b32 %r8418, %r7153, %r7154, %p1423;
setp.ne.s32 %p1424, %r7151, 0;
xor.b32 %r7155, %r7147, -2147483648;
selp.b32 %r7156, %r7155, %r7147, %p1424;
selp.b32 %r7157, -1, 0, %p1424;
xor.b32 %r7158, %r7150, %r7157;
shl.b32 %r7159, %r8417, 2;
xor.b32 %r7160, %r7159, %r7157;
cvt.u64.u32 %rd2336, %r7158;
cvt.u64.u32 %rd2337, %r7160;
bfi.b64 %rd2338, %rd2336, %rd2337, 32, 32;
cvt.rn.f64.s64 %fd225, %rd2338;
mul.f64 %fd226, %fd225, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4738, %fd226;
setp.eq.s32 %p1425, %r7156, 0;
neg.f32 %f4739, %f4738;
selp.f32 %f5942, %f4738, %f4739, %p1425;
$L__BB0_1680:
and.b32 %r2214, %r8418, 1;
setp.eq.s32 %p1426, %r2214, 0;
selp.f32 %f1978, %f5942, 0f3F800000, %p1426;
mul.rn.f32 %f1979, %f5942, %f5942;
mov.f32 %f5943, 0fB94D4153;
@%p1426 bra $L__BB0_1682;
mov.f32 %f4742, 0fBAB607ED;
mov.f32 %f4743, 0f37CBAC00;
fma.rn.f32 %f5943, %f4743, %f1979, %f4742;
$L__BB0_1682:
selp.f32 %f4744, 0f3C0885E4, 0f3D2AAABB, %p1426;
fma.rn.f32 %f4745, %f5943, %f1979, %f4744;
selp.f32 %f4746, 0fBE2AAAA8, 0fBEFFFFFF, %p1426;
fma.rn.f32 %f4747, %f4745, %f1979, %f4746;
mov.f32 %f4748, 0f00000000;
fma.rn.f32 %f4749, %f1979, %f1978, %f4748;
fma.rn.f32 %f5944, %f4747, %f4749, %f1978;
and.b32 %r7162, %r8418, 2;
setp.eq.s32 %p1428, %r7162, 0;
@%p1428 bra $L__BB0_1684;
mov.f32 %f4751, 0fBF800000;
fma.rn.f32 %f5944, %f5944, %f4751, %f4748;
$L__BB0_1684:
mul.f32 %f4752, %f1957, 0f3F22F983;
cvt.rni.s32.f32 %r8422, %f4752;
cvt.rn.f32.s32 %f4753, %r8422;
mov.f32 %f4754, 0fBFC90FDA;
fma.rn.f32 %f4755, %f4753, %f4754, %f1957;
mov.f32 %f4756, 0fB3A22168;
fma.rn.f32 %f4757, %f4753, %f4756, %f4755;
mov.f32 %f4758, 0fA7C234C5;
fma.rn.f32 %f5945, %f4753, %f4758, %f4757;
abs.f32 %f1986, %f1957;
setp.ltu.f32 %p1429, %f1986, 0f47CE4780;
@%p1429 bra $L__BB0_1692;
setp.eq.f32 %p1430, %f1986, 0f7F800000;
@%p1430 bra $L__BB0_1691;
bra.uni $L__BB0_1686;
$L__BB0_1691:
mov.f32 %f4761, 0f00000000;
mul.rn.f32 %f5945, %f1957, %f4761;
mov.u32 %r8422, 0;
bra.uni $L__BB0_1692;
$L__BB0_1686:
mov.b32 %r2216, %f1957;
shr.u32 %r7164, %r2216, 23;
and.b32 %r7165, %r7164, 255;
add.s32 %r2217, %r7165, -128;
shl.b32 %r7166, %r2216, 8;
or.b32 %r2218, %r7166, -2147483648;
shr.u32 %r2219, %r2217, 5;
mov.u64 %rd2750, 0;
mov.u32 %r8419, 0;
mov.u64 %rd2748, __cudart_i2opi_f;
mov.u64 %rd2749, %rd1;
$L__BB0_1687:
.pragma "nounroll";
ld.global.nc.u32 %r7167, [%rd2748];
mad.wide.u32 %rd2341, %r7167, %r2218, %rd2750;
shr.u64 %rd2750, %rd2341, 32;
st.local.u32 [%rd2749], %rd2341;
add.s64 %rd2749, %rd2749, 4;
add.s64 %rd2748, %rd2748, 4;
add.s32 %r8419, %r8419, 1;
setp.ne.s32 %p1431, %r8419, 6;
@%p1431 bra $L__BB0_1687;
st.local.u32 [%rd4], %rd2750;
mov.u32 %r7168, 4;
sub.s32 %r2222, %r7168, %r2219;
mov.u32 %r7169, 6;
sub.s32 %r7170, %r7169, %r2219;
mul.wide.s32 %rd2342, %r7170, 4;
add.s64 %rd2343, %rd1, %rd2342;
ld.local.u32 %r8420, [%rd2343];
ld.local.u32 %r8421, [%rd2343+-4];
and.b32 %r2225, %r2217, 31;
setp.eq.s32 %p1432, %r2225, 0;
@%p1432 bra $L__BB0_1690;
mov.u32 %r7171, 32;
sub.s32 %r7172, %r7171, %r2225;
shr.u32 %r7173, %r8421, %r7172;
shl.b32 %r7174, %r8420, %r2225;
add.s32 %r8420, %r7173, %r7174;
mul.wide.s32 %rd2344, %r2222, 4;
add.s64 %rd2345, %rd1, %rd2344;
ld.local.u32 %r7175, [%rd2345];
shr.u32 %r7176, %r7175, %r7172;
shl.b32 %r7177, %r8421, %r2225;
add.s32 %r8421, %r7176, %r7177;
$L__BB0_1690:
and.b32 %r7178, %r2216, -2147483648;
shr.u32 %r7179, %r8421, 30;
shl.b32 %r7180, %r8420, 2;
or.b32 %r7181, %r7179, %r7180;
shr.u32 %r7182, %r7181, 31;
shr.u32 %r7183, %r8420, 30;
add.s32 %r7184, %r7182, %r7183;
neg.s32 %r7185, %r7184;
setp.eq.s32 %p1433, %r7178, 0;
selp.b32 %r8422, %r7184, %r7185, %p1433;
setp.ne.s32 %p1434, %r7182, 0;
xor.b32 %r7186, %r7178, -2147483648;
selp.b32 %r7187, %r7186, %r7178, %p1434;
selp.b32 %r7188, -1, 0, %p1434;
xor.b32 %r7189, %r7181, %r7188;
shl.b32 %r7190, %r8421, 2;
xor.b32 %r7191, %r7190, %r7188;
cvt.u64.u32 %rd2346, %r7189;
cvt.u64.u32 %rd2347, %r7191;
bfi.b64 %rd2348, %rd2346, %rd2347, 32, 32;
cvt.rn.f64.s64 %fd227, %rd2348;
mul.f64 %fd228, %fd227, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4759, %fd228;
setp.eq.s32 %p1435, %r7187, 0;
neg.f32 %f4760, %f4759;
selp.f32 %f5945, %f4759, %f4760, %p1435;
$L__BB0_1692:
add.s32 %r2232, %r8422, 1;
and.b32 %r2233, %r2232, 1;
setp.eq.s32 %p1436, %r2233, 0;
selp.f32 %f1990, %f5945, 0f3F800000, %p1436;
mul.rn.f32 %f1991, %f5945, %f5945;
mov.f32 %f5946, 0fB94D4153;
@%p1436 bra $L__BB0_1694;
mov.f32 %f4763, 0fBAB607ED;
mov.f32 %f4764, 0f37CBAC00;
fma.rn.f32 %f5946, %f4764, %f1991, %f4763;
$L__BB0_1694:
selp.f32 %f4765, 0f3C0885E4, 0f3D2AAABB, %p1436;
fma.rn.f32 %f4766, %f5946, %f1991, %f4765;
selp.f32 %f4767, 0fBE2AAAA8, 0fBEFFFFFF, %p1436;
fma.rn.f32 %f4768, %f4766, %f1991, %f4767;
mov.f32 %f4769, 0f00000000;
fma.rn.f32 %f4770, %f1991, %f1990, %f4769;
fma.rn.f32 %f5947, %f4768, %f4770, %f1990;
and.b32 %r7193, %r2232, 2;
setp.eq.s32 %p1438, %r7193, 0;
@%p1438 bra $L__BB0_1696;
mov.f32 %f4772, 0fBF800000;
fma.rn.f32 %f5947, %f5947, %f4772, %f4769;
$L__BB0_1696:
add.f32 %f5997, %f5944, %f5947;
mul.f32 %f4773, %f1966, 0f3F22F983;
cvt.rni.s32.f32 %r8426, %f4773;
cvt.rn.f32.s32 %f4774, %r8426;
mov.f32 %f4775, 0fBFC90FDA;
fma.rn.f32 %f4776, %f4774, %f4775, %f1966;
mov.f32 %f4777, 0fB3A22168;
fma.rn.f32 %f4778, %f4774, %f4777, %f4776;
mov.f32 %f4779, 0fA7C234C5;
fma.rn.f32 %f5948, %f4774, %f4779, %f4778;
abs.f32 %f1999, %f1966;
setp.ltu.f32 %p1439, %f1999, 0f47CE4780;
@%p1439 bra $L__BB0_1704;
setp.eq.f32 %p1440, %f1999, 0f7F800000;
@%p1440 bra $L__BB0_1703;
bra.uni $L__BB0_1698;
$L__BB0_1703:
mov.f32 %f4782, 0f00000000;
mul.rn.f32 %f5948, %f1966, %f4782;
mov.u32 %r8426, 0;
bra.uni $L__BB0_1704;
$L__BB0_1698:
mov.b32 %r2235, %f1966;
shr.u32 %r7195, %r2235, 23;
and.b32 %r7196, %r7195, 255;
add.s32 %r2236, %r7196, -128;
shl.b32 %r7197, %r2235, 8;
or.b32 %r2237, %r7197, -2147483648;
shr.u32 %r2238, %r2236, 5;
mov.u64 %rd2753, 0;
mov.u32 %r8423, 0;
mov.u64 %rd2751, __cudart_i2opi_f;
mov.u64 %rd2752, %rd1;
$L__BB0_1699:
.pragma "nounroll";
ld.global.nc.u32 %r7198, [%rd2751];
mad.wide.u32 %rd2351, %r7198, %r2237, %rd2753;
shr.u64 %rd2753, %rd2351, 32;
st.local.u32 [%rd2752], %rd2351;
add.s64 %rd2752, %rd2752, 4;
add.s64 %rd2751, %rd2751, 4;
add.s32 %r8423, %r8423, 1;
setp.ne.s32 %p1441, %r8423, 6;
@%p1441 bra $L__BB0_1699;
st.local.u32 [%rd4], %rd2753;
mov.u32 %r7199, 4;
sub.s32 %r2241, %r7199, %r2238;
mov.u32 %r7200, 6;
sub.s32 %r7201, %r7200, %r2238;
mul.wide.s32 %rd2352, %r7201, 4;
add.s64 %rd2353, %rd1, %rd2352;
ld.local.u32 %r8424, [%rd2353];
ld.local.u32 %r8425, [%rd2353+-4];
and.b32 %r2244, %r2236, 31;
setp.eq.s32 %p1442, %r2244, 0;
@%p1442 bra $L__BB0_1702;
mov.u32 %r7202, 32;
sub.s32 %r7203, %r7202, %r2244;
shr.u32 %r7204, %r8425, %r7203;
shl.b32 %r7205, %r8424, %r2244;
add.s32 %r8424, %r7204, %r7205;
mul.wide.s32 %rd2354, %r2241, 4;
add.s64 %rd2355, %rd1, %rd2354;
ld.local.u32 %r7206, [%rd2355];
shr.u32 %r7207, %r7206, %r7203;
shl.b32 %r7208, %r8425, %r2244;
add.s32 %r8425, %r7207, %r7208;
$L__BB0_1702:
and.b32 %r7209, %r2235, -2147483648;
shr.u32 %r7210, %r8425, 30;
shl.b32 %r7211, %r8424, 2;
or.b32 %r7212, %r7210, %r7211;
shr.u32 %r7213, %r7212, 31;
shr.u32 %r7214, %r8424, 30;
add.s32 %r7215, %r7213, %r7214;
neg.s32 %r7216, %r7215;
setp.eq.s32 %p1443, %r7209, 0;
selp.b32 %r8426, %r7215, %r7216, %p1443;
setp.ne.s32 %p1444, %r7213, 0;
xor.b32 %r7217, %r7209, -2147483648;
selp.b32 %r7218, %r7217, %r7209, %p1444;
selp.b32 %r7219, -1, 0, %p1444;
xor.b32 %r7220, %r7212, %r7219;
shl.b32 %r7221, %r8425, 2;
xor.b32 %r7222, %r7221, %r7219;
cvt.u64.u32 %rd2356, %r7220;
cvt.u64.u32 %rd2357, %r7222;
bfi.b64 %rd2358, %rd2356, %rd2357, 32, 32;
cvt.rn.f64.s64 %fd229, %rd2358;
mul.f64 %fd230, %fd229, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4780, %fd230;
setp.eq.s32 %p1445, %r7218, 0;
neg.f32 %f4781, %f4780;
selp.f32 %f5948, %f4780, %f4781, %p1445;
$L__BB0_1704:
and.b32 %r2251, %r8426, 1;
setp.eq.s32 %p1446, %r2251, 0;
selp.f32 %f2003, %f5948, 0f3F800000, %p1446;
mul.rn.f32 %f2004, %f5948, %f5948;
mov.f32 %f5949, 0fB94D4153;
@%p1446 bra $L__BB0_1706;
mov.f32 %f4784, 0fBAB607ED;
mov.f32 %f4785, 0f37CBAC00;
fma.rn.f32 %f5949, %f4785, %f2004, %f4784;
$L__BB0_1706:
selp.f32 %f4786, 0f3C0885E4, 0f3D2AAABB, %p1446;
fma.rn.f32 %f4787, %f5949, %f2004, %f4786;
selp.f32 %f4788, 0fBE2AAAA8, 0fBEFFFFFF, %p1446;
fma.rn.f32 %f4789, %f4787, %f2004, %f4788;
mov.f32 %f4790, 0f00000000;
fma.rn.f32 %f4791, %f2004, %f2003, %f4790;
fma.rn.f32 %f5950, %f4789, %f4791, %f2003;
and.b32 %r7224, %r8426, 2;
setp.eq.s32 %p1448, %r7224, 0;
@%p1448 bra $L__BB0_1708;
mov.f32 %f4793, 0fBF800000;
fma.rn.f32 %f5950, %f5950, %f4793, %f4790;
$L__BB0_1708:
mul.f32 %f4794, %f1958, 0f3F22F983;
cvt.rni.s32.f32 %r8430, %f4794;
cvt.rn.f32.s32 %f4795, %r8430;
mov.f32 %f4796, 0fBFC90FDA;
fma.rn.f32 %f4797, %f4795, %f4796, %f1958;
mov.f32 %f4798, 0fB3A22168;
fma.rn.f32 %f4799, %f4795, %f4798, %f4797;
mov.f32 %f4800, 0fA7C234C5;
fma.rn.f32 %f5951, %f4795, %f4800, %f4799;
abs.f32 %f2011, %f1958;
setp.ltu.f32 %p1449, %f2011, 0f47CE4780;
@%p1449 bra $L__BB0_1716;
setp.eq.f32 %p1450, %f2011, 0f7F800000;
@%p1450 bra $L__BB0_1715;
bra.uni $L__BB0_1710;
$L__BB0_1715:
mov.f32 %f4803, 0f00000000;
mul.rn.f32 %f5951, %f1958, %f4803;
mov.u32 %r8430, 0;
bra.uni $L__BB0_1716;
$L__BB0_1710:
mov.b32 %r2253, %f1958;
shr.u32 %r7226, %r2253, 23;
and.b32 %r7227, %r7226, 255;
add.s32 %r2254, %r7227, -128;
shl.b32 %r7228, %r2253, 8;
or.b32 %r2255, %r7228, -2147483648;
shr.u32 %r2256, %r2254, 5;
mov.u64 %rd2756, 0;
mov.u32 %r8427, 0;
mov.u64 %rd2754, __cudart_i2opi_f;
mov.u64 %rd2755, %rd1;
$L__BB0_1711:
.pragma "nounroll";
ld.global.nc.u32 %r7229, [%rd2754];
mad.wide.u32 %rd2361, %r7229, %r2255, %rd2756;
shr.u64 %rd2756, %rd2361, 32;
st.local.u32 [%rd2755], %rd2361;
add.s64 %rd2755, %rd2755, 4;
add.s64 %rd2754, %rd2754, 4;
add.s32 %r8427, %r8427, 1;
setp.ne.s32 %p1451, %r8427, 6;
@%p1451 bra $L__BB0_1711;
st.local.u32 [%rd4], %rd2756;
mov.u32 %r7230, 4;
sub.s32 %r2259, %r7230, %r2256;
mov.u32 %r7231, 6;
sub.s32 %r7232, %r7231, %r2256;
mul.wide.s32 %rd2362, %r7232, 4;
add.s64 %rd2363, %rd1, %rd2362;
ld.local.u32 %r8428, [%rd2363];
ld.local.u32 %r8429, [%rd2363+-4];
and.b32 %r2262, %r2254, 31;
setp.eq.s32 %p1452, %r2262, 0;
@%p1452 bra $L__BB0_1714;
mov.u32 %r7233, 32;
sub.s32 %r7234, %r7233, %r2262;
shr.u32 %r7235, %r8429, %r7234;
shl.b32 %r7236, %r8428, %r2262;
add.s32 %r8428, %r7235, %r7236;
mul.wide.s32 %rd2364, %r2259, 4;
add.s64 %rd2365, %rd1, %rd2364;
ld.local.u32 %r7237, [%rd2365];
shr.u32 %r7238, %r7237, %r7234;
shl.b32 %r7239, %r8429, %r2262;
add.s32 %r8429, %r7238, %r7239;
$L__BB0_1714:
and.b32 %r7240, %r2253, -2147483648;
shr.u32 %r7241, %r8429, 30;
shl.b32 %r7242, %r8428, 2;
or.b32 %r7243, %r7241, %r7242;
shr.u32 %r7244, %r7243, 31;
shr.u32 %r7245, %r8428, 30;
add.s32 %r7246, %r7244, %r7245;
neg.s32 %r7247, %r7246;
setp.eq.s32 %p1453, %r7240, 0;
selp.b32 %r8430, %r7246, %r7247, %p1453;
setp.ne.s32 %p1454, %r7244, 0;
xor.b32 %r7248, %r7240, -2147483648;
selp.b32 %r7249, %r7248, %r7240, %p1454;
selp.b32 %r7250, -1, 0, %p1454;
xor.b32 %r7251, %r7243, %r7250;
shl.b32 %r7252, %r8429, 2;
xor.b32 %r7253, %r7252, %r7250;
cvt.u64.u32 %rd2366, %r7251;
cvt.u64.u32 %rd2367, %r7253;
bfi.b64 %rd2368, %rd2366, %rd2367, 32, 32;
cvt.rn.f64.s64 %fd231, %rd2368;
mul.f64 %fd232, %fd231, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4801, %fd232;
setp.eq.s32 %p1455, %r7249, 0;
neg.f32 %f4802, %f4801;
selp.f32 %f5951, %f4801, %f4802, %p1455;
$L__BB0_1716:
add.s32 %r2269, %r8430, 1;
and.b32 %r2270, %r2269, 1;
setp.eq.s32 %p1456, %r2270, 0;
selp.f32 %f2015, %f5951, 0f3F800000, %p1456;
mul.rn.f32 %f2016, %f5951, %f5951;
mov.f32 %f5952, 0fB94D4153;
@%p1456 bra $L__BB0_1718;
mov.f32 %f4805, 0fBAB607ED;
mov.f32 %f4806, 0f37CBAC00;
fma.rn.f32 %f5952, %f4806, %f2016, %f4805;
$L__BB0_1718:
selp.f32 %f4807, 0f3C0885E4, 0f3D2AAABB, %p1456;
fma.rn.f32 %f4808, %f5952, %f2016, %f4807;
selp.f32 %f4809, 0fBE2AAAA8, 0fBEFFFFFF, %p1456;
fma.rn.f32 %f4810, %f4808, %f2016, %f4809;
mov.f32 %f4811, 0f00000000;
fma.rn.f32 %f4812, %f2016, %f2015, %f4811;
fma.rn.f32 %f5953, %f4810, %f4812, %f2015;
and.b32 %r7255, %r2269, 2;
setp.eq.s32 %p1458, %r7255, 0;
@%p1458 bra $L__BB0_1720;
mov.f32 %f4814, 0fBF800000;
fma.rn.f32 %f5953, %f5953, %f4814, %f4811;
$L__BB0_1720:
add.f32 %f5996, %f5950, %f5953;
mul.f32 %f4815, %f1967, 0f3F22F983;
cvt.rni.s32.f32 %r8434, %f4815;
cvt.rn.f32.s32 %f4816, %r8434;
mov.f32 %f4817, 0fBFC90FDA;
fma.rn.f32 %f4818, %f4816, %f4817, %f1967;
mov.f32 %f4819, 0fB3A22168;
fma.rn.f32 %f4820, %f4816, %f4819, %f4818;
mov.f32 %f4821, 0fA7C234C5;
fma.rn.f32 %f5954, %f4816, %f4821, %f4820;
abs.f32 %f2024, %f1967;
setp.ltu.f32 %p1459, %f2024, 0f47CE4780;
@%p1459 bra $L__BB0_1728;
setp.eq.f32 %p1460, %f2024, 0f7F800000;
@%p1460 bra $L__BB0_1727;
bra.uni $L__BB0_1722;
$L__BB0_1727:
mov.f32 %f4824, 0f00000000;
mul.rn.f32 %f5954, %f1967, %f4824;
mov.u32 %r8434, 0;
bra.uni $L__BB0_1728;
$L__BB0_1722:
mov.b32 %r2272, %f1967;
shr.u32 %r7257, %r2272, 23;
and.b32 %r7258, %r7257, 255;
add.s32 %r2273, %r7258, -128;
shl.b32 %r7259, %r2272, 8;
or.b32 %r2274, %r7259, -2147483648;
shr.u32 %r2275, %r2273, 5;
mov.u64 %rd2759, 0;
mov.u32 %r8431, 0;
mov.u64 %rd2757, __cudart_i2opi_f;
mov.u64 %rd2758, %rd1;
$L__BB0_1723:
.pragma "nounroll";
ld.global.nc.u32 %r7260, [%rd2757];
mad.wide.u32 %rd2371, %r7260, %r2274, %rd2759;
shr.u64 %rd2759, %rd2371, 32;
st.local.u32 [%rd2758], %rd2371;
add.s64 %rd2758, %rd2758, 4;
add.s64 %rd2757, %rd2757, 4;
add.s32 %r8431, %r8431, 1;
setp.ne.s32 %p1461, %r8431, 6;
@%p1461 bra $L__BB0_1723;
st.local.u32 [%rd4], %rd2759;
mov.u32 %r7261, 4;
sub.s32 %r2278, %r7261, %r2275;
mov.u32 %r7262, 6;
sub.s32 %r7263, %r7262, %r2275;
mul.wide.s32 %rd2372, %r7263, 4;
add.s64 %rd2373, %rd1, %rd2372;
ld.local.u32 %r8432, [%rd2373];
ld.local.u32 %r8433, [%rd2373+-4];
and.b32 %r2281, %r2273, 31;
setp.eq.s32 %p1462, %r2281, 0;
@%p1462 bra $L__BB0_1726;
mov.u32 %r7264, 32;
sub.s32 %r7265, %r7264, %r2281;
shr.u32 %r7266, %r8433, %r7265;
shl.b32 %r7267, %r8432, %r2281;
add.s32 %r8432, %r7266, %r7267;
mul.wide.s32 %rd2374, %r2278, 4;
add.s64 %rd2375, %rd1, %rd2374;
ld.local.u32 %r7268, [%rd2375];
shr.u32 %r7269, %r7268, %r7265;
shl.b32 %r7270, %r8433, %r2281;
add.s32 %r8433, %r7269, %r7270;
$L__BB0_1726:
and.b32 %r7271, %r2272, -2147483648;
shr.u32 %r7272, %r8433, 30;
shl.b32 %r7273, %r8432, 2;
or.b32 %r7274, %r7272, %r7273;
shr.u32 %r7275, %r7274, 31;
shr.u32 %r7276, %r8432, 30;
add.s32 %r7277, %r7275, %r7276;
neg.s32 %r7278, %r7277;
setp.eq.s32 %p1463, %r7271, 0;
selp.b32 %r8434, %r7277, %r7278, %p1463;
setp.ne.s32 %p1464, %r7275, 0;
xor.b32 %r7279, %r7271, -2147483648;
selp.b32 %r7280, %r7279, %r7271, %p1464;
selp.b32 %r7281, -1, 0, %p1464;
xor.b32 %r7282, %r7274, %r7281;
shl.b32 %r7283, %r8433, 2;
xor.b32 %r7284, %r7283, %r7281;
cvt.u64.u32 %rd2376, %r7282;
cvt.u64.u32 %rd2377, %r7284;
bfi.b64 %rd2378, %rd2376, %rd2377, 32, 32;
cvt.rn.f64.s64 %fd233, %rd2378;
mul.f64 %fd234, %fd233, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4822, %fd234;
setp.eq.s32 %p1465, %r7280, 0;
neg.f32 %f4823, %f4822;
selp.f32 %f5954, %f4822, %f4823, %p1465;
$L__BB0_1728:
and.b32 %r2288, %r8434, 1;
setp.eq.s32 %p1466, %r2288, 0;
selp.f32 %f2028, %f5954, 0f3F800000, %p1466;
mul.rn.f32 %f2029, %f5954, %f5954;
mov.f32 %f5955, 0fB94D4153;
@%p1466 bra $L__BB0_1730;
mov.f32 %f4826, 0fBAB607ED;
mov.f32 %f4827, 0f37CBAC00;
fma.rn.f32 %f5955, %f4827, %f2029, %f4826;
$L__BB0_1730:
selp.f32 %f4828, 0f3C0885E4, 0f3D2AAABB, %p1466;
fma.rn.f32 %f4829, %f5955, %f2029, %f4828;
selp.f32 %f4830, 0fBE2AAAA8, 0fBEFFFFFF, %p1466;
fma.rn.f32 %f4831, %f4829, %f2029, %f4830;
mov.f32 %f4832, 0f00000000;
fma.rn.f32 %f4833, %f2029, %f2028, %f4832;
fma.rn.f32 %f5956, %f4831, %f4833, %f2028;
and.b32 %r7286, %r8434, 2;
setp.eq.s32 %p1468, %r7286, 0;
@%p1468 bra $L__BB0_1732;
mov.f32 %f4835, 0fBF800000;
fma.rn.f32 %f5956, %f5956, %f4835, %f4832;
$L__BB0_1732:
mul.f32 %f4836, %f1959, 0f3F22F983;
cvt.rni.s32.f32 %r8438, %f4836;
cvt.rn.f32.s32 %f4837, %r8438;
mov.f32 %f4838, 0fBFC90FDA;
fma.rn.f32 %f4839, %f4837, %f4838, %f1959;
mov.f32 %f4840, 0fB3A22168;
fma.rn.f32 %f4841, %f4837, %f4840, %f4839;
mov.f32 %f4842, 0fA7C234C5;
fma.rn.f32 %f5957, %f4837, %f4842, %f4841;
abs.f32 %f2036, %f1959;
setp.ltu.f32 %p1469, %f2036, 0f47CE4780;
@%p1469 bra $L__BB0_1740;
setp.eq.f32 %p1470, %f2036, 0f7F800000;
@%p1470 bra $L__BB0_1739;
bra.uni $L__BB0_1734;
$L__BB0_1739:
mov.f32 %f4845, 0f00000000;
mul.rn.f32 %f5957, %f1959, %f4845;
mov.u32 %r8438, 0;
bra.uni $L__BB0_1740;
$L__BB0_1734:
mov.b32 %r2290, %f1959;
shr.u32 %r7288, %r2290, 23;
and.b32 %r7289, %r7288, 255;
add.s32 %r2291, %r7289, -128;
shl.b32 %r7290, %r2290, 8;
or.b32 %r2292, %r7290, -2147483648;
shr.u32 %r2293, %r2291, 5;
mov.u64 %rd2762, 0;
mov.u32 %r8435, 0;
mov.u64 %rd2760, __cudart_i2opi_f;
mov.u64 %rd2761, %rd1;
$L__BB0_1735:
.pragma "nounroll";
ld.global.nc.u32 %r7291, [%rd2760];
mad.wide.u32 %rd2381, %r7291, %r2292, %rd2762;
shr.u64 %rd2762, %rd2381, 32;
st.local.u32 [%rd2761], %rd2381;
add.s64 %rd2761, %rd2761, 4;
add.s64 %rd2760, %rd2760, 4;
add.s32 %r8435, %r8435, 1;
setp.ne.s32 %p1471, %r8435, 6;
@%p1471 bra $L__BB0_1735;
st.local.u32 [%rd4], %rd2762;
mov.u32 %r7292, 4;
sub.s32 %r2296, %r7292, %r2293;
mov.u32 %r7293, 6;
sub.s32 %r7294, %r7293, %r2293;
mul.wide.s32 %rd2382, %r7294, 4;
add.s64 %rd2383, %rd1, %rd2382;
ld.local.u32 %r8436, [%rd2383];
ld.local.u32 %r8437, [%rd2383+-4];
and.b32 %r2299, %r2291, 31;
setp.eq.s32 %p1472, %r2299, 0;
@%p1472 bra $L__BB0_1738;
mov.u32 %r7295, 32;
sub.s32 %r7296, %r7295, %r2299;
shr.u32 %r7297, %r8437, %r7296;
shl.b32 %r7298, %r8436, %r2299;
add.s32 %r8436, %r7297, %r7298;
mul.wide.s32 %rd2384, %r2296, 4;
add.s64 %rd2385, %rd1, %rd2384;
ld.local.u32 %r7299, [%rd2385];
shr.u32 %r7300, %r7299, %r7296;
shl.b32 %r7301, %r8437, %r2299;
add.s32 %r8437, %r7300, %r7301;
$L__BB0_1738:
and.b32 %r7302, %r2290, -2147483648;
shr.u32 %r7303, %r8437, 30;
shl.b32 %r7304, %r8436, 2;
or.b32 %r7305, %r7303, %r7304;
shr.u32 %r7306, %r7305, 31;
shr.u32 %r7307, %r8436, 30;
add.s32 %r7308, %r7306, %r7307;
neg.s32 %r7309, %r7308;
setp.eq.s32 %p1473, %r7302, 0;
selp.b32 %r8438, %r7308, %r7309, %p1473;
setp.ne.s32 %p1474, %r7306, 0;
xor.b32 %r7310, %r7302, -2147483648;
selp.b32 %r7311, %r7310, %r7302, %p1474;
selp.b32 %r7312, -1, 0, %p1474;
xor.b32 %r7313, %r7305, %r7312;
shl.b32 %r7314, %r8437, 2;
xor.b32 %r7315, %r7314, %r7312;
cvt.u64.u32 %rd2386, %r7313;
cvt.u64.u32 %rd2387, %r7315;
bfi.b64 %rd2388, %rd2386, %rd2387, 32, 32;
cvt.rn.f64.s64 %fd235, %rd2388;
mul.f64 %fd236, %fd235, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4843, %fd236;
setp.eq.s32 %p1475, %r7311, 0;
neg.f32 %f4844, %f4843;
selp.f32 %f5957, %f4843, %f4844, %p1475;
$L__BB0_1740:
add.s32 %r2306, %r8438, 1;
and.b32 %r2307, %r2306, 1;
setp.eq.s32 %p1476, %r2307, 0;
selp.f32 %f2040, %f5957, 0f3F800000, %p1476;
mul.rn.f32 %f2041, %f5957, %f5957;
mov.f32 %f5958, 0fB94D4153;
@%p1476 bra $L__BB0_1742;
mov.f32 %f4847, 0fBAB607ED;
mov.f32 %f4848, 0f37CBAC00;
fma.rn.f32 %f5958, %f4848, %f2041, %f4847;
$L__BB0_1742:
selp.f32 %f4849, 0f3C0885E4, 0f3D2AAABB, %p1476;
fma.rn.f32 %f4850, %f5958, %f2041, %f4849;
selp.f32 %f4851, 0fBE2AAAA8, 0fBEFFFFFF, %p1476;
fma.rn.f32 %f4852, %f4850, %f2041, %f4851;
mov.f32 %f4853, 0f00000000;
fma.rn.f32 %f4854, %f2041, %f2040, %f4853;
fma.rn.f32 %f5959, %f4852, %f4854, %f2040;
and.b32 %r7317, %r2306, 2;
setp.eq.s32 %p1478, %r7317, 0;
@%p1478 bra $L__BB0_1744;
mov.f32 %f4856, 0fBF800000;
fma.rn.f32 %f5959, %f5959, %f4856, %f4853;
$L__BB0_1744:
add.f32 %f5995, %f5956, %f5959;
mul.f32 %f4857, %f1968, 0f3F22F983;
cvt.rni.s32.f32 %r8442, %f4857;
cvt.rn.f32.s32 %f4858, %r8442;
mov.f32 %f4859, 0fBFC90FDA;
fma.rn.f32 %f4860, %f4858, %f4859, %f1968;
mov.f32 %f4861, 0fB3A22168;
fma.rn.f32 %f4862, %f4858, %f4861, %f4860;
mov.f32 %f4863, 0fA7C234C5;
fma.rn.f32 %f5960, %f4858, %f4863, %f4862;
abs.f32 %f2049, %f1968;
setp.ltu.f32 %p1479, %f2049, 0f47CE4780;
@%p1479 bra $L__BB0_1752;
setp.eq.f32 %p1480, %f2049, 0f7F800000;
@%p1480 bra $L__BB0_1751;
bra.uni $L__BB0_1746;
$L__BB0_1751:
mov.f32 %f4866, 0f00000000;
mul.rn.f32 %f5960, %f1968, %f4866;
mov.u32 %r8442, 0;
bra.uni $L__BB0_1752;
$L__BB0_1746:
mov.b32 %r2309, %f1968;
shr.u32 %r7319, %r2309, 23;
and.b32 %r7320, %r7319, 255;
add.s32 %r2310, %r7320, -128;
shl.b32 %r7321, %r2309, 8;
or.b32 %r2311, %r7321, -2147483648;
shr.u32 %r2312, %r2310, 5;
mov.u64 %rd2765, 0;
mov.u32 %r8439, 0;
mov.u64 %rd2763, __cudart_i2opi_f;
mov.u64 %rd2764, %rd1;
$L__BB0_1747:
.pragma "nounroll";
ld.global.nc.u32 %r7322, [%rd2763];
mad.wide.u32 %rd2391, %r7322, %r2311, %rd2765;
shr.u64 %rd2765, %rd2391, 32;
st.local.u32 [%rd2764], %rd2391;
add.s64 %rd2764, %rd2764, 4;
add.s64 %rd2763, %rd2763, 4;
add.s32 %r8439, %r8439, 1;
setp.ne.s32 %p1481, %r8439, 6;
@%p1481 bra $L__BB0_1747;
st.local.u32 [%rd4], %rd2765;
mov.u32 %r7323, 4;
sub.s32 %r2315, %r7323, %r2312;
mov.u32 %r7324, 6;
sub.s32 %r7325, %r7324, %r2312;
mul.wide.s32 %rd2392, %r7325, 4;
add.s64 %rd2393, %rd1, %rd2392;
ld.local.u32 %r8440, [%rd2393];
ld.local.u32 %r8441, [%rd2393+-4];
and.b32 %r2318, %r2310, 31;
setp.eq.s32 %p1482, %r2318, 0;
@%p1482 bra $L__BB0_1750;
mov.u32 %r7326, 32;
sub.s32 %r7327, %r7326, %r2318;
shr.u32 %r7328, %r8441, %r7327;
shl.b32 %r7329, %r8440, %r2318;
add.s32 %r8440, %r7328, %r7329;
mul.wide.s32 %rd2394, %r2315, 4;
add.s64 %rd2395, %rd1, %rd2394;
ld.local.u32 %r7330, [%rd2395];
shr.u32 %r7331, %r7330, %r7327;
shl.b32 %r7332, %r8441, %r2318;
add.s32 %r8441, %r7331, %r7332;
$L__BB0_1750:
and.b32 %r7333, %r2309, -2147483648;
shr.u32 %r7334, %r8441, 30;
shl.b32 %r7335, %r8440, 2;
or.b32 %r7336, %r7334, %r7335;
shr.u32 %r7337, %r7336, 31;
shr.u32 %r7338, %r8440, 30;
add.s32 %r7339, %r7337, %r7338;
neg.s32 %r7340, %r7339;
setp.eq.s32 %p1483, %r7333, 0;
selp.b32 %r8442, %r7339, %r7340, %p1483;
setp.ne.s32 %p1484, %r7337, 0;
xor.b32 %r7341, %r7333, -2147483648;
selp.b32 %r7342, %r7341, %r7333, %p1484;
selp.b32 %r7343, -1, 0, %p1484;
xor.b32 %r7344, %r7336, %r7343;
shl.b32 %r7345, %r8441, 2;
xor.b32 %r7346, %r7345, %r7343;
cvt.u64.u32 %rd2396, %r7344;
cvt.u64.u32 %rd2397, %r7346;
bfi.b64 %rd2398, %rd2396, %rd2397, 32, 32;
cvt.rn.f64.s64 %fd237, %rd2398;
mul.f64 %fd238, %fd237, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4864, %fd238;
setp.eq.s32 %p1485, %r7342, 0;
neg.f32 %f4865, %f4864;
selp.f32 %f5960, %f4864, %f4865, %p1485;
$L__BB0_1752:
and.b32 %r2325, %r8442, 1;
setp.eq.s32 %p1486, %r2325, 0;
selp.f32 %f2053, %f5960, 0f3F800000, %p1486;
mul.rn.f32 %f2054, %f5960, %f5960;
mov.f32 %f5961, 0fB94D4153;
@%p1486 bra $L__BB0_1754;
mov.f32 %f4868, 0fBAB607ED;
mov.f32 %f4869, 0f37CBAC00;
fma.rn.f32 %f5961, %f4869, %f2054, %f4868;
$L__BB0_1754:
selp.f32 %f4870, 0f3C0885E4, 0f3D2AAABB, %p1486;
fma.rn.f32 %f4871, %f5961, %f2054, %f4870;
selp.f32 %f4872, 0fBE2AAAA8, 0fBEFFFFFF, %p1486;
fma.rn.f32 %f4873, %f4871, %f2054, %f4872;
mov.f32 %f4874, 0f00000000;
fma.rn.f32 %f4875, %f2054, %f2053, %f4874;
fma.rn.f32 %f5962, %f4873, %f4875, %f2053;
and.b32 %r7348, %r8442, 2;
setp.eq.s32 %p1488, %r7348, 0;
@%p1488 bra $L__BB0_1756;
mov.f32 %f4877, 0fBF800000;
fma.rn.f32 %f5962, %f5962, %f4877, %f4874;
$L__BB0_1756:
mul.f32 %f4878, %f1960, 0f3F22F983;
cvt.rni.s32.f32 %r8446, %f4878;
cvt.rn.f32.s32 %f4879, %r8446;
mov.f32 %f4880, 0fBFC90FDA;
fma.rn.f32 %f4881, %f4879, %f4880, %f1960;
mov.f32 %f4882, 0fB3A22168;
fma.rn.f32 %f4883, %f4879, %f4882, %f4881;
mov.f32 %f4884, 0fA7C234C5;
fma.rn.f32 %f5963, %f4879, %f4884, %f4883;
abs.f32 %f2061, %f1960;
setp.ltu.f32 %p1489, %f2061, 0f47CE4780;
@%p1489 bra $L__BB0_1764;
setp.eq.f32 %p1490, %f2061, 0f7F800000;
@%p1490 bra $L__BB0_1763;
bra.uni $L__BB0_1758;
$L__BB0_1763:
mov.f32 %f4887, 0f00000000;
mul.rn.f32 %f5963, %f1960, %f4887;
mov.u32 %r8446, 0;
bra.uni $L__BB0_1764;
$L__BB0_1758:
mov.b32 %r2327, %f1960;
shr.u32 %r7350, %r2327, 23;
and.b32 %r7351, %r7350, 255;
add.s32 %r2328, %r7351, -128;
shl.b32 %r7352, %r2327, 8;
or.b32 %r2329, %r7352, -2147483648;
shr.u32 %r2330, %r2328, 5;
mov.u64 %rd2768, 0;
mov.u32 %r8443, 0;
mov.u64 %rd2766, __cudart_i2opi_f;
mov.u64 %rd2767, %rd1;
$L__BB0_1759:
.pragma "nounroll";
ld.global.nc.u32 %r7353, [%rd2766];
mad.wide.u32 %rd2401, %r7353, %r2329, %rd2768;
shr.u64 %rd2768, %rd2401, 32;
st.local.u32 [%rd2767], %rd2401;
add.s64 %rd2767, %rd2767, 4;
add.s64 %rd2766, %rd2766, 4;
add.s32 %r8443, %r8443, 1;
setp.ne.s32 %p1491, %r8443, 6;
@%p1491 bra $L__BB0_1759;
st.local.u32 [%rd4], %rd2768;
mov.u32 %r7354, 4;
sub.s32 %r2333, %r7354, %r2330;
mov.u32 %r7355, 6;
sub.s32 %r7356, %r7355, %r2330;
mul.wide.s32 %rd2402, %r7356, 4;
add.s64 %rd2403, %rd1, %rd2402;
ld.local.u32 %r8444, [%rd2403];
ld.local.u32 %r8445, [%rd2403+-4];
and.b32 %r2336, %r2328, 31;
setp.eq.s32 %p1492, %r2336, 0;
@%p1492 bra $L__BB0_1762;
mov.u32 %r7357, 32;
sub.s32 %r7358, %r7357, %r2336;
shr.u32 %r7359, %r8445, %r7358;
shl.b32 %r7360, %r8444, %r2336;
add.s32 %r8444, %r7359, %r7360;
mul.wide.s32 %rd2404, %r2333, 4;
add.s64 %rd2405, %rd1, %rd2404;
ld.local.u32 %r7361, [%rd2405];
shr.u32 %r7362, %r7361, %r7358;
shl.b32 %r7363, %r8445, %r2336;
add.s32 %r8445, %r7362, %r7363;
$L__BB0_1762:
and.b32 %r7364, %r2327, -2147483648;
shr.u32 %r7365, %r8445, 30;
shl.b32 %r7366, %r8444, 2;
or.b32 %r7367, %r7365, %r7366;
shr.u32 %r7368, %r7367, 31;
shr.u32 %r7369, %r8444, 30;
add.s32 %r7370, %r7368, %r7369;
neg.s32 %r7371, %r7370;
setp.eq.s32 %p1493, %r7364, 0;
selp.b32 %r8446, %r7370, %r7371, %p1493;
setp.ne.s32 %p1494, %r7368, 0;
xor.b32 %r7372, %r7364, -2147483648;
selp.b32 %r7373, %r7372, %r7364, %p1494;
selp.b32 %r7374, -1, 0, %p1494;
xor.b32 %r7375, %r7367, %r7374;
shl.b32 %r7376, %r8445, 2;
xor.b32 %r7377, %r7376, %r7374;
cvt.u64.u32 %rd2406, %r7375;
cvt.u64.u32 %rd2407, %r7377;
bfi.b64 %rd2408, %rd2406, %rd2407, 32, 32;
cvt.rn.f64.s64 %fd239, %rd2408;
mul.f64 %fd240, %fd239, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4885, %fd240;
setp.eq.s32 %p1495, %r7373, 0;
neg.f32 %f4886, %f4885;
selp.f32 %f5963, %f4885, %f4886, %p1495;
$L__BB0_1764:
add.s32 %r2343, %r8446, 1;
and.b32 %r2344, %r2343, 1;
setp.eq.s32 %p1496, %r2344, 0;
selp.f32 %f2065, %f5963, 0f3F800000, %p1496;
mul.rn.f32 %f2066, %f5963, %f5963;
mov.f32 %f5964, 0fB94D4153;
@%p1496 bra $L__BB0_1766;
mov.f32 %f4889, 0fBAB607ED;
mov.f32 %f4890, 0f37CBAC00;
fma.rn.f32 %f5964, %f4890, %f2066, %f4889;
$L__BB0_1766:
selp.f32 %f4891, 0f3C0885E4, 0f3D2AAABB, %p1496;
fma.rn.f32 %f4892, %f5964, %f2066, %f4891;
selp.f32 %f4893, 0fBE2AAAA8, 0fBEFFFFFF, %p1496;
fma.rn.f32 %f4894, %f4892, %f2066, %f4893;
mov.f32 %f4895, 0f00000000;
fma.rn.f32 %f4896, %f2066, %f2065, %f4895;
fma.rn.f32 %f5965, %f4894, %f4896, %f2065;
and.b32 %r7379, %r2343, 2;
setp.eq.s32 %p1498, %r7379, 0;
@%p1498 bra $L__BB0_1768;
mov.f32 %f4898, 0fBF800000;
fma.rn.f32 %f5965, %f5965, %f4898, %f4895;
$L__BB0_1768:
add.f32 %f5994, %f5962, %f5965;
mul.f32 %f4899, %f1969, 0f3F22F983;
cvt.rni.s32.f32 %r8450, %f4899;
cvt.rn.f32.s32 %f4900, %r8450;
mov.f32 %f4901, 0fBFC90FDA;
fma.rn.f32 %f4902, %f4900, %f4901, %f1969;
mov.f32 %f4903, 0fB3A22168;
fma.rn.f32 %f4904, %f4900, %f4903, %f4902;
mov.f32 %f4905, 0fA7C234C5;
fma.rn.f32 %f5966, %f4900, %f4905, %f4904;
abs.f32 %f2074, %f1969;
setp.ltu.f32 %p1499, %f2074, 0f47CE4780;
@%p1499 bra $L__BB0_1776;
setp.eq.f32 %p1500, %f2074, 0f7F800000;
@%p1500 bra $L__BB0_1775;
bra.uni $L__BB0_1770;
$L__BB0_1775:
mov.f32 %f4908, 0f00000000;
mul.rn.f32 %f5966, %f1969, %f4908;
mov.u32 %r8450, 0;
bra.uni $L__BB0_1776;
$L__BB0_1770:
mov.b32 %r2346, %f1969;
shr.u32 %r7381, %r2346, 23;
and.b32 %r7382, %r7381, 255;
add.s32 %r2347, %r7382, -128;
shl.b32 %r7383, %r2346, 8;
or.b32 %r2348, %r7383, -2147483648;
shr.u32 %r2349, %r2347, 5;
mov.u64 %rd2771, 0;
mov.u32 %r8447, 0;
mov.u64 %rd2769, __cudart_i2opi_f;
mov.u64 %rd2770, %rd1;
$L__BB0_1771:
.pragma "nounroll";
ld.global.nc.u32 %r7384, [%rd2769];
mad.wide.u32 %rd2411, %r7384, %r2348, %rd2771;
shr.u64 %rd2771, %rd2411, 32;
st.local.u32 [%rd2770], %rd2411;
add.s64 %rd2770, %rd2770, 4;
add.s64 %rd2769, %rd2769, 4;
add.s32 %r8447, %r8447, 1;
setp.ne.s32 %p1501, %r8447, 6;
@%p1501 bra $L__BB0_1771;
st.local.u32 [%rd4], %rd2771;
mov.u32 %r7385, 4;
sub.s32 %r2352, %r7385, %r2349;
mov.u32 %r7386, 6;
sub.s32 %r7387, %r7386, %r2349;
mul.wide.s32 %rd2412, %r7387, 4;
add.s64 %rd2413, %rd1, %rd2412;
ld.local.u32 %r8448, [%rd2413];
ld.local.u32 %r8449, [%rd2413+-4];
and.b32 %r2355, %r2347, 31;
setp.eq.s32 %p1502, %r2355, 0;
@%p1502 bra $L__BB0_1774;
mov.u32 %r7388, 32;
sub.s32 %r7389, %r7388, %r2355;
shr.u32 %r7390, %r8449, %r7389;
shl.b32 %r7391, %r8448, %r2355;
add.s32 %r8448, %r7390, %r7391;
mul.wide.s32 %rd2414, %r2352, 4;
add.s64 %rd2415, %rd1, %rd2414;
ld.local.u32 %r7392, [%rd2415];
shr.u32 %r7393, %r7392, %r7389;
shl.b32 %r7394, %r8449, %r2355;
add.s32 %r8449, %r7393, %r7394;
$L__BB0_1774:
and.b32 %r7395, %r2346, -2147483648;
shr.u32 %r7396, %r8449, 30;
shl.b32 %r7397, %r8448, 2;
or.b32 %r7398, %r7396, %r7397;
shr.u32 %r7399, %r7398, 31;
shr.u32 %r7400, %r8448, 30;
add.s32 %r7401, %r7399, %r7400;
neg.s32 %r7402, %r7401;
setp.eq.s32 %p1503, %r7395, 0;
selp.b32 %r8450, %r7401, %r7402, %p1503;
setp.ne.s32 %p1504, %r7399, 0;
xor.b32 %r7403, %r7395, -2147483648;
selp.b32 %r7404, %r7403, %r7395, %p1504;
selp.b32 %r7405, -1, 0, %p1504;
xor.b32 %r7406, %r7398, %r7405;
shl.b32 %r7407, %r8449, 2;
xor.b32 %r7408, %r7407, %r7405;
cvt.u64.u32 %rd2416, %r7406;
cvt.u64.u32 %rd2417, %r7408;
bfi.b64 %rd2418, %rd2416, %rd2417, 32, 32;
cvt.rn.f64.s64 %fd241, %rd2418;
mul.f64 %fd242, %fd241, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4906, %fd242;
setp.eq.s32 %p1505, %r7404, 0;
neg.f32 %f4907, %f4906;
selp.f32 %f5966, %f4906, %f4907, %p1505;
$L__BB0_1776:
and.b32 %r2362, %r8450, 1;
setp.eq.s32 %p1506, %r2362, 0;
selp.f32 %f2078, %f5966, 0f3F800000, %p1506;
mul.rn.f32 %f2079, %f5966, %f5966;
mov.f32 %f5967, 0fB94D4153;
@%p1506 bra $L__BB0_1778;
mov.f32 %f4910, 0fBAB607ED;
mov.f32 %f4911, 0f37CBAC00;
fma.rn.f32 %f5967, %f4911, %f2079, %f4910;
$L__BB0_1778:
selp.f32 %f4912, 0f3C0885E4, 0f3D2AAABB, %p1506;
fma.rn.f32 %f4913, %f5967, %f2079, %f4912;
selp.f32 %f4914, 0fBE2AAAA8, 0fBEFFFFFF, %p1506;
fma.rn.f32 %f4915, %f4913, %f2079, %f4914;
mov.f32 %f4916, 0f00000000;
fma.rn.f32 %f4917, %f2079, %f2078, %f4916;
fma.rn.f32 %f5968, %f4915, %f4917, %f2078;
and.b32 %r7410, %r8450, 2;
setp.eq.s32 %p1508, %r7410, 0;
@%p1508 bra $L__BB0_1780;
mov.f32 %f4919, 0fBF800000;
fma.rn.f32 %f5968, %f5968, %f4919, %f4916;
$L__BB0_1780:
mul.f32 %f4920, %f1961, 0f3F22F983;
cvt.rni.s32.f32 %r8454, %f4920;
cvt.rn.f32.s32 %f4921, %r8454;
mov.f32 %f4922, 0fBFC90FDA;
fma.rn.f32 %f4923, %f4921, %f4922, %f1961;
mov.f32 %f4924, 0fB3A22168;
fma.rn.f32 %f4925, %f4921, %f4924, %f4923;
mov.f32 %f4926, 0fA7C234C5;
fma.rn.f32 %f5969, %f4921, %f4926, %f4925;
abs.f32 %f2086, %f1961;
setp.ltu.f32 %p1509, %f2086, 0f47CE4780;
@%p1509 bra $L__BB0_1788;
setp.eq.f32 %p1510, %f2086, 0f7F800000;
@%p1510 bra $L__BB0_1787;
bra.uni $L__BB0_1782;
$L__BB0_1787:
mov.f32 %f4929, 0f00000000;
mul.rn.f32 %f5969, %f1961, %f4929;
mov.u32 %r8454, 0;
bra.uni $L__BB0_1788;
$L__BB0_1782:
mov.b32 %r2364, %f1961;
shr.u32 %r7412, %r2364, 23;
and.b32 %r7413, %r7412, 255;
add.s32 %r2365, %r7413, -128;
shl.b32 %r7414, %r2364, 8;
or.b32 %r2366, %r7414, -2147483648;
shr.u32 %r2367, %r2365, 5;
mov.u64 %rd2774, 0;
mov.u32 %r8451, 0;
mov.u64 %rd2772, __cudart_i2opi_f;
mov.u64 %rd2773, %rd1;
$L__BB0_1783:
.pragma "nounroll";
ld.global.nc.u32 %r7415, [%rd2772];
mad.wide.u32 %rd2421, %r7415, %r2366, %rd2774;
shr.u64 %rd2774, %rd2421, 32;
st.local.u32 [%rd2773], %rd2421;
add.s64 %rd2773, %rd2773, 4;
add.s64 %rd2772, %rd2772, 4;
add.s32 %r8451, %r8451, 1;
setp.ne.s32 %p1511, %r8451, 6;
@%p1511 bra $L__BB0_1783;
st.local.u32 [%rd4], %rd2774;
mov.u32 %r7416, 4;
sub.s32 %r2370, %r7416, %r2367;
mov.u32 %r7417, 6;
sub.s32 %r7418, %r7417, %r2367;
mul.wide.s32 %rd2422, %r7418, 4;
add.s64 %rd2423, %rd1, %rd2422;
ld.local.u32 %r8452, [%rd2423];
ld.local.u32 %r8453, [%rd2423+-4];
and.b32 %r2373, %r2365, 31;
setp.eq.s32 %p1512, %r2373, 0;
@%p1512 bra $L__BB0_1786;
mov.u32 %r7419, 32;
sub.s32 %r7420, %r7419, %r2373;
shr.u32 %r7421, %r8453, %r7420;
shl.b32 %r7422, %r8452, %r2373;
add.s32 %r8452, %r7421, %r7422;
mul.wide.s32 %rd2424, %r2370, 4;
add.s64 %rd2425, %rd1, %rd2424;
ld.local.u32 %r7423, [%rd2425];
shr.u32 %r7424, %r7423, %r7420;
shl.b32 %r7425, %r8453, %r2373;
add.s32 %r8453, %r7424, %r7425;
$L__BB0_1786:
and.b32 %r7426, %r2364, -2147483648;
shr.u32 %r7427, %r8453, 30;
shl.b32 %r7428, %r8452, 2;
or.b32 %r7429, %r7427, %r7428;
shr.u32 %r7430, %r7429, 31;
shr.u32 %r7431, %r8452, 30;
add.s32 %r7432, %r7430, %r7431;
neg.s32 %r7433, %r7432;
setp.eq.s32 %p1513, %r7426, 0;
selp.b32 %r8454, %r7432, %r7433, %p1513;
setp.ne.s32 %p1514, %r7430, 0;
xor.b32 %r7434, %r7426, -2147483648;
selp.b32 %r7435, %r7434, %r7426, %p1514;
selp.b32 %r7436, -1, 0, %p1514;
xor.b32 %r7437, %r7429, %r7436;
shl.b32 %r7438, %r8453, 2;
xor.b32 %r7439, %r7438, %r7436;
cvt.u64.u32 %rd2426, %r7437;
cvt.u64.u32 %rd2427, %r7439;
bfi.b64 %rd2428, %rd2426, %rd2427, 32, 32;
cvt.rn.f64.s64 %fd243, %rd2428;
mul.f64 %fd244, %fd243, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4927, %fd244;
setp.eq.s32 %p1515, %r7435, 0;
neg.f32 %f4928, %f4927;
selp.f32 %f5969, %f4927, %f4928, %p1515;
$L__BB0_1788:
add.s32 %r2380, %r8454, 1;
and.b32 %r2381, %r2380, 1;
setp.eq.s32 %p1516, %r2381, 0;
selp.f32 %f2090, %f5969, 0f3F800000, %p1516;
mul.rn.f32 %f2091, %f5969, %f5969;
mov.f32 %f5970, 0fB94D4153;
@%p1516 bra $L__BB0_1790;
mov.f32 %f4931, 0fBAB607ED;
mov.f32 %f4932, 0f37CBAC00;
fma.rn.f32 %f5970, %f4932, %f2091, %f4931;
$L__BB0_1790:
selp.f32 %f4933, 0f3C0885E4, 0f3D2AAABB, %p1516;
fma.rn.f32 %f4934, %f5970, %f2091, %f4933;
selp.f32 %f4935, 0fBE2AAAA8, 0fBEFFFFFF, %p1516;
fma.rn.f32 %f4936, %f4934, %f2091, %f4935;
mov.f32 %f4937, 0f00000000;
fma.rn.f32 %f4938, %f2091, %f2090, %f4937;
fma.rn.f32 %f5971, %f4936, %f4938, %f2090;
and.b32 %r7441, %r2380, 2;
setp.eq.s32 %p1518, %r7441, 0;
@%p1518 bra $L__BB0_1792;
mov.f32 %f4940, 0fBF800000;
fma.rn.f32 %f5971, %f5971, %f4940, %f4937;
$L__BB0_1792:
add.f32 %f5993, %f5968, %f5971;
mul.f32 %f4941, %f1970, 0f3F22F983;
cvt.rni.s32.f32 %r8458, %f4941;
cvt.rn.f32.s32 %f4942, %r8458;
mov.f32 %f4943, 0fBFC90FDA;
fma.rn.f32 %f4944, %f4942, %f4943, %f1970;
mov.f32 %f4945, 0fB3A22168;
fma.rn.f32 %f4946, %f4942, %f4945, %f4944;
mov.f32 %f4947, 0fA7C234C5;
fma.rn.f32 %f5972, %f4942, %f4947, %f4946;
abs.f32 %f2099, %f1970;
setp.ltu.f32 %p1519, %f2099, 0f47CE4780;
@%p1519 bra $L__BB0_1800;
setp.eq.f32 %p1520, %f2099, 0f7F800000;
@%p1520 bra $L__BB0_1799;
bra.uni $L__BB0_1794;
$L__BB0_1799:
mov.f32 %f4950, 0f00000000;
mul.rn.f32 %f5972, %f1970, %f4950;
mov.u32 %r8458, 0;
bra.uni $L__BB0_1800;
$L__BB0_1794:
mov.b32 %r2383, %f1970;
shr.u32 %r7443, %r2383, 23;
and.b32 %r7444, %r7443, 255;
add.s32 %r2384, %r7444, -128;
shl.b32 %r7445, %r2383, 8;
or.b32 %r2385, %r7445, -2147483648;
shr.u32 %r2386, %r2384, 5;
mov.u64 %rd2777, 0;
mov.u32 %r8455, 0;
mov.u64 %rd2775, __cudart_i2opi_f;
mov.u64 %rd2776, %rd1;
$L__BB0_1795:
.pragma "nounroll";
ld.global.nc.u32 %r7446, [%rd2775];
mad.wide.u32 %rd2431, %r7446, %r2385, %rd2777;
shr.u64 %rd2777, %rd2431, 32;
st.local.u32 [%rd2776], %rd2431;
add.s64 %rd2776, %rd2776, 4;
add.s64 %rd2775, %rd2775, 4;
add.s32 %r8455, %r8455, 1;
setp.ne.s32 %p1521, %r8455, 6;
@%p1521 bra $L__BB0_1795;
st.local.u32 [%rd4], %rd2777;
mov.u32 %r7447, 4;
sub.s32 %r2389, %r7447, %r2386;
mov.u32 %r7448, 6;
sub.s32 %r7449, %r7448, %r2386;
mul.wide.s32 %rd2432, %r7449, 4;
add.s64 %rd2433, %rd1, %rd2432;
ld.local.u32 %r8456, [%rd2433];
ld.local.u32 %r8457, [%rd2433+-4];
and.b32 %r2392, %r2384, 31;
setp.eq.s32 %p1522, %r2392, 0;
@%p1522 bra $L__BB0_1798;
mov.u32 %r7450, 32;
sub.s32 %r7451, %r7450, %r2392;
shr.u32 %r7452, %r8457, %r7451;
shl.b32 %r7453, %r8456, %r2392;
add.s32 %r8456, %r7452, %r7453;
mul.wide.s32 %rd2434, %r2389, 4;
add.s64 %rd2435, %rd1, %rd2434;
ld.local.u32 %r7454, [%rd2435];
shr.u32 %r7455, %r7454, %r7451;
shl.b32 %r7456, %r8457, %r2392;
add.s32 %r8457, %r7455, %r7456;
$L__BB0_1798:
and.b32 %r7457, %r2383, -2147483648;
shr.u32 %r7458, %r8457, 30;
shl.b32 %r7459, %r8456, 2;
or.b32 %r7460, %r7458, %r7459;
shr.u32 %r7461, %r7460, 31;
shr.u32 %r7462, %r8456, 30;
add.s32 %r7463, %r7461, %r7462;
neg.s32 %r7464, %r7463;
setp.eq.s32 %p1523, %r7457, 0;
selp.b32 %r8458, %r7463, %r7464, %p1523;
setp.ne.s32 %p1524, %r7461, 0;
xor.b32 %r7465, %r7457, -2147483648;
selp.b32 %r7466, %r7465, %r7457, %p1524;
selp.b32 %r7467, -1, 0, %p1524;
xor.b32 %r7468, %r7460, %r7467;
shl.b32 %r7469, %r8457, 2;
xor.b32 %r7470, %r7469, %r7467;
cvt.u64.u32 %rd2436, %r7468;
cvt.u64.u32 %rd2437, %r7470;
bfi.b64 %rd2438, %rd2436, %rd2437, 32, 32;
cvt.rn.f64.s64 %fd245, %rd2438;
mul.f64 %fd246, %fd245, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4948, %fd246;
setp.eq.s32 %p1525, %r7466, 0;
neg.f32 %f4949, %f4948;
selp.f32 %f5972, %f4948, %f4949, %p1525;
$L__BB0_1800:
and.b32 %r2399, %r8458, 1;
setp.eq.s32 %p1526, %r2399, 0;
selp.f32 %f2103, %f5972, 0f3F800000, %p1526;
mul.rn.f32 %f2104, %f5972, %f5972;
mov.f32 %f5973, 0fB94D4153;
@%p1526 bra $L__BB0_1802;
mov.f32 %f4952, 0fBAB607ED;
mov.f32 %f4953, 0f37CBAC00;
fma.rn.f32 %f5973, %f4953, %f2104, %f4952;
$L__BB0_1802:
selp.f32 %f4954, 0f3C0885E4, 0f3D2AAABB, %p1526;
fma.rn.f32 %f4955, %f5973, %f2104, %f4954;
selp.f32 %f4956, 0fBE2AAAA8, 0fBEFFFFFF, %p1526;
fma.rn.f32 %f4957, %f4955, %f2104, %f4956;
mov.f32 %f4958, 0f00000000;
fma.rn.f32 %f4959, %f2104, %f2103, %f4958;
fma.rn.f32 %f5974, %f4957, %f4959, %f2103;
and.b32 %r7472, %r8458, 2;
setp.eq.s32 %p1528, %r7472, 0;
@%p1528 bra $L__BB0_1804;
mov.f32 %f4961, 0fBF800000;
fma.rn.f32 %f5974, %f5974, %f4961, %f4958;
$L__BB0_1804:
mul.f32 %f4962, %f1962, 0f3F22F983;
cvt.rni.s32.f32 %r8462, %f4962;
cvt.rn.f32.s32 %f4963, %r8462;
mov.f32 %f4964, 0fBFC90FDA;
fma.rn.f32 %f4965, %f4963, %f4964, %f1962;
mov.f32 %f4966, 0fB3A22168;
fma.rn.f32 %f4967, %f4963, %f4966, %f4965;
mov.f32 %f4968, 0fA7C234C5;
fma.rn.f32 %f5975, %f4963, %f4968, %f4967;
abs.f32 %f2111, %f1962;
setp.ltu.f32 %p1529, %f2111, 0f47CE4780;
@%p1529 bra $L__BB0_1812;
setp.eq.f32 %p1530, %f2111, 0f7F800000;
@%p1530 bra $L__BB0_1811;
bra.uni $L__BB0_1806;
$L__BB0_1811:
mov.f32 %f4971, 0f00000000;
mul.rn.f32 %f5975, %f1962, %f4971;
mov.u32 %r8462, 0;
bra.uni $L__BB0_1812;
$L__BB0_1806:
mov.b32 %r2401, %f1962;
shr.u32 %r7474, %r2401, 23;
and.b32 %r7475, %r7474, 255;
add.s32 %r2402, %r7475, -128;
shl.b32 %r7476, %r2401, 8;
or.b32 %r2403, %r7476, -2147483648;
shr.u32 %r2404, %r2402, 5;
mov.u64 %rd2780, 0;
mov.u32 %r8459, 0;
mov.u64 %rd2778, __cudart_i2opi_f;
mov.u64 %rd2779, %rd1;
$L__BB0_1807:
.pragma "nounroll";
ld.global.nc.u32 %r7477, [%rd2778];
mad.wide.u32 %rd2441, %r7477, %r2403, %rd2780;
shr.u64 %rd2780, %rd2441, 32;
st.local.u32 [%rd2779], %rd2441;
add.s64 %rd2779, %rd2779, 4;
add.s64 %rd2778, %rd2778, 4;
add.s32 %r8459, %r8459, 1;
setp.ne.s32 %p1531, %r8459, 6;
@%p1531 bra $L__BB0_1807;
st.local.u32 [%rd4], %rd2780;
mov.u32 %r7478, 4;
sub.s32 %r2407, %r7478, %r2404;
mov.u32 %r7479, 6;
sub.s32 %r7480, %r7479, %r2404;
mul.wide.s32 %rd2442, %r7480, 4;
add.s64 %rd2443, %rd1, %rd2442;
ld.local.u32 %r8460, [%rd2443];
ld.local.u32 %r8461, [%rd2443+-4];
and.b32 %r2410, %r2402, 31;
setp.eq.s32 %p1532, %r2410, 0;
@%p1532 bra $L__BB0_1810;
mov.u32 %r7481, 32;
sub.s32 %r7482, %r7481, %r2410;
shr.u32 %r7483, %r8461, %r7482;
shl.b32 %r7484, %r8460, %r2410;
add.s32 %r8460, %r7483, %r7484;
mul.wide.s32 %rd2444, %r2407, 4;
add.s64 %rd2445, %rd1, %rd2444;
ld.local.u32 %r7485, [%rd2445];
shr.u32 %r7486, %r7485, %r7482;
shl.b32 %r7487, %r8461, %r2410;
add.s32 %r8461, %r7486, %r7487;
$L__BB0_1810:
and.b32 %r7488, %r2401, -2147483648;
shr.u32 %r7489, %r8461, 30;
shl.b32 %r7490, %r8460, 2;
or.b32 %r7491, %r7489, %r7490;
shr.u32 %r7492, %r7491, 31;
shr.u32 %r7493, %r8460, 30;
add.s32 %r7494, %r7492, %r7493;
neg.s32 %r7495, %r7494;
setp.eq.s32 %p1533, %r7488, 0;
selp.b32 %r8462, %r7494, %r7495, %p1533;
setp.ne.s32 %p1534, %r7492, 0;
xor.b32 %r7496, %r7488, -2147483648;
selp.b32 %r7497, %r7496, %r7488, %p1534;
selp.b32 %r7498, -1, 0, %p1534;
xor.b32 %r7499, %r7491, %r7498;
shl.b32 %r7500, %r8461, 2;
xor.b32 %r7501, %r7500, %r7498;
cvt.u64.u32 %rd2446, %r7499;
cvt.u64.u32 %rd2447, %r7501;
bfi.b64 %rd2448, %rd2446, %rd2447, 32, 32;
cvt.rn.f64.s64 %fd247, %rd2448;
mul.f64 %fd248, %fd247, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4969, %fd248;
setp.eq.s32 %p1535, %r7497, 0;
neg.f32 %f4970, %f4969;
selp.f32 %f5975, %f4969, %f4970, %p1535;
$L__BB0_1812:
add.s32 %r2417, %r8462, 1;
and.b32 %r2418, %r2417, 1;
setp.eq.s32 %p1536, %r2418, 0;
selp.f32 %f2115, %f5975, 0f3F800000, %p1536;
mul.rn.f32 %f2116, %f5975, %f5975;
mov.f32 %f5976, 0fB94D4153;
@%p1536 bra $L__BB0_1814;
mov.f32 %f4973, 0fBAB607ED;
mov.f32 %f4974, 0f37CBAC00;
fma.rn.f32 %f5976, %f4974, %f2116, %f4973;
$L__BB0_1814:
selp.f32 %f4975, 0f3C0885E4, 0f3D2AAABB, %p1536;
fma.rn.f32 %f4976, %f5976, %f2116, %f4975;
selp.f32 %f4977, 0fBE2AAAA8, 0fBEFFFFFF, %p1536;
fma.rn.f32 %f4978, %f4976, %f2116, %f4977;
mov.f32 %f4979, 0f00000000;
fma.rn.f32 %f4980, %f2116, %f2115, %f4979;
fma.rn.f32 %f5977, %f4978, %f4980, %f2115;
and.b32 %r7503, %r2417, 2;
setp.eq.s32 %p1538, %r7503, 0;
@%p1538 bra $L__BB0_1816;
mov.f32 %f4982, 0fBF800000;
fma.rn.f32 %f5977, %f5977, %f4982, %f4979;
$L__BB0_1816:
add.f32 %f5992, %f5974, %f5977;
mul.f32 %f4983, %f1971, 0f3F22F983;
cvt.rni.s32.f32 %r8466, %f4983;
cvt.rn.f32.s32 %f4984, %r8466;
mov.f32 %f4985, 0fBFC90FDA;
fma.rn.f32 %f4986, %f4984, %f4985, %f1971;
mov.f32 %f4987, 0fB3A22168;
fma.rn.f32 %f4988, %f4984, %f4987, %f4986;
mov.f32 %f4989, 0fA7C234C5;
fma.rn.f32 %f5978, %f4984, %f4989, %f4988;
abs.f32 %f2124, %f1971;
setp.ltu.f32 %p1539, %f2124, 0f47CE4780;
@%p1539 bra $L__BB0_1824;
setp.eq.f32 %p1540, %f2124, 0f7F800000;
@%p1540 bra $L__BB0_1823;
bra.uni $L__BB0_1818;
$L__BB0_1823:
mov.f32 %f4992, 0f00000000;
mul.rn.f32 %f5978, %f1971, %f4992;
mov.u32 %r8466, 0;
bra.uni $L__BB0_1824;
$L__BB0_1818:
mov.b32 %r2420, %f1971;
shr.u32 %r7505, %r2420, 23;
and.b32 %r7506, %r7505, 255;
add.s32 %r2421, %r7506, -128;
shl.b32 %r7507, %r2420, 8;
or.b32 %r2422, %r7507, -2147483648;
shr.u32 %r2423, %r2421, 5;
mov.u64 %rd2783, 0;
mov.u32 %r8463, 0;
mov.u64 %rd2781, __cudart_i2opi_f;
mov.u64 %rd2782, %rd1;
$L__BB0_1819:
.pragma "nounroll";
ld.global.nc.u32 %r7508, [%rd2781];
mad.wide.u32 %rd2451, %r7508, %r2422, %rd2783;
shr.u64 %rd2783, %rd2451, 32;
st.local.u32 [%rd2782], %rd2451;
add.s64 %rd2782, %rd2782, 4;
add.s64 %rd2781, %rd2781, 4;
add.s32 %r8463, %r8463, 1;
setp.ne.s32 %p1541, %r8463, 6;
@%p1541 bra $L__BB0_1819;
st.local.u32 [%rd4], %rd2783;
mov.u32 %r7509, 4;
sub.s32 %r2426, %r7509, %r2423;
mov.u32 %r7510, 6;
sub.s32 %r7511, %r7510, %r2423;
mul.wide.s32 %rd2452, %r7511, 4;
add.s64 %rd2453, %rd1, %rd2452;
ld.local.u32 %r8464, [%rd2453];
ld.local.u32 %r8465, [%rd2453+-4];
and.b32 %r2429, %r2421, 31;
setp.eq.s32 %p1542, %r2429, 0;
@%p1542 bra $L__BB0_1822;
mov.u32 %r7512, 32;
sub.s32 %r7513, %r7512, %r2429;
shr.u32 %r7514, %r8465, %r7513;
shl.b32 %r7515, %r8464, %r2429;
add.s32 %r8464, %r7514, %r7515;
mul.wide.s32 %rd2454, %r2426, 4;
add.s64 %rd2455, %rd1, %rd2454;
ld.local.u32 %r7516, [%rd2455];
shr.u32 %r7517, %r7516, %r7513;
shl.b32 %r7518, %r8465, %r2429;
add.s32 %r8465, %r7517, %r7518;
$L__BB0_1822:
and.b32 %r7519, %r2420, -2147483648;
shr.u32 %r7520, %r8465, 30;
shl.b32 %r7521, %r8464, 2;
or.b32 %r7522, %r7520, %r7521;
shr.u32 %r7523, %r7522, 31;
shr.u32 %r7524, %r8464, 30;
add.s32 %r7525, %r7523, %r7524;
neg.s32 %r7526, %r7525;
setp.eq.s32 %p1543, %r7519, 0;
selp.b32 %r8466, %r7525, %r7526, %p1543;
setp.ne.s32 %p1544, %r7523, 0;
xor.b32 %r7527, %r7519, -2147483648;
selp.b32 %r7528, %r7527, %r7519, %p1544;
selp.b32 %r7529, -1, 0, %p1544;
xor.b32 %r7530, %r7522, %r7529;
shl.b32 %r7531, %r8465, 2;
xor.b32 %r7532, %r7531, %r7529;
cvt.u64.u32 %rd2456, %r7530;
cvt.u64.u32 %rd2457, %r7532;
bfi.b64 %rd2458, %rd2456, %rd2457, 32, 32;
cvt.rn.f64.s64 %fd249, %rd2458;
mul.f64 %fd250, %fd249, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4990, %fd250;
setp.eq.s32 %p1545, %r7528, 0;
neg.f32 %f4991, %f4990;
selp.f32 %f5978, %f4990, %f4991, %p1545;
$L__BB0_1824:
and.b32 %r2436, %r8466, 1;
setp.eq.s32 %p1546, %r2436, 0;
selp.f32 %f2128, %f5978, 0f3F800000, %p1546;
mul.rn.f32 %f2129, %f5978, %f5978;
mov.f32 %f5979, 0fB94D4153;
@%p1546 bra $L__BB0_1826;
mov.f32 %f4994, 0fBAB607ED;
mov.f32 %f4995, 0f37CBAC00;
fma.rn.f32 %f5979, %f4995, %f2129, %f4994;
$L__BB0_1826:
selp.f32 %f4996, 0f3C0885E4, 0f3D2AAABB, %p1546;
fma.rn.f32 %f4997, %f5979, %f2129, %f4996;
selp.f32 %f4998, 0fBE2AAAA8, 0fBEFFFFFF, %p1546;
fma.rn.f32 %f4999, %f4997, %f2129, %f4998;
mov.f32 %f5000, 0f00000000;
fma.rn.f32 %f5001, %f2129, %f2128, %f5000;
fma.rn.f32 %f5980, %f4999, %f5001, %f2128;
and.b32 %r7534, %r8466, 2;
setp.eq.s32 %p1548, %r7534, 0;
@%p1548 bra $L__BB0_1828;
mov.f32 %f5003, 0fBF800000;
fma.rn.f32 %f5980, %f5980, %f5003, %f5000;
$L__BB0_1828:
mul.f32 %f5004, %f1963, 0f3F22F983;
cvt.rni.s32.f32 %r8470, %f5004;
cvt.rn.f32.s32 %f5005, %r8470;
mov.f32 %f5006, 0fBFC90FDA;
fma.rn.f32 %f5007, %f5005, %f5006, %f1963;
mov.f32 %f5008, 0fB3A22168;
fma.rn.f32 %f5009, %f5005, %f5008, %f5007;
mov.f32 %f5010, 0fA7C234C5;
fma.rn.f32 %f5981, %f5005, %f5010, %f5009;
abs.f32 %f2136, %f1963;
setp.ltu.f32 %p1549, %f2136, 0f47CE4780;
@%p1549 bra $L__BB0_1836;
setp.eq.f32 %p1550, %f2136, 0f7F800000;
@%p1550 bra $L__BB0_1835;
bra.uni $L__BB0_1830;
$L__BB0_1835:
mov.f32 %f5013, 0f00000000;
mul.rn.f32 %f5981, %f1963, %f5013;
mov.u32 %r8470, 0;
bra.uni $L__BB0_1836;
$L__BB0_1830:
mov.b32 %r2438, %f1963;
shr.u32 %r7536, %r2438, 23;
and.b32 %r7537, %r7536, 255;
add.s32 %r2439, %r7537, -128;
shl.b32 %r7538, %r2438, 8;
or.b32 %r2440, %r7538, -2147483648;
shr.u32 %r2441, %r2439, 5;
mov.u64 %rd2786, 0;
mov.u32 %r8467, 0;
mov.u64 %rd2784, __cudart_i2opi_f;
mov.u64 %rd2785, %rd1;
$L__BB0_1831:
.pragma "nounroll";
ld.global.nc.u32 %r7539, [%rd2784];
mad.wide.u32 %rd2461, %r7539, %r2440, %rd2786;
shr.u64 %rd2786, %rd2461, 32;
st.local.u32 [%rd2785], %rd2461;
add.s64 %rd2785, %rd2785, 4;
add.s64 %rd2784, %rd2784, 4;
add.s32 %r8467, %r8467, 1;
setp.ne.s32 %p1551, %r8467, 6;
@%p1551 bra $L__BB0_1831;
st.local.u32 [%rd4], %rd2786;
mov.u32 %r7540, 4;
sub.s32 %r2444, %r7540, %r2441;
mov.u32 %r7541, 6;
sub.s32 %r7542, %r7541, %r2441;
mul.wide.s32 %rd2462, %r7542, 4;
add.s64 %rd2463, %rd1, %rd2462;
ld.local.u32 %r8468, [%rd2463];
ld.local.u32 %r8469, [%rd2463+-4];
and.b32 %r2447, %r2439, 31;
setp.eq.s32 %p1552, %r2447, 0;
@%p1552 bra $L__BB0_1834;
mov.u32 %r7543, 32;
sub.s32 %r7544, %r7543, %r2447;
shr.u32 %r7545, %r8469, %r7544;
shl.b32 %r7546, %r8468, %r2447;
add.s32 %r8468, %r7545, %r7546;
mul.wide.s32 %rd2464, %r2444, 4;
add.s64 %rd2465, %rd1, %rd2464;
ld.local.u32 %r7547, [%rd2465];
shr.u32 %r7548, %r7547, %r7544;
shl.b32 %r7549, %r8469, %r2447;
add.s32 %r8469, %r7548, %r7549;
$L__BB0_1834:
and.b32 %r7550, %r2438, -2147483648;
shr.u32 %r7551, %r8469, 30;
shl.b32 %r7552, %r8468, 2;
or.b32 %r7553, %r7551, %r7552;
shr.u32 %r7554, %r7553, 31;
shr.u32 %r7555, %r8468, 30;
add.s32 %r7556, %r7554, %r7555;
neg.s32 %r7557, %r7556;
setp.eq.s32 %p1553, %r7550, 0;
selp.b32 %r8470, %r7556, %r7557, %p1553;
setp.ne.s32 %p1554, %r7554, 0;
xor.b32 %r7558, %r7550, -2147483648;
selp.b32 %r7559, %r7558, %r7550, %p1554;
selp.b32 %r7560, -1, 0, %p1554;
xor.b32 %r7561, %r7553, %r7560;
shl.b32 %r7562, %r8469, 2;
xor.b32 %r7563, %r7562, %r7560;
cvt.u64.u32 %rd2466, %r7561;
cvt.u64.u32 %rd2467, %r7563;
bfi.b64 %rd2468, %rd2466, %rd2467, 32, 32;
cvt.rn.f64.s64 %fd251, %rd2468;
mul.f64 %fd252, %fd251, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f5011, %fd252;
setp.eq.s32 %p1555, %r7559, 0;
neg.f32 %f5012, %f5011;
selp.f32 %f5981, %f5011, %f5012, %p1555;
$L__BB0_1836:
add.s32 %r2454, %r8470, 1;
and.b32 %r2455, %r2454, 1;
setp.eq.s32 %p1556, %r2455, 0;
selp.f32 %f2140, %f5981, 0f3F800000, %p1556;
mul.rn.f32 %f2141, %f5981, %f5981;
mov.f32 %f5982, 0fB94D4153;
@%p1556 bra $L__BB0_1838;
mov.f32 %f5015, 0fBAB607ED;
mov.f32 %f5016, 0f37CBAC00;
fma.rn.f32 %f5982, %f5016, %f2141, %f5015;
$L__BB0_1838:
selp.f32 %f5017, 0f3C0885E4, 0f3D2AAABB, %p1556;
fma.rn.f32 %f5018, %f5982, %f2141, %f5017;
selp.f32 %f5019, 0fBE2AAAA8, 0fBEFFFFFF, %p1556;
fma.rn.f32 %f5020, %f5018, %f2141, %f5019;
mov.f32 %f5021, 0f00000000;
fma.rn.f32 %f5022, %f2141, %f2140, %f5021;
fma.rn.f32 %f5983, %f5020, %f5022, %f2140;
and.b32 %r7565, %r2454, 2;
setp.eq.s32 %p1558, %r7565, 0;
@%p1558 bra $L__BB0_1840;
mov.f32 %f5024, 0fBF800000;
fma.rn.f32 %f5983, %f5983, %f5024, %f5021;
$L__BB0_1840:
add.f32 %f5991, %f5980, %f5983;
mul.f32 %f5025, %f1972, 0f3F22F983;
cvt.rni.s32.f32 %r8474, %f5025;
cvt.rn.f32.s32 %f5026, %r8474;
mov.f32 %f5027, 0fBFC90FDA;
fma.rn.f32 %f5028, %f5026, %f5027, %f1972;
mov.f32 %f5029, 0fB3A22168;
fma.rn.f32 %f5030, %f5026, %f5029, %f5028;
mov.f32 %f5031, 0fA7C234C5;
fma.rn.f32 %f5984, %f5026, %f5031, %f5030;
abs.f32 %f2149, %f1972;
setp.ltu.f32 %p1559, %f2149, 0f47CE4780;
@%p1559 bra $L__BB0_1848;
setp.eq.f32 %p1560, %f2149, 0f7F800000;
@%p1560 bra $L__BB0_1847;
bra.uni $L__BB0_1842;
$L__BB0_1847:
mov.f32 %f5034, 0f00000000;
mul.rn.f32 %f5984, %f1972, %f5034;
mov.u32 %r8474, 0;
bra.uni $L__BB0_1848;
$L__BB0_1842:
mov.b32 %r2457, %f1972;
shr.u32 %r7567, %r2457, 23;
and.b32 %r7568, %r7567, 255;
add.s32 %r2458, %r7568, -128;
shl.b32 %r7569, %r2457, 8;
or.b32 %r2459, %r7569, -2147483648;
shr.u32 %r2460, %r2458, 5;
mov.u64 %rd2789, 0;
mov.u32 %r8471, 0;
mov.u64 %rd2787, __cudart_i2opi_f;
mov.u64 %rd2788, %rd1;
$L__BB0_1843:
.pragma "nounroll";
ld.global.nc.u32 %r7570, [%rd2787];
mad.wide.u32 %rd2471, %r7570, %r2459, %rd2789;
shr.u64 %rd2789, %rd2471, 32;
st.local.u32 [%rd2788], %rd2471;
add.s64 %rd2788, %rd2788, 4;
add.s64 %rd2787, %rd2787, 4;
add.s32 %r8471, %r8471, 1;
setp.ne.s32 %p1561, %r8471, 6;
@%p1561 bra $L__BB0_1843;
st.local.u32 [%rd4], %rd2789;
mov.u32 %r7571, 4;
sub.s32 %r2463, %r7571, %r2460;
mov.u32 %r7572, 6;
sub.s32 %r7573, %r7572, %r2460;
mul.wide.s32 %rd2472, %r7573, 4;
add.s64 %rd2473, %rd1, %rd2472;
ld.local.u32 %r8472, [%rd2473];
ld.local.u32 %r8473, [%rd2473+-4];
and.b32 %r2466, %r2458, 31;
setp.eq.s32 %p1562, %r2466, 0;
@%p1562 bra $L__BB0_1846;
mov.u32 %r7574, 32;
sub.s32 %r7575, %r7574, %r2466;
shr.u32 %r7576, %r8473, %r7575;
shl.b32 %r7577, %r8472, %r2466;
add.s32 %r8472, %r7576, %r7577;
mul.wide.s32 %rd2474, %r2463, 4;
add.s64 %rd2475, %rd1, %rd2474;
ld.local.u32 %r7578, [%rd2475];
shr.u32 %r7579, %r7578, %r7575;
shl.b32 %r7580, %r8473, %r2466;
add.s32 %r8473, %r7579, %r7580;
$L__BB0_1846:
and.b32 %r7581, %r2457, -2147483648;
shr.u32 %r7582, %r8473, 30;
shl.b32 %r7583, %r8472, 2;
or.b32 %r7584, %r7582, %r7583;
shr.u32 %r7585, %r7584, 31;
shr.u32 %r7586, %r8472, 30;
add.s32 %r7587, %r7585, %r7586;
neg.s32 %r7588, %r7587;
setp.eq.s32 %p1563, %r7581, 0;
selp.b32 %r8474, %r7587, %r7588, %p1563;
setp.ne.s32 %p1564, %r7585, 0;
xor.b32 %r7589, %r7581, -2147483648;
selp.b32 %r7590, %r7589, %r7581, %p1564;
selp.b32 %r7591, -1, 0, %p1564;
xor.b32 %r7592, %r7584, %r7591;
shl.b32 %r7593, %r8473, 2;
xor.b32 %r7594, %r7593, %r7591;
cvt.u64.u32 %rd2476, %r7592;
cvt.u64.u32 %rd2477, %r7594;
bfi.b64 %rd2478, %rd2476, %rd2477, 32, 32;
cvt.rn.f64.s64 %fd253, %rd2478;
mul.f64 %fd254, %fd253, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f5032, %fd254;
setp.eq.s32 %p1565, %r7590, 0;
neg.f32 %f5033, %f5032;
selp.f32 %f5984, %f5032, %f5033, %p1565;
$L__BB0_1848:
and.b32 %r2473, %r8474, 1;
setp.eq.s32 %p1566, %r2473, 0;
selp.f32 %f2153, %f5984, 0f3F800000, %p1566;
mul.rn.f32 %f2154, %f5984, %f5984;
mov.f32 %f5985, 0fB94D4153;
@%p1566 bra $L__BB0_1850;
mov.f32 %f5036, 0fBAB607ED;
mov.f32 %f5037, 0f37CBAC00;
fma.rn.f32 %f5985, %f5037, %f2154, %f5036;
$L__BB0_1850:
selp.f32 %f5038, 0f3C0885E4, 0f3D2AAABB, %p1566;
fma.rn.f32 %f5039, %f5985, %f2154, %f5038;
selp.f32 %f5040, 0fBE2AAAA8, 0fBEFFFFFF, %p1566;
fma.rn.f32 %f5041, %f5039, %f2154, %f5040;
mov.f32 %f5042, 0f00000000;
fma.rn.f32 %f5043, %f2154, %f2153, %f5042;
fma.rn.f32 %f5986, %f5041, %f5043, %f2153;
and.b32 %r7596, %r8474, 2;
setp.eq.s32 %p1568, %r7596, 0;
@%p1568 bra $L__BB0_1852;
mov.f32 %f5045, 0fBF800000;
fma.rn.f32 %f5986, %f5986, %f5045, %f5042;
$L__BB0_1852:
mul.f32 %f5046, %f1964, 0f3F22F983;
cvt.rni.s32.f32 %r8478, %f5046;
cvt.rn.f32.s32 %f5047, %r8478;
mov.f32 %f5048, 0fBFC90FDA;
fma.rn.f32 %f5049, %f5047, %f5048, %f1964;
mov.f32 %f5050, 0fB3A22168;
fma.rn.f32 %f5051, %f5047, %f5050, %f5049;
mov.f32 %f5052, 0fA7C234C5;
fma.rn.f32 %f5987, %f5047, %f5052, %f5051;
abs.f32 %f2161, %f1964;
setp.ltu.f32 %p1569, %f2161, 0f47CE4780;
@%p1569 bra $L__BB0_1860;
setp.eq.f32 %p1570, %f2161, 0f7F800000;
@%p1570 bra $L__BB0_1859;
bra.uni $L__BB0_1854;
$L__BB0_1859:
mov.f32 %f5055, 0f00000000;
mul.rn.f32 %f5987, %f1964, %f5055;
mov.u32 %r8478, 0;
bra.uni $L__BB0_1860;
$L__BB0_1854:
mov.b32 %r2475, %f1964;
shr.u32 %r7598, %r2475, 23;
and.b32 %r7599, %r7598, 255;
add.s32 %r2476, %r7599, -128;
shl.b32 %r7600, %r2475, 8;
or.b32 %r2477, %r7600, -2147483648;
shr.u32 %r2478, %r2476, 5;
mov.u64 %rd2792, 0;
mov.u32 %r8475, 0;
mov.u64 %rd2790, __cudart_i2opi_f;
mov.u64 %rd2791, %rd1;
$L__BB0_1855:
.pragma "nounroll";
ld.global.nc.u32 %r7601, [%rd2790];
mad.wide.u32 %rd2481, %r7601, %r2477, %rd2792;
shr.u64 %rd2792, %rd2481, 32;
st.local.u32 [%rd2791], %rd2481;
add.s64 %rd2791, %rd2791, 4;
add.s64 %rd2790, %rd2790, 4;
add.s32 %r8475, %r8475, 1;
setp.ne.s32 %p1571, %r8475, 6;
@%p1571 bra $L__BB0_1855;
st.local.u32 [%rd4], %rd2792;
mov.u32 %r7602, 4;
sub.s32 %r2481, %r7602, %r2478;
mov.u32 %r7603, 6;
sub.s32 %r7604, %r7603, %r2478;
mul.wide.s32 %rd2482, %r7604, 4;
add.s64 %rd2483, %rd1, %rd2482;
ld.local.u32 %r8476, [%rd2483];
ld.local.u32 %r8477, [%rd2483+-4];
and.b32 %r2484, %r2476, 31;
setp.eq.s32 %p1572, %r2484, 0;
@%p1572 bra $L__BB0_1858;
mov.u32 %r7605, 32;
sub.s32 %r7606, %r7605, %r2484;
shr.u32 %r7607, %r8477, %r7606;
shl.b32 %r7608, %r8476, %r2484;
add.s32 %r8476, %r7607, %r7608;
mul.wide.s32 %rd2484, %r2481, 4;
add.s64 %rd2485, %rd1, %rd2484;
ld.local.u32 %r7609, [%rd2485];
shr.u32 %r7610, %r7609, %r7606;
shl.b32 %r7611, %r8477, %r2484;
add.s32 %r8477, %r7610, %r7611;
$L__BB0_1858:
and.b32 %r7612, %r2475, -2147483648;
shr.u32 %r7613, %r8477, 30;
shl.b32 %r7614, %r8476, 2;
or.b32 %r7615, %r7613, %r7614;
shr.u32 %r7616, %r7615, 31;
shr.u32 %r7617, %r8476, 30;
add.s32 %r7618, %r7616, %r7617;
neg.s32 %r7619, %r7618;
setp.eq.s32 %p1573, %r7612, 0;
selp.b32 %r8478, %r7618, %r7619, %p1573;
setp.ne.s32 %p1574, %r7616, 0;
xor.b32 %r7620, %r7612, -2147483648;
selp.b32 %r7621, %r7620, %r7612, %p1574;
selp.b32 %r7622, -1, 0, %p1574;
xor.b32 %r7623, %r7615, %r7622;
shl.b32 %r7624, %r8477, 2;
xor.b32 %r7625, %r7624, %r7622;
cvt.u64.u32 %rd2486, %r7623;
cvt.u64.u32 %rd2487, %r7625;
bfi.b64 %rd2488, %rd2486, %rd2487, 32, 32;
cvt.rn.f64.s64 %fd255, %rd2488;
mul.f64 %fd256, %fd255, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f5053, %fd256;
setp.eq.s32 %p1575, %r7621, 0;
neg.f32 %f5054, %f5053;
selp.f32 %f5987, %f5053, %f5054, %p1575;
$L__BB0_1860:
add.s32 %r2491, %r8478, 1;
and.b32 %r2492, %r2491, 1;
setp.eq.s32 %p1576, %r2492, 0;
selp.f32 %f2165, %f5987, 0f3F800000, %p1576;
mul.rn.f32 %f2166, %f5987, %f5987;
mov.f32 %f5988, 0fB94D4153;
@%p1576 bra $L__BB0_1862;
mov.f32 %f5057, 0fBAB607ED;
mov.f32 %f5058, 0f37CBAC00;
fma.rn.f32 %f5988, %f5058, %f2166, %f5057;
$L__BB0_1862:
selp.f32 %f5059, 0f3C0885E4, 0f3D2AAABB, %p1576;
fma.rn.f32 %f5060, %f5988, %f2166, %f5059;
selp.f32 %f5061, 0fBE2AAAA8, 0fBEFFFFFF, %p1576;
fma.rn.f32 %f5062, %f5060, %f2166, %f5061;
mov.f32 %f5063, 0f00000000;
fma.rn.f32 %f5064, %f2166, %f2165, %f5063;
fma.rn.f32 %f5989, %f5062, %f5064, %f2165;
and.b32 %r7627, %r2491, 2;
setp.eq.s32 %p1578, %r7627, 0;
@%p1578 bra $L__BB0_1864;
mov.f32 %f5066, 0fBF800000;
fma.rn.f32 %f5989, %f5989, %f5066, %f5063;
$L__BB0_1864:
add.f32 %f5990, %f5986, %f5989;
bra.uni $L__BB0_1865;
$L__BB0_1444:
mov.b32 %r1895, %f5416;
shr.u32 %r6564, %r1895, 23;
and.b32 %r6565, %r6564, 255;
add.s32 %r1896, %r6565, -128;
shl.b32 %r6566, %r1895, 8;
or.b32 %r1897, %r6566, -2147483648;
shr.u32 %r1898, %r1896, 5;
mov.u64 %rd2699, 0;
mov.u32 %r8351, 0;
mov.u64 %rd2697, __cudart_i2opi_f;
mov.u64 %rd2698, %rd1;
$L__BB0_1445:
.pragma "nounroll";
ld.global.nc.u32 %r6567, [%rd2697];
mad.wide.u32 %rd2145, %r6567, %r1897, %rd2699;
shr.u64 %rd2699, %rd2145, 32;
st.local.u32 [%rd2698], %rd2145;
add.s64 %rd2698, %rd2698, 4;
add.s64 %rd2697, %rd2697, 4;
add.s32 %r8351, %r8351, 1;
setp.ne.s32 %p1222, %r8351, 6;
@%p1222 bra $L__BB0_1445;
st.local.u32 [%rd4], %rd2699;
mov.u32 %r6568, 4;
sub.s32 %r1901, %r6568, %r1898;
mov.u32 %r6569, 6;
sub.s32 %r6570, %r6569, %r1898;
mul.wide.s32 %rd2146, %r6570, 4;
add.s64 %rd2147, %rd1, %rd2146;
ld.local.u32 %r8352, [%rd2147];
ld.local.u32 %r8353, [%rd2147+-4];
and.b32 %r1904, %r1896, 31;
setp.eq.s32 %p1223, %r1904, 0;
@%p1223 bra $L__BB0_1448;
mov.u32 %r6571, 32;
sub.s32 %r6572, %r6571, %r1904;
shr.u32 %r6573, %r8353, %r6572;
shl.b32 %r6574, %r8352, %r1904;
add.s32 %r8352, %r6573, %r6574;
mul.wide.s32 %rd2148, %r1901, 4;
add.s64 %rd2149, %rd1, %rd2148;
ld.local.u32 %r6575, [%rd2149];
shr.u32 %r6576, %r6575, %r6572;
shl.b32 %r6577, %r8353, %r1904;
add.s32 %r8353, %r6576, %r6577;
$L__BB0_1448:
and.b32 %r6578, %r1895, -2147483648;
shr.u32 %r6579, %r8353, 30;
shl.b32 %r6580, %r8352, 2;
or.b32 %r6581, %r6579, %r6580;
shr.u32 %r6582, %r6581, 31;
shr.u32 %r6583, %r8352, 30;
add.s32 %r6584, %r6582, %r6583;
neg.s32 %r6585, %r6584;
setp.eq.s32 %p1224, %r6578, 0;
selp.b32 %r8354, %r6584, %r6585, %p1224;
setp.ne.s32 %p1225, %r6582, 0;
xor.b32 %r6586, %r6578, -2147483648;
selp.b32 %r6587, %r6586, %r6578, %p1225;
selp.b32 %r6588, -1, 0, %p1225;
xor.b32 %r6589, %r6581, %r6588;
shl.b32 %r6590, %r8353, 2;
xor.b32 %r6591, %r6590, %r6588;
cvt.u64.u32 %rd2150, %r6589;
cvt.u64.u32 %rd2151, %r6591;
bfi.b64 %rd2152, %rd2150, %rd2151, 32, 32;
cvt.rn.f64.s64 %fd193, %rd2152;
mul.f64 %fd194, %fd193, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4387, %fd194;
setp.eq.s32 %p1226, %r6587, 0;
neg.f32 %f4388, %f4387;
selp.f32 %f5843, %f4387, %f4388, %p1226;
$L__BB0_1450:
and.b32 %r1911, %r8354, 1;
setp.eq.s32 %p1227, %r1911, 0;
selp.f32 %f1695, %f5843, 0f3F800000, %p1227;
mul.rn.f32 %f1696, %f5843, %f5843;
mov.f32 %f5844, 0fB94D4153;
@%p1227 bra $L__BB0_1452;
mov.f32 %f4391, 0fBAB607ED;
mov.f32 %f4392, 0f37CBAC00;
fma.rn.f32 %f5844, %f4392, %f1696, %f4391;
$L__BB0_1452:
selp.f32 %f4393, 0f3C0885E4, 0f3D2AAABB, %p1227;
fma.rn.f32 %f4394, %f5844, %f1696, %f4393;
selp.f32 %f4395, 0fBE2AAAA8, 0fBEFFFFFF, %p1227;
fma.rn.f32 %f4396, %f4394, %f1696, %f4395;
mov.f32 %f4397, 0f00000000;
fma.rn.f32 %f4398, %f1696, %f1695, %f4397;
fma.rn.f32 %f5281, %f4396, %f4398, %f1695;
and.b32 %r6593, %r8354, 2;
setp.eq.s32 %p1229, %r6593, 0;
@%p1229 bra $L__BB0_1454;
mov.f32 %f4400, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f4400, %f4397;
$L__BB0_1454:
setp.lt.s32 %p24, %r11, %r1893;
@%p1219 bra $L__BB0_1467;
mul.f32 %f4401, %f5607, 0f3F22F983;
cvt.rni.s32.f32 %r8358, %f4401;
cvt.rn.f32.s32 %f4402, %r8358;
mov.f32 %f4403, 0fBFC90FDA;
fma.rn.f32 %f4404, %f4402, %f4403, %f5607;
mov.f32 %f4405, 0fB3A22168;
fma.rn.f32 %f4406, %f4402, %f4405, %f4404;
mov.f32 %f4407, 0fA7C234C5;
fma.rn.f32 %f5847, %f4402, %f4407, %f4406;
abs.f32 %f1704, %f5607;
setp.ltu.f32 %p1231, %f1704, 0f47CE4780;
@%p1231 bra $L__BB0_1463;
setp.eq.f32 %p1232, %f1704, 0f7F800000;
@%p1232 bra $L__BB0_1462;
bra.uni $L__BB0_1457;
$L__BB0_1462:
mov.f32 %f4410, 0f00000000;
mul.rn.f32 %f5847, %f5607, %f4410;
mov.u32 %r8358, 0;
bra.uni $L__BB0_1463;
$L__BB0_1457:
mov.b32 %r1913, %f5607;
shr.u32 %r6595, %r1913, 23;
and.b32 %r6596, %r6595, 255;
add.s32 %r1914, %r6596, -128;
shl.b32 %r6597, %r1913, 8;
or.b32 %r1915, %r6597, -2147483648;
shr.u32 %r1916, %r1914, 5;
mov.u64 %rd2702, 0;
mov.u32 %r8355, 0;
mov.u64 %rd2700, __cudart_i2opi_f;
mov.u64 %rd2701, %rd1;
$L__BB0_1458:
.pragma "nounroll";
ld.global.nc.u32 %r6598, [%rd2700];
mad.wide.u32 %rd2155, %r6598, %r1915, %rd2702;
shr.u64 %rd2702, %rd2155, 32;
st.local.u32 [%rd2701], %rd2155;
add.s64 %rd2701, %rd2701, 4;
add.s64 %rd2700, %rd2700, 4;
add.s32 %r8355, %r8355, 1;
setp.ne.s32 %p1233, %r8355, 6;
@%p1233 bra $L__BB0_1458;
st.local.u32 [%rd4], %rd2702;
mov.u32 %r6599, 4;
sub.s32 %r1919, %r6599, %r1916;
mov.u32 %r6600, 6;
sub.s32 %r6601, %r6600, %r1916;
mul.wide.s32 %rd2156, %r6601, 4;
add.s64 %rd2157, %rd1, %rd2156;
ld.local.u32 %r8356, [%rd2157];
ld.local.u32 %r8357, [%rd2157+-4];
and.b32 %r1922, %r1914, 31;
setp.eq.s32 %p1234, %r1922, 0;
@%p1234 bra $L__BB0_1461;
mov.u32 %r6602, 32;
sub.s32 %r6603, %r6602, %r1922;
shr.u32 %r6604, %r8357, %r6603;
shl.b32 %r6605, %r8356, %r1922;
add.s32 %r8356, %r6604, %r6605;
mul.wide.s32 %rd2158, %r1919, 4;
add.s64 %rd2159, %rd1, %rd2158;
ld.local.u32 %r6606, [%rd2159];
shr.u32 %r6607, %r6606, %r6603;
shl.b32 %r6608, %r8357, %r1922;
add.s32 %r8357, %r6607, %r6608;
$L__BB0_1461:
and.b32 %r6609, %r1913, -2147483648;
shr.u32 %r6610, %r8357, 30;
shl.b32 %r6611, %r8356, 2;
or.b32 %r6612, %r6610, %r6611;
shr.u32 %r6613, %r6612, 31;
shr.u32 %r6614, %r8356, 30;
add.s32 %r6615, %r6613, %r6614;
neg.s32 %r6616, %r6615;
setp.eq.s32 %p1235, %r6609, 0;
selp.b32 %r8358, %r6615, %r6616, %p1235;
setp.ne.s32 %p1236, %r6613, 0;
xor.b32 %r6617, %r6609, -2147483648;
selp.b32 %r6618, %r6617, %r6609, %p1236;
selp.b32 %r6619, -1, 0, %p1236;
xor.b32 %r6620, %r6612, %r6619;
shl.b32 %r6621, %r8357, 2;
xor.b32 %r6622, %r6621, %r6619;
cvt.u64.u32 %rd2160, %r6620;
cvt.u64.u32 %rd2161, %r6622;
bfi.b64 %rd2162, %rd2160, %rd2161, 32, 32;
cvt.rn.f64.s64 %fd195, %rd2162;
mul.f64 %fd196, %fd195, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4408, %fd196;
setp.eq.s32 %p1237, %r6618, 0;
neg.f32 %f4409, %f4408;
selp.f32 %f5847, %f4408, %f4409, %p1237;
$L__BB0_1463:
add.s32 %r1929, %r8358, 1;
and.b32 %r1930, %r1929, 1;
setp.eq.s32 %p1238, %r1930, 0;
selp.f32 %f1708, %f5847, 0f3F800000, %p1238;
mul.rn.f32 %f1709, %f5847, %f5847;
mov.f32 %f5848, 0fB94D4153;
@%p1238 bra $L__BB0_1465;
mov.f32 %f4412, 0fBAB607ED;
mov.f32 %f4413, 0f37CBAC00;
fma.rn.f32 %f5848, %f4413, %f1709, %f4412;
$L__BB0_1465:
selp.f32 %f4414, 0f3C0885E4, 0f3D2AAABB, %p1238;
fma.rn.f32 %f4415, %f5848, %f1709, %f4414;
selp.f32 %f4416, 0fBE2AAAA8, 0fBEFFFFFF, %p1238;
fma.rn.f32 %f4417, %f4415, %f1709, %f4416;
mov.f32 %f4418, 0f00000000;
fma.rn.f32 %f4419, %f1709, %f1708, %f4418;
fma.rn.f32 %f5283, %f4417, %f4419, %f1708;
and.b32 %r6624, %r1929, 2;
setp.eq.s32 %p1240, %r6624, 0;
@%p1240 bra $L__BB0_1467;
mov.f32 %f4421, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f4421, %f4418;
$L__BB0_1467:
selp.f32 %f1716, %f5283, %f5284, %p24;
selp.f32 %f1717, %f5281, %f5282, %p24;
@%p1219 bra $L__BB0_1469;
add.f32 %f5997, %f1717, %f1716;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1469:
@%p1194 bra $L__BB0_1498;
shl.b32 %r6625, %r12, 5;
mov.u32 %r6626, -32;
sub.s32 %r1931, %r6626, %r6625;
setp.ge.s32 %p1244, %r11, %r1931;
@%p1244 bra $L__BB0_1483;
mul.f32 %f4424, %f5415, 0f3F22F983;
cvt.rni.s32.f32 %r8362, %f4424;
cvt.rn.f32.s32 %f4425, %r8362;
mov.f32 %f4426, 0fBFC90FDA;
fma.rn.f32 %f4427, %f4425, %f4426, %f5415;
mov.f32 %f4428, 0fB3A22168;
fma.rn.f32 %f4429, %f4425, %f4428, %f4427;
mov.f32 %f4430, 0fA7C234C5;
fma.rn.f32 %f5856, %f4425, %f4430, %f4429;
abs.f32 %f1725, %f5415;
setp.ltu.f32 %p1245, %f1725, 0f47CE4780;
@%p1245 bra $L__BB0_1479;
setp.eq.f32 %p1246, %f1725, 0f7F800000;
@%p1246 bra $L__BB0_1478;
bra.uni $L__BB0_1473;
$L__BB0_1478:
mov.f32 %f4433, 0f00000000;
mul.rn.f32 %f5856, %f5415, %f4433;
mov.u32 %r8362, 0;
bra.uni $L__BB0_1479;
$L__BB0_1473:
mov.b32 %r1933, %f5415;
shr.u32 %r6628, %r1933, 23;
and.b32 %r6629, %r6628, 255;
add.s32 %r1934, %r6629, -128;
shl.b32 %r6630, %r1933, 8;
or.b32 %r1935, %r6630, -2147483648;
shr.u32 %r1936, %r1934, 5;
mov.u64 %rd2705, 0;
mov.u32 %r8359, 0;
mov.u64 %rd2703, __cudart_i2opi_f;
mov.u64 %rd2704, %rd1;
$L__BB0_1474:
.pragma "nounroll";
ld.global.nc.u32 %r6631, [%rd2703];
mad.wide.u32 %rd2165, %r6631, %r1935, %rd2705;
shr.u64 %rd2705, %rd2165, 32;
st.local.u32 [%rd2704], %rd2165;
add.s64 %rd2704, %rd2704, 4;
add.s64 %rd2703, %rd2703, 4;
add.s32 %r8359, %r8359, 1;
setp.ne.s32 %p1247, %r8359, 6;
@%p1247 bra $L__BB0_1474;
st.local.u32 [%rd4], %rd2705;
mov.u32 %r6632, 4;
sub.s32 %r1939, %r6632, %r1936;
mov.u32 %r6633, 6;
sub.s32 %r6634, %r6633, %r1936;
mul.wide.s32 %rd2166, %r6634, 4;
add.s64 %rd2167, %rd1, %rd2166;
ld.local.u32 %r8360, [%rd2167];
ld.local.u32 %r8361, [%rd2167+-4];
and.b32 %r1942, %r1934, 31;
setp.eq.s32 %p1248, %r1942, 0;
@%p1248 bra $L__BB0_1477;
mov.u32 %r6635, 32;
sub.s32 %r6636, %r6635, %r1942;
shr.u32 %r6637, %r8361, %r6636;
shl.b32 %r6638, %r8360, %r1942;
add.s32 %r8360, %r6637, %r6638;
mul.wide.s32 %rd2168, %r1939, 4;
add.s64 %rd2169, %rd1, %rd2168;
ld.local.u32 %r6639, [%rd2169];
shr.u32 %r6640, %r6639, %r6636;
shl.b32 %r6641, %r8361, %r1942;
add.s32 %r8361, %r6640, %r6641;
$L__BB0_1477:
and.b32 %r6642, %r1933, -2147483648;
shr.u32 %r6643, %r8361, 30;
shl.b32 %r6644, %r8360, 2;
or.b32 %r6645, %r6643, %r6644;
shr.u32 %r6646, %r6645, 31;
shr.u32 %r6647, %r8360, 30;
add.s32 %r6648, %r6646, %r6647;
neg.s32 %r6649, %r6648;
setp.eq.s32 %p1249, %r6642, 0;
selp.b32 %r8362, %r6648, %r6649, %p1249;
setp.ne.s32 %p1250, %r6646, 0;
xor.b32 %r6650, %r6642, -2147483648;
selp.b32 %r6651, %r6650, %r6642, %p1250;
selp.b32 %r6652, -1, 0, %p1250;
xor.b32 %r6653, %r6645, %r6652;
shl.b32 %r6654, %r8361, 2;
xor.b32 %r6655, %r6654, %r6652;
cvt.u64.u32 %rd2170, %r6653;
cvt.u64.u32 %rd2171, %r6655;
bfi.b64 %rd2172, %rd2170, %rd2171, 32, 32;
cvt.rn.f64.s64 %fd197, %rd2172;
mul.f64 %fd198, %fd197, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4431, %fd198;
setp.eq.s32 %p1251, %r6651, 0;
neg.f32 %f4432, %f4431;
selp.f32 %f5856, %f4431, %f4432, %p1251;
$L__BB0_1479:
and.b32 %r1949, %r8362, 1;
setp.eq.s32 %p1252, %r1949, 0;
selp.f32 %f1729, %f5856, 0f3F800000, %p1252;
mul.rn.f32 %f1730, %f5856, %f5856;
mov.f32 %f5857, 0fB94D4153;
@%p1252 bra $L__BB0_1481;
mov.f32 %f4435, 0fBAB607ED;
mov.f32 %f4436, 0f37CBAC00;
fma.rn.f32 %f5857, %f4436, %f1730, %f4435;
$L__BB0_1481:
selp.f32 %f4437, 0f3C0885E4, 0f3D2AAABB, %p1252;
fma.rn.f32 %f4438, %f5857, %f1730, %f4437;
selp.f32 %f4439, 0fBE2AAAA8, 0fBEFFFFFF, %p1252;
fma.rn.f32 %f4440, %f4438, %f1730, %f4439;
mov.f32 %f4441, 0f00000000;
fma.rn.f32 %f4442, %f1730, %f1729, %f4441;
fma.rn.f32 %f5281, %f4440, %f4442, %f1729;
and.b32 %r6657, %r8362, 2;
setp.eq.s32 %p1254, %r6657, 0;
@%p1254 bra $L__BB0_1483;
mov.f32 %f4444, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f4444, %f4441;
$L__BB0_1483:
setp.lt.s32 %p25, %r11, %r1931;
@%p1244 bra $L__BB0_1496;
mul.f32 %f4445, %f5606, 0f3F22F983;
cvt.rni.s32.f32 %r8366, %f4445;
cvt.rn.f32.s32 %f4446, %r8366;
mov.f32 %f4447, 0fBFC90FDA;
fma.rn.f32 %f4448, %f4446, %f4447, %f5606;
mov.f32 %f4449, 0fB3A22168;
fma.rn.f32 %f4450, %f4446, %f4449, %f4448;
mov.f32 %f4451, 0fA7C234C5;
fma.rn.f32 %f5860, %f4446, %f4451, %f4450;
abs.f32 %f1738, %f5606;
setp.ltu.f32 %p1256, %f1738, 0f47CE4780;
@%p1256 bra $L__BB0_1492;
setp.eq.f32 %p1257, %f1738, 0f7F800000;
@%p1257 bra $L__BB0_1491;
bra.uni $L__BB0_1486;
$L__BB0_1491:
mov.f32 %f4454, 0f00000000;
mul.rn.f32 %f5860, %f5606, %f4454;
mov.u32 %r8366, 0;
bra.uni $L__BB0_1492;
$L__BB0_1486:
mov.b32 %r1951, %f5606;
shr.u32 %r6659, %r1951, 23;
and.b32 %r6660, %r6659, 255;
add.s32 %r1952, %r6660, -128;
shl.b32 %r6661, %r1951, 8;
or.b32 %r1953, %r6661, -2147483648;
shr.u32 %r1954, %r1952, 5;
mov.u64 %rd2708, 0;
mov.u32 %r8363, 0;
mov.u64 %rd2706, __cudart_i2opi_f;
mov.u64 %rd2707, %rd1;
$L__BB0_1487:
.pragma "nounroll";
ld.global.nc.u32 %r6662, [%rd2706];
mad.wide.u32 %rd2175, %r6662, %r1953, %rd2708;
shr.u64 %rd2708, %rd2175, 32;
st.local.u32 [%rd2707], %rd2175;
add.s64 %rd2707, %rd2707, 4;
add.s64 %rd2706, %rd2706, 4;
add.s32 %r8363, %r8363, 1;
setp.ne.s32 %p1258, %r8363, 6;
@%p1258 bra $L__BB0_1487;
st.local.u32 [%rd4], %rd2708;
mov.u32 %r6663, 4;
sub.s32 %r1957, %r6663, %r1954;
mov.u32 %r6664, 6;
sub.s32 %r6665, %r6664, %r1954;
mul.wide.s32 %rd2176, %r6665, 4;
add.s64 %rd2177, %rd1, %rd2176;
ld.local.u32 %r8364, [%rd2177];
ld.local.u32 %r8365, [%rd2177+-4];
and.b32 %r1960, %r1952, 31;
setp.eq.s32 %p1259, %r1960, 0;
@%p1259 bra $L__BB0_1490;
mov.u32 %r6666, 32;
sub.s32 %r6667, %r6666, %r1960;
shr.u32 %r6668, %r8365, %r6667;
shl.b32 %r6669, %r8364, %r1960;
add.s32 %r8364, %r6668, %r6669;
mul.wide.s32 %rd2178, %r1957, 4;
add.s64 %rd2179, %rd1, %rd2178;
ld.local.u32 %r6670, [%rd2179];
shr.u32 %r6671, %r6670, %r6667;
shl.b32 %r6672, %r8365, %r1960;
add.s32 %r8365, %r6671, %r6672;
$L__BB0_1490:
and.b32 %r6673, %r1951, -2147483648;
shr.u32 %r6674, %r8365, 30;
shl.b32 %r6675, %r8364, 2;
or.b32 %r6676, %r6674, %r6675;
shr.u32 %r6677, %r6676, 31;
shr.u32 %r6678, %r8364, 30;
add.s32 %r6679, %r6677, %r6678;
neg.s32 %r6680, %r6679;
setp.eq.s32 %p1260, %r6673, 0;
selp.b32 %r8366, %r6679, %r6680, %p1260;
setp.ne.s32 %p1261, %r6677, 0;
xor.b32 %r6681, %r6673, -2147483648;
selp.b32 %r6682, %r6681, %r6673, %p1261;
selp.b32 %r6683, -1, 0, %p1261;
xor.b32 %r6684, %r6676, %r6683;
shl.b32 %r6685, %r8365, 2;
xor.b32 %r6686, %r6685, %r6683;
cvt.u64.u32 %rd2180, %r6684;
cvt.u64.u32 %rd2181, %r6686;
bfi.b64 %rd2182, %rd2180, %rd2181, 32, 32;
cvt.rn.f64.s64 %fd199, %rd2182;
mul.f64 %fd200, %fd199, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4452, %fd200;
setp.eq.s32 %p1262, %r6682, 0;
neg.f32 %f4453, %f4452;
selp.f32 %f5860, %f4452, %f4453, %p1262;
$L__BB0_1492:
add.s32 %r1967, %r8366, 1;
and.b32 %r1968, %r1967, 1;
setp.eq.s32 %p1263, %r1968, 0;
selp.f32 %f1742, %f5860, 0f3F800000, %p1263;
mul.rn.f32 %f1743, %f5860, %f5860;
mov.f32 %f5861, 0fB94D4153;
@%p1263 bra $L__BB0_1494;
mov.f32 %f4456, 0fBAB607ED;
mov.f32 %f4457, 0f37CBAC00;
fma.rn.f32 %f5861, %f4457, %f1743, %f4456;
$L__BB0_1494:
selp.f32 %f4458, 0f3C0885E4, 0f3D2AAABB, %p1263;
fma.rn.f32 %f4459, %f5861, %f1743, %f4458;
selp.f32 %f4460, 0fBE2AAAA8, 0fBEFFFFFF, %p1263;
fma.rn.f32 %f4461, %f4459, %f1743, %f4460;
mov.f32 %f4462, 0f00000000;
fma.rn.f32 %f4463, %f1743, %f1742, %f4462;
fma.rn.f32 %f5283, %f4461, %f4463, %f1742;
and.b32 %r6688, %r1967, 2;
setp.eq.s32 %p1265, %r6688, 0;
@%p1265 bra $L__BB0_1496;
mov.f32 %f4465, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f4465, %f4462;
$L__BB0_1496:
selp.f32 %f1750, %f5283, %f5284, %p25;
selp.f32 %f1751, %f5281, %f5282, %p25;
@%p1244 bra $L__BB0_1498;
add.f32 %f5996, %f1751, %f1750;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1498:
@%p1197 bra $L__BB0_1527;
shl.b32 %r6689, %r12, 5;
neg.s32 %r1969, %r6689;
setp.ge.s32 %p1269, %r11, %r1969;
@%p1269 bra $L__BB0_1512;
mul.f32 %f4468, %f5414, 0f3F22F983;
cvt.rni.s32.f32 %r8370, %f4468;
cvt.rn.f32.s32 %f4469, %r8370;
mov.f32 %f4470, 0fBFC90FDA;
fma.rn.f32 %f4471, %f4469, %f4470, %f5414;
mov.f32 %f4472, 0fB3A22168;
fma.rn.f32 %f4473, %f4469, %f4472, %f4471;
mov.f32 %f4474, 0fA7C234C5;
fma.rn.f32 %f5869, %f4469, %f4474, %f4473;
abs.f32 %f1759, %f5414;
setp.ltu.f32 %p1270, %f1759, 0f47CE4780;
@%p1270 bra $L__BB0_1508;
setp.eq.f32 %p1271, %f1759, 0f7F800000;
@%p1271 bra $L__BB0_1507;
bra.uni $L__BB0_1502;
$L__BB0_1507:
mov.f32 %f4477, 0f00000000;
mul.rn.f32 %f5869, %f5414, %f4477;
mov.u32 %r8370, 0;
bra.uni $L__BB0_1508;
$L__BB0_1502:
mov.b32 %r1971, %f5414;
shr.u32 %r6691, %r1971, 23;
and.b32 %r6692, %r6691, 255;
add.s32 %r1972, %r6692, -128;
shl.b32 %r6693, %r1971, 8;
or.b32 %r1973, %r6693, -2147483648;
shr.u32 %r1974, %r1972, 5;
mov.u64 %rd2711, 0;
mov.u32 %r8367, 0;
mov.u64 %rd2709, __cudart_i2opi_f;
mov.u64 %rd2710, %rd1;
$L__BB0_1503:
.pragma "nounroll";
ld.global.nc.u32 %r6694, [%rd2709];
mad.wide.u32 %rd2185, %r6694, %r1973, %rd2711;
shr.u64 %rd2711, %rd2185, 32;
st.local.u32 [%rd2710], %rd2185;
add.s64 %rd2710, %rd2710, 4;
add.s64 %rd2709, %rd2709, 4;
add.s32 %r8367, %r8367, 1;
setp.ne.s32 %p1272, %r8367, 6;
@%p1272 bra $L__BB0_1503;
st.local.u32 [%rd4], %rd2711;
mov.u32 %r6695, 4;
sub.s32 %r1977, %r6695, %r1974;
mov.u32 %r6696, 6;
sub.s32 %r6697, %r6696, %r1974;
mul.wide.s32 %rd2186, %r6697, 4;
add.s64 %rd2187, %rd1, %rd2186;
ld.local.u32 %r8368, [%rd2187];
ld.local.u32 %r8369, [%rd2187+-4];
and.b32 %r1980, %r1972, 31;
setp.eq.s32 %p1273, %r1980, 0;
@%p1273 bra $L__BB0_1506;
mov.u32 %r6698, 32;
sub.s32 %r6699, %r6698, %r1980;
shr.u32 %r6700, %r8369, %r6699;
shl.b32 %r6701, %r8368, %r1980;
add.s32 %r8368, %r6700, %r6701;
mul.wide.s32 %rd2188, %r1977, 4;
add.s64 %rd2189, %rd1, %rd2188;
ld.local.u32 %r6702, [%rd2189];
shr.u32 %r6703, %r6702, %r6699;
shl.b32 %r6704, %r8369, %r1980;
add.s32 %r8369, %r6703, %r6704;
$L__BB0_1506:
and.b32 %r6705, %r1971, -2147483648;
shr.u32 %r6706, %r8369, 30;
shl.b32 %r6707, %r8368, 2;
or.b32 %r6708, %r6706, %r6707;
shr.u32 %r6709, %r6708, 31;
shr.u32 %r6710, %r8368, 30;
add.s32 %r6711, %r6709, %r6710;
neg.s32 %r6712, %r6711;
setp.eq.s32 %p1274, %r6705, 0;
selp.b32 %r8370, %r6711, %r6712, %p1274;
setp.ne.s32 %p1275, %r6709, 0;
xor.b32 %r6713, %r6705, -2147483648;
selp.b32 %r6714, %r6713, %r6705, %p1275;
selp.b32 %r6715, -1, 0, %p1275;
xor.b32 %r6716, %r6708, %r6715;
shl.b32 %r6717, %r8369, 2;
xor.b32 %r6718, %r6717, %r6715;
cvt.u64.u32 %rd2190, %r6716;
cvt.u64.u32 %rd2191, %r6718;
bfi.b64 %rd2192, %rd2190, %rd2191, 32, 32;
cvt.rn.f64.s64 %fd201, %rd2192;
mul.f64 %fd202, %fd201, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4475, %fd202;
setp.eq.s32 %p1276, %r6714, 0;
neg.f32 %f4476, %f4475;
selp.f32 %f5869, %f4475, %f4476, %p1276;
$L__BB0_1508:
and.b32 %r1987, %r8370, 1;
setp.eq.s32 %p1277, %r1987, 0;
selp.f32 %f1763, %f5869, 0f3F800000, %p1277;
mul.rn.f32 %f1764, %f5869, %f5869;
mov.f32 %f5870, 0fB94D4153;
@%p1277 bra $L__BB0_1510;
mov.f32 %f4479, 0fBAB607ED;
mov.f32 %f4480, 0f37CBAC00;
fma.rn.f32 %f5870, %f4480, %f1764, %f4479;
$L__BB0_1510:
selp.f32 %f4481, 0f3C0885E4, 0f3D2AAABB, %p1277;
fma.rn.f32 %f4482, %f5870, %f1764, %f4481;
selp.f32 %f4483, 0fBE2AAAA8, 0fBEFFFFFF, %p1277;
fma.rn.f32 %f4484, %f4482, %f1764, %f4483;
mov.f32 %f4485, 0f00000000;
fma.rn.f32 %f4486, %f1764, %f1763, %f4485;
fma.rn.f32 %f5281, %f4484, %f4486, %f1763;
and.b32 %r6720, %r8370, 2;
setp.eq.s32 %p1279, %r6720, 0;
@%p1279 bra $L__BB0_1512;
mov.f32 %f4488, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f4488, %f4485;
$L__BB0_1512:
setp.lt.s32 %p26, %r11, %r1969;
@%p1269 bra $L__BB0_1525;
mul.f32 %f4489, %f5406, 0f3F22F983;
cvt.rni.s32.f32 %r8374, %f4489;
cvt.rn.f32.s32 %f4490, %r8374;
mov.f32 %f4491, 0fBFC90FDA;
fma.rn.f32 %f4492, %f4490, %f4491, %f5406;
mov.f32 %f4493, 0fB3A22168;
fma.rn.f32 %f4494, %f4490, %f4493, %f4492;
mov.f32 %f4495, 0fA7C234C5;
fma.rn.f32 %f5873, %f4490, %f4495, %f4494;
abs.f32 %f1772, %f5406;
setp.ltu.f32 %p1281, %f1772, 0f47CE4780;
@%p1281 bra $L__BB0_1521;
setp.eq.f32 %p1282, %f1772, 0f7F800000;
@%p1282 bra $L__BB0_1520;
bra.uni $L__BB0_1515;
$L__BB0_1520:
mov.f32 %f4498, 0f00000000;
mul.rn.f32 %f5873, %f5406, %f4498;
mov.u32 %r8374, 0;
bra.uni $L__BB0_1521;
$L__BB0_1515:
mov.b32 %r1989, %f5406;
shr.u32 %r6722, %r1989, 23;
and.b32 %r6723, %r6722, 255;
add.s32 %r1990, %r6723, -128;
shl.b32 %r6724, %r1989, 8;
or.b32 %r1991, %r6724, -2147483648;
shr.u32 %r1992, %r1990, 5;
mov.u64 %rd2714, 0;
mov.u32 %r8371, 0;
mov.u64 %rd2712, __cudart_i2opi_f;
mov.u64 %rd2713, %rd1;
$L__BB0_1516:
.pragma "nounroll";
ld.global.nc.u32 %r6725, [%rd2712];
mad.wide.u32 %rd2195, %r6725, %r1991, %rd2714;
shr.u64 %rd2714, %rd2195, 32;
st.local.u32 [%rd2713], %rd2195;
add.s64 %rd2713, %rd2713, 4;
add.s64 %rd2712, %rd2712, 4;
add.s32 %r8371, %r8371, 1;
setp.ne.s32 %p1283, %r8371, 6;
@%p1283 bra $L__BB0_1516;
st.local.u32 [%rd4], %rd2714;
mov.u32 %r6726, 4;
sub.s32 %r1995, %r6726, %r1992;
mov.u32 %r6727, 6;
sub.s32 %r6728, %r6727, %r1992;
mul.wide.s32 %rd2196, %r6728, 4;
add.s64 %rd2197, %rd1, %rd2196;
ld.local.u32 %r8372, [%rd2197];
ld.local.u32 %r8373, [%rd2197+-4];
and.b32 %r1998, %r1990, 31;
setp.eq.s32 %p1284, %r1998, 0;
@%p1284 bra $L__BB0_1519;
mov.u32 %r6729, 32;
sub.s32 %r6730, %r6729, %r1998;
shr.u32 %r6731, %r8373, %r6730;
shl.b32 %r6732, %r8372, %r1998;
add.s32 %r8372, %r6731, %r6732;
mul.wide.s32 %rd2198, %r1995, 4;
add.s64 %rd2199, %rd1, %rd2198;
ld.local.u32 %r6733, [%rd2199];
shr.u32 %r6734, %r6733, %r6730;
shl.b32 %r6735, %r8373, %r1998;
add.s32 %r8373, %r6734, %r6735;
$L__BB0_1519:
and.b32 %r6736, %r1989, -2147483648;
shr.u32 %r6737, %r8373, 30;
shl.b32 %r6738, %r8372, 2;
or.b32 %r6739, %r6737, %r6738;
shr.u32 %r6740, %r6739, 31;
shr.u32 %r6741, %r8372, 30;
add.s32 %r6742, %r6740, %r6741;
neg.s32 %r6743, %r6742;
setp.eq.s32 %p1285, %r6736, 0;
selp.b32 %r8374, %r6742, %r6743, %p1285;
setp.ne.s32 %p1286, %r6740, 0;
xor.b32 %r6744, %r6736, -2147483648;
selp.b32 %r6745, %r6744, %r6736, %p1286;
selp.b32 %r6746, -1, 0, %p1286;
xor.b32 %r6747, %r6739, %r6746;
shl.b32 %r6748, %r8373, 2;
xor.b32 %r6749, %r6748, %r6746;
cvt.u64.u32 %rd2200, %r6747;
cvt.u64.u32 %rd2201, %r6749;
bfi.b64 %rd2202, %rd2200, %rd2201, 32, 32;
cvt.rn.f64.s64 %fd203, %rd2202;
mul.f64 %fd204, %fd203, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4496, %fd204;
setp.eq.s32 %p1287, %r6745, 0;
neg.f32 %f4497, %f4496;
selp.f32 %f5873, %f4496, %f4497, %p1287;
$L__BB0_1521:
add.s32 %r2005, %r8374, 1;
and.b32 %r2006, %r2005, 1;
setp.eq.s32 %p1288, %r2006, 0;
selp.f32 %f1776, %f5873, 0f3F800000, %p1288;
mul.rn.f32 %f1777, %f5873, %f5873;
mov.f32 %f5874, 0fB94D4153;
@%p1288 bra $L__BB0_1523;
mov.f32 %f4500, 0fBAB607ED;
mov.f32 %f4501, 0f37CBAC00;
fma.rn.f32 %f5874, %f4501, %f1777, %f4500;
$L__BB0_1523:
selp.f32 %f4502, 0f3C0885E4, 0f3D2AAABB, %p1288;
fma.rn.f32 %f4503, %f5874, %f1777, %f4502;
selp.f32 %f4504, 0fBE2AAAA8, 0fBEFFFFFF, %p1288;
fma.rn.f32 %f4505, %f4503, %f1777, %f4504;
mov.f32 %f4506, 0f00000000;
fma.rn.f32 %f4507, %f1777, %f1776, %f4506;
fma.rn.f32 %f5283, %f4505, %f4507, %f1776;
and.b32 %r6751, %r2005, 2;
setp.eq.s32 %p1290, %r6751, 0;
@%p1290 bra $L__BB0_1525;
mov.f32 %f4509, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f4509, %f4506;
$L__BB0_1525:
selp.f32 %f1784, %f5283, %f5284, %p26;
selp.f32 %f1785, %f5281, %f5282, %p26;
@%p1269 bra $L__BB0_1527;
add.f32 %f5995, %f1785, %f1784;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1527:
@%p1197 bra $L__BB0_1556;
shl.b32 %r6752, %r12, 5;
mov.u32 %r6753, -32;
sub.s32 %r2007, %r6753, %r6752;
setp.ge.s32 %p1294, %r11, %r2007;
@%p1294 bra $L__BB0_1541;
mul.f32 %f4512, %f5413, 0f3F22F983;
cvt.rni.s32.f32 %r8378, %f4512;
cvt.rn.f32.s32 %f4513, %r8378;
mov.f32 %f4514, 0fBFC90FDA;
fma.rn.f32 %f4515, %f4513, %f4514, %f5413;
mov.f32 %f4516, 0fB3A22168;
fma.rn.f32 %f4517, %f4513, %f4516, %f4515;
mov.f32 %f4518, 0fA7C234C5;
fma.rn.f32 %f5882, %f4513, %f4518, %f4517;
abs.f32 %f1793, %f5413;
setp.ltu.f32 %p1295, %f1793, 0f47CE4780;
@%p1295 bra $L__BB0_1537;
setp.eq.f32 %p1296, %f1793, 0f7F800000;
@%p1296 bra $L__BB0_1536;
bra.uni $L__BB0_1531;
$L__BB0_1536:
mov.f32 %f4521, 0f00000000;
mul.rn.f32 %f5882, %f5413, %f4521;
mov.u32 %r8378, 0;
bra.uni $L__BB0_1537;
$L__BB0_1531:
mov.b32 %r2009, %f5413;
shr.u32 %r6755, %r2009, 23;
and.b32 %r6756, %r6755, 255;
add.s32 %r2010, %r6756, -128;
shl.b32 %r6757, %r2009, 8;
or.b32 %r2011, %r6757, -2147483648;
shr.u32 %r2012, %r2010, 5;
mov.u64 %rd2717, 0;
mov.u32 %r8375, 0;
mov.u64 %rd2715, __cudart_i2opi_f;
mov.u64 %rd2716, %rd1;
$L__BB0_1532:
.pragma "nounroll";
ld.global.nc.u32 %r6758, [%rd2715];
mad.wide.u32 %rd2205, %r6758, %r2011, %rd2717;
shr.u64 %rd2717, %rd2205, 32;
st.local.u32 [%rd2716], %rd2205;
add.s64 %rd2716, %rd2716, 4;
add.s64 %rd2715, %rd2715, 4;
add.s32 %r8375, %r8375, 1;
setp.ne.s32 %p1297, %r8375, 6;
@%p1297 bra $L__BB0_1532;
st.local.u32 [%rd4], %rd2717;
mov.u32 %r6759, 4;
sub.s32 %r2015, %r6759, %r2012;
mov.u32 %r6760, 6;
sub.s32 %r6761, %r6760, %r2012;
mul.wide.s32 %rd2206, %r6761, 4;
add.s64 %rd2207, %rd1, %rd2206;
ld.local.u32 %r8376, [%rd2207];
ld.local.u32 %r8377, [%rd2207+-4];
and.b32 %r2018, %r2010, 31;
setp.eq.s32 %p1298, %r2018, 0;
@%p1298 bra $L__BB0_1535;
mov.u32 %r6762, 32;
sub.s32 %r6763, %r6762, %r2018;
shr.u32 %r6764, %r8377, %r6763;
shl.b32 %r6765, %r8376, %r2018;
add.s32 %r8376, %r6764, %r6765;
mul.wide.s32 %rd2208, %r2015, 4;
add.s64 %rd2209, %rd1, %rd2208;
ld.local.u32 %r6766, [%rd2209];
shr.u32 %r6767, %r6766, %r6763;
shl.b32 %r6768, %r8377, %r2018;
add.s32 %r8377, %r6767, %r6768;
$L__BB0_1535:
and.b32 %r6769, %r2009, -2147483648;
shr.u32 %r6770, %r8377, 30;
shl.b32 %r6771, %r8376, 2;
or.b32 %r6772, %r6770, %r6771;
shr.u32 %r6773, %r6772, 31;
shr.u32 %r6774, %r8376, 30;
add.s32 %r6775, %r6773, %r6774;
neg.s32 %r6776, %r6775;
setp.eq.s32 %p1299, %r6769, 0;
selp.b32 %r8378, %r6775, %r6776, %p1299;
setp.ne.s32 %p1300, %r6773, 0;
xor.b32 %r6777, %r6769, -2147483648;
selp.b32 %r6778, %r6777, %r6769, %p1300;
selp.b32 %r6779, -1, 0, %p1300;
xor.b32 %r6780, %r6772, %r6779;
shl.b32 %r6781, %r8377, 2;
xor.b32 %r6782, %r6781, %r6779;
cvt.u64.u32 %rd2210, %r6780;
cvt.u64.u32 %rd2211, %r6782;
bfi.b64 %rd2212, %rd2210, %rd2211, 32, 32;
cvt.rn.f64.s64 %fd205, %rd2212;
mul.f64 %fd206, %fd205, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4519, %fd206;
setp.eq.s32 %p1301, %r6778, 0;
neg.f32 %f4520, %f4519;
selp.f32 %f5882, %f4519, %f4520, %p1301;
$L__BB0_1537:
and.b32 %r2025, %r8378, 1;
setp.eq.s32 %p1302, %r2025, 0;
selp.f32 %f1797, %f5882, 0f3F800000, %p1302;
mul.rn.f32 %f1798, %f5882, %f5882;
mov.f32 %f5883, 0fB94D4153;
@%p1302 bra $L__BB0_1539;
mov.f32 %f4523, 0fBAB607ED;
mov.f32 %f4524, 0f37CBAC00;
fma.rn.f32 %f5883, %f4524, %f1798, %f4523;
$L__BB0_1539:
selp.f32 %f4525, 0f3C0885E4, 0f3D2AAABB, %p1302;
fma.rn.f32 %f4526, %f5883, %f1798, %f4525;
selp.f32 %f4527, 0fBE2AAAA8, 0fBEFFFFFF, %p1302;
fma.rn.f32 %f4528, %f4526, %f1798, %f4527;
mov.f32 %f4529, 0f00000000;
fma.rn.f32 %f4530, %f1798, %f1797, %f4529;
fma.rn.f32 %f5281, %f4528, %f4530, %f1797;
and.b32 %r6784, %r8378, 2;
setp.eq.s32 %p1304, %r6784, 0;
@%p1304 bra $L__BB0_1541;
mov.f32 %f4532, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f4532, %f4529;
$L__BB0_1541:
setp.lt.s32 %p27, %r11, %r2007;
@%p1294 bra $L__BB0_1554;
mul.f32 %f4533, %f5405, 0f3F22F983;
cvt.rni.s32.f32 %r8382, %f4533;
cvt.rn.f32.s32 %f4534, %r8382;
mov.f32 %f4535, 0fBFC90FDA;
fma.rn.f32 %f4536, %f4534, %f4535, %f5405;
mov.f32 %f4537, 0fB3A22168;
fma.rn.f32 %f4538, %f4534, %f4537, %f4536;
mov.f32 %f4539, 0fA7C234C5;
fma.rn.f32 %f5886, %f4534, %f4539, %f4538;
abs.f32 %f1806, %f5405;
setp.ltu.f32 %p1306, %f1806, 0f47CE4780;
@%p1306 bra $L__BB0_1550;
setp.eq.f32 %p1307, %f1806, 0f7F800000;
@%p1307 bra $L__BB0_1549;
bra.uni $L__BB0_1544;
$L__BB0_1549:
mov.f32 %f4542, 0f00000000;
mul.rn.f32 %f5886, %f5405, %f4542;
mov.u32 %r8382, 0;
bra.uni $L__BB0_1550;
$L__BB0_1544:
mov.b32 %r2027, %f5405;
shr.u32 %r6786, %r2027, 23;
and.b32 %r6787, %r6786, 255;
add.s32 %r2028, %r6787, -128;
shl.b32 %r6788, %r2027, 8;
or.b32 %r2029, %r6788, -2147483648;
shr.u32 %r2030, %r2028, 5;
mov.u64 %rd2720, 0;
mov.u32 %r8379, 0;
mov.u64 %rd2718, __cudart_i2opi_f;
mov.u64 %rd2719, %rd1;
$L__BB0_1545:
.pragma "nounroll";
ld.global.nc.u32 %r6789, [%rd2718];
mad.wide.u32 %rd2215, %r6789, %r2029, %rd2720;
shr.u64 %rd2720, %rd2215, 32;
st.local.u32 [%rd2719], %rd2215;
add.s64 %rd2719, %rd2719, 4;
add.s64 %rd2718, %rd2718, 4;
add.s32 %r8379, %r8379, 1;
setp.ne.s32 %p1308, %r8379, 6;
@%p1308 bra $L__BB0_1545;
st.local.u32 [%rd4], %rd2720;
mov.u32 %r6790, 4;
sub.s32 %r2033, %r6790, %r2030;
mov.u32 %r6791, 6;
sub.s32 %r6792, %r6791, %r2030;
mul.wide.s32 %rd2216, %r6792, 4;
add.s64 %rd2217, %rd1, %rd2216;
ld.local.u32 %r8380, [%rd2217];
ld.local.u32 %r8381, [%rd2217+-4];
and.b32 %r2036, %r2028, 31;
setp.eq.s32 %p1309, %r2036, 0;
@%p1309 bra $L__BB0_1548;
mov.u32 %r6793, 32;
sub.s32 %r6794, %r6793, %r2036;
shr.u32 %r6795, %r8381, %r6794;
shl.b32 %r6796, %r8380, %r2036;
add.s32 %r8380, %r6795, %r6796;
mul.wide.s32 %rd2218, %r2033, 4;
add.s64 %rd2219, %rd1, %rd2218;
ld.local.u32 %r6797, [%rd2219];
shr.u32 %r6798, %r6797, %r6794;
shl.b32 %r6799, %r8381, %r2036;
add.s32 %r8381, %r6798, %r6799;
$L__BB0_1548:
and.b32 %r6800, %r2027, -2147483648;
shr.u32 %r6801, %r8381, 30;
shl.b32 %r6802, %r8380, 2;
or.b32 %r6803, %r6801, %r6802;
shr.u32 %r6804, %r6803, 31;
shr.u32 %r6805, %r8380, 30;
add.s32 %r6806, %r6804, %r6805;
neg.s32 %r6807, %r6806;
setp.eq.s32 %p1310, %r6800, 0;
selp.b32 %r8382, %r6806, %r6807, %p1310;
setp.ne.s32 %p1311, %r6804, 0;
xor.b32 %r6808, %r6800, -2147483648;
selp.b32 %r6809, %r6808, %r6800, %p1311;
selp.b32 %r6810, -1, 0, %p1311;
xor.b32 %r6811, %r6803, %r6810;
shl.b32 %r6812, %r8381, 2;
xor.b32 %r6813, %r6812, %r6810;
cvt.u64.u32 %rd2220, %r6811;
cvt.u64.u32 %rd2221, %r6813;
bfi.b64 %rd2222, %rd2220, %rd2221, 32, 32;
cvt.rn.f64.s64 %fd207, %rd2222;
mul.f64 %fd208, %fd207, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4540, %fd208;
setp.eq.s32 %p1312, %r6809, 0;
neg.f32 %f4541, %f4540;
selp.f32 %f5886, %f4540, %f4541, %p1312;
$L__BB0_1550:
add.s32 %r2043, %r8382, 1;
and.b32 %r2044, %r2043, 1;
setp.eq.s32 %p1313, %r2044, 0;
selp.f32 %f1810, %f5886, 0f3F800000, %p1313;
mul.rn.f32 %f1811, %f5886, %f5886;
mov.f32 %f5887, 0fB94D4153;
@%p1313 bra $L__BB0_1552;
mov.f32 %f4544, 0fBAB607ED;
mov.f32 %f4545, 0f37CBAC00;
fma.rn.f32 %f5887, %f4545, %f1811, %f4544;
$L__BB0_1552:
selp.f32 %f4546, 0f3C0885E4, 0f3D2AAABB, %p1313;
fma.rn.f32 %f4547, %f5887, %f1811, %f4546;
selp.f32 %f4548, 0fBE2AAAA8, 0fBEFFFFFF, %p1313;
fma.rn.f32 %f4549, %f4547, %f1811, %f4548;
mov.f32 %f4550, 0f00000000;
fma.rn.f32 %f4551, %f1811, %f1810, %f4550;
fma.rn.f32 %f5283, %f4549, %f4551, %f1810;
and.b32 %r6815, %r2043, 2;
setp.eq.s32 %p1315, %r6815, 0;
@%p1315 bra $L__BB0_1554;
mov.f32 %f4553, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f4553, %f4550;
$L__BB0_1554:
selp.f32 %f1818, %f5283, %f5284, %p27;
selp.f32 %f1819, %f5281, %f5282, %p27;
@%p1294 bra $L__BB0_1556;
add.f32 %f5994, %f1819, %f1818;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1556:
@%p1200 bra $L__BB0_1585;
shl.b32 %r6816, %r12, 5;
neg.s32 %r2045, %r6816;
setp.ge.s32 %p1319, %r11, %r2045;
@%p1319 bra $L__BB0_1570;
mul.f32 %f4556, %f5412, 0f3F22F983;
cvt.rni.s32.f32 %r8386, %f4556;
cvt.rn.f32.s32 %f4557, %r8386;
mov.f32 %f4558, 0fBFC90FDA;
fma.rn.f32 %f4559, %f4557, %f4558, %f5412;
mov.f32 %f4560, 0fB3A22168;
fma.rn.f32 %f4561, %f4557, %f4560, %f4559;
mov.f32 %f4562, 0fA7C234C5;
fma.rn.f32 %f5895, %f4557, %f4562, %f4561;
abs.f32 %f1827, %f5412;
setp.ltu.f32 %p1320, %f1827, 0f47CE4780;
@%p1320 bra $L__BB0_1566;
setp.eq.f32 %p1321, %f1827, 0f7F800000;
@%p1321 bra $L__BB0_1565;
bra.uni $L__BB0_1560;
$L__BB0_1565:
mov.f32 %f4565, 0f00000000;
mul.rn.f32 %f5895, %f5412, %f4565;
mov.u32 %r8386, 0;
bra.uni $L__BB0_1566;
$L__BB0_1560:
mov.b32 %r2047, %f5412;
shr.u32 %r6818, %r2047, 23;
and.b32 %r6819, %r6818, 255;
add.s32 %r2048, %r6819, -128;
shl.b32 %r6820, %r2047, 8;
or.b32 %r2049, %r6820, -2147483648;
shr.u32 %r2050, %r2048, 5;
mov.u64 %rd2723, 0;
mov.u32 %r8383, 0;
mov.u64 %rd2721, __cudart_i2opi_f;
mov.u64 %rd2722, %rd1;
$L__BB0_1561:
.pragma "nounroll";
ld.global.nc.u32 %r6821, [%rd2721];
mad.wide.u32 %rd2225, %r6821, %r2049, %rd2723;
shr.u64 %rd2723, %rd2225, 32;
st.local.u32 [%rd2722], %rd2225;
add.s64 %rd2722, %rd2722, 4;
add.s64 %rd2721, %rd2721, 4;
add.s32 %r8383, %r8383, 1;
setp.ne.s32 %p1322, %r8383, 6;
@%p1322 bra $L__BB0_1561;
st.local.u32 [%rd4], %rd2723;
mov.u32 %r6822, 4;
sub.s32 %r2053, %r6822, %r2050;
mov.u32 %r6823, 6;
sub.s32 %r6824, %r6823, %r2050;
mul.wide.s32 %rd2226, %r6824, 4;
add.s64 %rd2227, %rd1, %rd2226;
ld.local.u32 %r8384, [%rd2227];
ld.local.u32 %r8385, [%rd2227+-4];
and.b32 %r2056, %r2048, 31;
setp.eq.s32 %p1323, %r2056, 0;
@%p1323 bra $L__BB0_1564;
mov.u32 %r6825, 32;
sub.s32 %r6826, %r6825, %r2056;
shr.u32 %r6827, %r8385, %r6826;
shl.b32 %r6828, %r8384, %r2056;
add.s32 %r8384, %r6827, %r6828;
mul.wide.s32 %rd2228, %r2053, 4;
add.s64 %rd2229, %rd1, %rd2228;
ld.local.u32 %r6829, [%rd2229];
shr.u32 %r6830, %r6829, %r6826;
shl.b32 %r6831, %r8385, %r2056;
add.s32 %r8385, %r6830, %r6831;
$L__BB0_1564:
and.b32 %r6832, %r2047, -2147483648;
shr.u32 %r6833, %r8385, 30;
shl.b32 %r6834, %r8384, 2;
or.b32 %r6835, %r6833, %r6834;
shr.u32 %r6836, %r6835, 31;
shr.u32 %r6837, %r8384, 30;
add.s32 %r6838, %r6836, %r6837;
neg.s32 %r6839, %r6838;
setp.eq.s32 %p1324, %r6832, 0;
selp.b32 %r8386, %r6838, %r6839, %p1324;
setp.ne.s32 %p1325, %r6836, 0;
xor.b32 %r6840, %r6832, -2147483648;
selp.b32 %r6841, %r6840, %r6832, %p1325;
selp.b32 %r6842, -1, 0, %p1325;
xor.b32 %r6843, %r6835, %r6842;
shl.b32 %r6844, %r8385, 2;
xor.b32 %r6845, %r6844, %r6842;
cvt.u64.u32 %rd2230, %r6843;
cvt.u64.u32 %rd2231, %r6845;
bfi.b64 %rd2232, %rd2230, %rd2231, 32, 32;
cvt.rn.f64.s64 %fd209, %rd2232;
mul.f64 %fd210, %fd209, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4563, %fd210;
setp.eq.s32 %p1326, %r6841, 0;
neg.f32 %f4564, %f4563;
selp.f32 %f5895, %f4563, %f4564, %p1326;
$L__BB0_1566:
and.b32 %r2063, %r8386, 1;
setp.eq.s32 %p1327, %r2063, 0;
selp.f32 %f1831, %f5895, 0f3F800000, %p1327;
mul.rn.f32 %f1832, %f5895, %f5895;
mov.f32 %f5896, 0fB94D4153;
@%p1327 bra $L__BB0_1568;
mov.f32 %f4567, 0fBAB607ED;
mov.f32 %f4568, 0f37CBAC00;
fma.rn.f32 %f5896, %f4568, %f1832, %f4567;
$L__BB0_1568:
selp.f32 %f4569, 0f3C0885E4, 0f3D2AAABB, %p1327;
fma.rn.f32 %f4570, %f5896, %f1832, %f4569;
selp.f32 %f4571, 0fBE2AAAA8, 0fBEFFFFFF, %p1327;
fma.rn.f32 %f4572, %f4570, %f1832, %f4571;
mov.f32 %f4573, 0f00000000;
fma.rn.f32 %f4574, %f1832, %f1831, %f4573;
fma.rn.f32 %f5281, %f4572, %f4574, %f1831;
and.b32 %r6847, %r8386, 2;
setp.eq.s32 %p1329, %r6847, 0;
@%p1329 bra $L__BB0_1570;
mov.f32 %f4576, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f4576, %f4573;
$L__BB0_1570:
setp.lt.s32 %p28, %r11, %r2045;
@%p1319 bra $L__BB0_1583;
mul.f32 %f4577, %f5404, 0f3F22F983;
cvt.rni.s32.f32 %r8390, %f4577;
cvt.rn.f32.s32 %f4578, %r8390;
mov.f32 %f4579, 0fBFC90FDA;
fma.rn.f32 %f4580, %f4578, %f4579, %f5404;
mov.f32 %f4581, 0fB3A22168;
fma.rn.f32 %f4582, %f4578, %f4581, %f4580;
mov.f32 %f4583, 0fA7C234C5;
fma.rn.f32 %f5899, %f4578, %f4583, %f4582;
abs.f32 %f1840, %f5404;
setp.ltu.f32 %p1331, %f1840, 0f47CE4780;
@%p1331 bra $L__BB0_1579;
setp.eq.f32 %p1332, %f1840, 0f7F800000;
@%p1332 bra $L__BB0_1578;
bra.uni $L__BB0_1573;
$L__BB0_1578:
mov.f32 %f4586, 0f00000000;
mul.rn.f32 %f5899, %f5404, %f4586;
mov.u32 %r8390, 0;
bra.uni $L__BB0_1579;
$L__BB0_1573:
mov.b32 %r2065, %f5404;
shr.u32 %r6849, %r2065, 23;
and.b32 %r6850, %r6849, 255;
add.s32 %r2066, %r6850, -128;
shl.b32 %r6851, %r2065, 8;
or.b32 %r2067, %r6851, -2147483648;
shr.u32 %r2068, %r2066, 5;
mov.u64 %rd2726, 0;
mov.u32 %r8387, 0;
mov.u64 %rd2724, __cudart_i2opi_f;
mov.u64 %rd2725, %rd1;
$L__BB0_1574:
.pragma "nounroll";
ld.global.nc.u32 %r6852, [%rd2724];
mad.wide.u32 %rd2235, %r6852, %r2067, %rd2726;
shr.u64 %rd2726, %rd2235, 32;
st.local.u32 [%rd2725], %rd2235;
add.s64 %rd2725, %rd2725, 4;
add.s64 %rd2724, %rd2724, 4;
add.s32 %r8387, %r8387, 1;
setp.ne.s32 %p1333, %r8387, 6;
@%p1333 bra $L__BB0_1574;
st.local.u32 [%rd4], %rd2726;
mov.u32 %r6853, 4;
sub.s32 %r2071, %r6853, %r2068;
mov.u32 %r6854, 6;
sub.s32 %r6855, %r6854, %r2068;
mul.wide.s32 %rd2236, %r6855, 4;
add.s64 %rd2237, %rd1, %rd2236;
ld.local.u32 %r8388, [%rd2237];
ld.local.u32 %r8389, [%rd2237+-4];
and.b32 %r2074, %r2066, 31;
setp.eq.s32 %p1334, %r2074, 0;
@%p1334 bra $L__BB0_1577;
mov.u32 %r6856, 32;
sub.s32 %r6857, %r6856, %r2074;
shr.u32 %r6858, %r8389, %r6857;
shl.b32 %r6859, %r8388, %r2074;
add.s32 %r8388, %r6858, %r6859;
mul.wide.s32 %rd2238, %r2071, 4;
add.s64 %rd2239, %rd1, %rd2238;
ld.local.u32 %r6860, [%rd2239];
shr.u32 %r6861, %r6860, %r6857;
shl.b32 %r6862, %r8389, %r2074;
add.s32 %r8389, %r6861, %r6862;
$L__BB0_1577:
and.b32 %r6863, %r2065, -2147483648;
shr.u32 %r6864, %r8389, 30;
shl.b32 %r6865, %r8388, 2;
or.b32 %r6866, %r6864, %r6865;
shr.u32 %r6867, %r6866, 31;
shr.u32 %r6868, %r8388, 30;
add.s32 %r6869, %r6867, %r6868;
neg.s32 %r6870, %r6869;
setp.eq.s32 %p1335, %r6863, 0;
selp.b32 %r8390, %r6869, %r6870, %p1335;
setp.ne.s32 %p1336, %r6867, 0;
xor.b32 %r6871, %r6863, -2147483648;
selp.b32 %r6872, %r6871, %r6863, %p1336;
selp.b32 %r6873, -1, 0, %p1336;
xor.b32 %r6874, %r6866, %r6873;
shl.b32 %r6875, %r8389, 2;
xor.b32 %r6876, %r6875, %r6873;
cvt.u64.u32 %rd2240, %r6874;
cvt.u64.u32 %rd2241, %r6876;
bfi.b64 %rd2242, %rd2240, %rd2241, 32, 32;
cvt.rn.f64.s64 %fd211, %rd2242;
mul.f64 %fd212, %fd211, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4584, %fd212;
setp.eq.s32 %p1337, %r6872, 0;
neg.f32 %f4585, %f4584;
selp.f32 %f5899, %f4584, %f4585, %p1337;
$L__BB0_1579:
add.s32 %r2081, %r8390, 1;
and.b32 %r2082, %r2081, 1;
setp.eq.s32 %p1338, %r2082, 0;
selp.f32 %f1844, %f5899, 0f3F800000, %p1338;
mul.rn.f32 %f1845, %f5899, %f5899;
mov.f32 %f5900, 0fB94D4153;
@%p1338 bra $L__BB0_1581;
mov.f32 %f4588, 0fBAB607ED;
mov.f32 %f4589, 0f37CBAC00;
fma.rn.f32 %f5900, %f4589, %f1845, %f4588;
$L__BB0_1581:
selp.f32 %f4590, 0f3C0885E4, 0f3D2AAABB, %p1338;
fma.rn.f32 %f4591, %f5900, %f1845, %f4590;
selp.f32 %f4592, 0fBE2AAAA8, 0fBEFFFFFF, %p1338;
fma.rn.f32 %f4593, %f4591, %f1845, %f4592;
mov.f32 %f4594, 0f00000000;
fma.rn.f32 %f4595, %f1845, %f1844, %f4594;
fma.rn.f32 %f5283, %f4593, %f4595, %f1844;
and.b32 %r6878, %r2081, 2;
setp.eq.s32 %p1340, %r6878, 0;
@%p1340 bra $L__BB0_1583;
mov.f32 %f4597, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f4597, %f4594;
$L__BB0_1583:
selp.f32 %f1852, %f5283, %f5284, %p28;
selp.f32 %f1853, %f5281, %f5282, %p28;
@%p1319 bra $L__BB0_1585;
add.f32 %f5993, %f1853, %f1852;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1585:
@%p1200 bra $L__BB0_1614;
shl.b32 %r6879, %r12, 5;
mov.u32 %r6880, -32;
sub.s32 %r2083, %r6880, %r6879;
setp.ge.s32 %p1344, %r11, %r2083;
@%p1344 bra $L__BB0_1599;
mul.f32 %f4600, %f5411, 0f3F22F983;
cvt.rni.s32.f32 %r8394, %f4600;
cvt.rn.f32.s32 %f4601, %r8394;
mov.f32 %f4602, 0fBFC90FDA;
fma.rn.f32 %f4603, %f4601, %f4602, %f5411;
mov.f32 %f4604, 0fB3A22168;
fma.rn.f32 %f4605, %f4601, %f4604, %f4603;
mov.f32 %f4606, 0fA7C234C5;
fma.rn.f32 %f5908, %f4601, %f4606, %f4605;
abs.f32 %f1861, %f5411;
setp.ltu.f32 %p1345, %f1861, 0f47CE4780;
@%p1345 bra $L__BB0_1595;
setp.eq.f32 %p1346, %f1861, 0f7F800000;
@%p1346 bra $L__BB0_1594;
bra.uni $L__BB0_1589;
$L__BB0_1594:
mov.f32 %f4609, 0f00000000;
mul.rn.f32 %f5908, %f5411, %f4609;
mov.u32 %r8394, 0;
bra.uni $L__BB0_1595;
$L__BB0_1589:
mov.b32 %r2085, %f5411;
shr.u32 %r6882, %r2085, 23;
and.b32 %r6883, %r6882, 255;
add.s32 %r2086, %r6883, -128;
shl.b32 %r6884, %r2085, 8;
or.b32 %r2087, %r6884, -2147483648;
shr.u32 %r2088, %r2086, 5;
mov.u64 %rd2729, 0;
mov.u32 %r8391, 0;
mov.u64 %rd2727, __cudart_i2opi_f;
mov.u64 %rd2728, %rd1;
$L__BB0_1590:
.pragma "nounroll";
ld.global.nc.u32 %r6885, [%rd2727];
mad.wide.u32 %rd2245, %r6885, %r2087, %rd2729;
shr.u64 %rd2729, %rd2245, 32;
st.local.u32 [%rd2728], %rd2245;
add.s64 %rd2728, %rd2728, 4;
add.s64 %rd2727, %rd2727, 4;
add.s32 %r8391, %r8391, 1;
setp.ne.s32 %p1347, %r8391, 6;
@%p1347 bra $L__BB0_1590;
st.local.u32 [%rd4], %rd2729;
mov.u32 %r6886, 4;
sub.s32 %r2091, %r6886, %r2088;
mov.u32 %r6887, 6;
sub.s32 %r6888, %r6887, %r2088;
mul.wide.s32 %rd2246, %r6888, 4;
add.s64 %rd2247, %rd1, %rd2246;
ld.local.u32 %r8392, [%rd2247];
ld.local.u32 %r8393, [%rd2247+-4];
and.b32 %r2094, %r2086, 31;
setp.eq.s32 %p1348, %r2094, 0;
@%p1348 bra $L__BB0_1593;
mov.u32 %r6889, 32;
sub.s32 %r6890, %r6889, %r2094;
shr.u32 %r6891, %r8393, %r6890;
shl.b32 %r6892, %r8392, %r2094;
add.s32 %r8392, %r6891, %r6892;
mul.wide.s32 %rd2248, %r2091, 4;
add.s64 %rd2249, %rd1, %rd2248;
ld.local.u32 %r6893, [%rd2249];
shr.u32 %r6894, %r6893, %r6890;
shl.b32 %r6895, %r8393, %r2094;
add.s32 %r8393, %r6894, %r6895;
$L__BB0_1593:
and.b32 %r6896, %r2085, -2147483648;
shr.u32 %r6897, %r8393, 30;
shl.b32 %r6898, %r8392, 2;
or.b32 %r6899, %r6897, %r6898;
shr.u32 %r6900, %r6899, 31;
shr.u32 %r6901, %r8392, 30;
add.s32 %r6902, %r6900, %r6901;
neg.s32 %r6903, %r6902;
setp.eq.s32 %p1349, %r6896, 0;
selp.b32 %r8394, %r6902, %r6903, %p1349;
setp.ne.s32 %p1350, %r6900, 0;
xor.b32 %r6904, %r6896, -2147483648;
selp.b32 %r6905, %r6904, %r6896, %p1350;
selp.b32 %r6906, -1, 0, %p1350;
xor.b32 %r6907, %r6899, %r6906;
shl.b32 %r6908, %r8393, 2;
xor.b32 %r6909, %r6908, %r6906;
cvt.u64.u32 %rd2250, %r6907;
cvt.u64.u32 %rd2251, %r6909;
bfi.b64 %rd2252, %rd2250, %rd2251, 32, 32;
cvt.rn.f64.s64 %fd213, %rd2252;
mul.f64 %fd214, %fd213, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4607, %fd214;
setp.eq.s32 %p1351, %r6905, 0;
neg.f32 %f4608, %f4607;
selp.f32 %f5908, %f4607, %f4608, %p1351;
$L__BB0_1595:
and.b32 %r2101, %r8394, 1;
setp.eq.s32 %p1352, %r2101, 0;
selp.f32 %f1865, %f5908, 0f3F800000, %p1352;
mul.rn.f32 %f1866, %f5908, %f5908;
mov.f32 %f5909, 0fB94D4153;
@%p1352 bra $L__BB0_1597;
mov.f32 %f4611, 0fBAB607ED;
mov.f32 %f4612, 0f37CBAC00;
fma.rn.f32 %f5909, %f4612, %f1866, %f4611;
$L__BB0_1597:
selp.f32 %f4613, 0f3C0885E4, 0f3D2AAABB, %p1352;
fma.rn.f32 %f4614, %f5909, %f1866, %f4613;
selp.f32 %f4615, 0fBE2AAAA8, 0fBEFFFFFF, %p1352;
fma.rn.f32 %f4616, %f4614, %f1866, %f4615;
mov.f32 %f4617, 0f00000000;
fma.rn.f32 %f4618, %f1866, %f1865, %f4617;
fma.rn.f32 %f5281, %f4616, %f4618, %f1865;
and.b32 %r6911, %r8394, 2;
setp.eq.s32 %p1354, %r6911, 0;
@%p1354 bra $L__BB0_1599;
mov.f32 %f4620, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f4620, %f4617;
$L__BB0_1599:
setp.lt.s32 %p29, %r11, %r2083;
@%p1344 bra $L__BB0_1612;
mul.f32 %f4621, %f5403, 0f3F22F983;
cvt.rni.s32.f32 %r8398, %f4621;
cvt.rn.f32.s32 %f4622, %r8398;
mov.f32 %f4623, 0fBFC90FDA;
fma.rn.f32 %f4624, %f4622, %f4623, %f5403;
mov.f32 %f4625, 0fB3A22168;
fma.rn.f32 %f4626, %f4622, %f4625, %f4624;
mov.f32 %f4627, 0fA7C234C5;
fma.rn.f32 %f5912, %f4622, %f4627, %f4626;
abs.f32 %f1874, %f5403;
setp.ltu.f32 %p1356, %f1874, 0f47CE4780;
@%p1356 bra $L__BB0_1608;
setp.eq.f32 %p1357, %f1874, 0f7F800000;
@%p1357 bra $L__BB0_1607;
bra.uni $L__BB0_1602;
$L__BB0_1607:
mov.f32 %f4630, 0f00000000;
mul.rn.f32 %f5912, %f5403, %f4630;
mov.u32 %r8398, 0;
bra.uni $L__BB0_1608;
$L__BB0_1602:
mov.b32 %r2103, %f5403;
shr.u32 %r6913, %r2103, 23;
and.b32 %r6914, %r6913, 255;
add.s32 %r2104, %r6914, -128;
shl.b32 %r6915, %r2103, 8;
or.b32 %r2105, %r6915, -2147483648;
shr.u32 %r2106, %r2104, 5;
mov.u64 %rd2732, 0;
mov.u32 %r8395, 0;
mov.u64 %rd2730, __cudart_i2opi_f;
mov.u64 %rd2731, %rd1;
$L__BB0_1603:
.pragma "nounroll";
ld.global.nc.u32 %r6916, [%rd2730];
mad.wide.u32 %rd2255, %r6916, %r2105, %rd2732;
shr.u64 %rd2732, %rd2255, 32;
st.local.u32 [%rd2731], %rd2255;
add.s64 %rd2731, %rd2731, 4;
add.s64 %rd2730, %rd2730, 4;
add.s32 %r8395, %r8395, 1;
setp.ne.s32 %p1358, %r8395, 6;
@%p1358 bra $L__BB0_1603;
st.local.u32 [%rd4], %rd2732;
mov.u32 %r6917, 4;
sub.s32 %r2109, %r6917, %r2106;
mov.u32 %r6918, 6;
sub.s32 %r6919, %r6918, %r2106;
mul.wide.s32 %rd2256, %r6919, 4;
add.s64 %rd2257, %rd1, %rd2256;
ld.local.u32 %r8396, [%rd2257];
ld.local.u32 %r8397, [%rd2257+-4];
and.b32 %r2112, %r2104, 31;
setp.eq.s32 %p1359, %r2112, 0;
@%p1359 bra $L__BB0_1606;
mov.u32 %r6920, 32;
sub.s32 %r6921, %r6920, %r2112;
shr.u32 %r6922, %r8397, %r6921;
shl.b32 %r6923, %r8396, %r2112;
add.s32 %r8396, %r6922, %r6923;
mul.wide.s32 %rd2258, %r2109, 4;
add.s64 %rd2259, %rd1, %rd2258;
ld.local.u32 %r6924, [%rd2259];
shr.u32 %r6925, %r6924, %r6921;
shl.b32 %r6926, %r8397, %r2112;
add.s32 %r8397, %r6925, %r6926;
$L__BB0_1606:
and.b32 %r6927, %r2103, -2147483648;
shr.u32 %r6928, %r8397, 30;
shl.b32 %r6929, %r8396, 2;
or.b32 %r6930, %r6928, %r6929;
shr.u32 %r6931, %r6930, 31;
shr.u32 %r6932, %r8396, 30;
add.s32 %r6933, %r6931, %r6932;
neg.s32 %r6934, %r6933;
setp.eq.s32 %p1360, %r6927, 0;
selp.b32 %r8398, %r6933, %r6934, %p1360;
setp.ne.s32 %p1361, %r6931, 0;
xor.b32 %r6935, %r6927, -2147483648;
selp.b32 %r6936, %r6935, %r6927, %p1361;
selp.b32 %r6937, -1, 0, %p1361;
xor.b32 %r6938, %r6930, %r6937;
shl.b32 %r6939, %r8397, 2;
xor.b32 %r6940, %r6939, %r6937;
cvt.u64.u32 %rd2260, %r6938;
cvt.u64.u32 %rd2261, %r6940;
bfi.b64 %rd2262, %rd2260, %rd2261, 32, 32;
cvt.rn.f64.s64 %fd215, %rd2262;
mul.f64 %fd216, %fd215, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4628, %fd216;
setp.eq.s32 %p1362, %r6936, 0;
neg.f32 %f4629, %f4628;
selp.f32 %f5912, %f4628, %f4629, %p1362;
$L__BB0_1608:
add.s32 %r2119, %r8398, 1;
and.b32 %r2120, %r2119, 1;
setp.eq.s32 %p1363, %r2120, 0;
selp.f32 %f1878, %f5912, 0f3F800000, %p1363;
mul.rn.f32 %f1879, %f5912, %f5912;
mov.f32 %f5913, 0fB94D4153;
@%p1363 bra $L__BB0_1610;
mov.f32 %f4632, 0fBAB607ED;
mov.f32 %f4633, 0f37CBAC00;
fma.rn.f32 %f5913, %f4633, %f1879, %f4632;
$L__BB0_1610:
selp.f32 %f4634, 0f3C0885E4, 0f3D2AAABB, %p1363;
fma.rn.f32 %f4635, %f5913, %f1879, %f4634;
selp.f32 %f4636, 0fBE2AAAA8, 0fBEFFFFFF, %p1363;
fma.rn.f32 %f4637, %f4635, %f1879, %f4636;
mov.f32 %f4638, 0f00000000;
fma.rn.f32 %f4639, %f1879, %f1878, %f4638;
fma.rn.f32 %f5283, %f4637, %f4639, %f1878;
and.b32 %r6942, %r2119, 2;
setp.eq.s32 %p1365, %r6942, 0;
@%p1365 bra $L__BB0_1612;
mov.f32 %f4641, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f4641, %f4638;
$L__BB0_1612:
selp.f32 %f1886, %f5283, %f5284, %p29;
selp.f32 %f1887, %f5281, %f5282, %p29;
@%p1344 bra $L__BB0_1614;
add.f32 %f5992, %f1887, %f1886;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1614:
@%p1203 bra $L__BB0_1643;
shl.b32 %r6943, %r12, 5;
neg.s32 %r2121, %r6943;
setp.ge.s32 %p1369, %r11, %r2121;
@%p1369 bra $L__BB0_1628;
mul.f32 %f4644, %f5410, 0f3F22F983;
cvt.rni.s32.f32 %r8402, %f4644;
cvt.rn.f32.s32 %f4645, %r8402;
mov.f32 %f4646, 0fBFC90FDA;
fma.rn.f32 %f4647, %f4645, %f4646, %f5410;
mov.f32 %f4648, 0fB3A22168;
fma.rn.f32 %f4649, %f4645, %f4648, %f4647;
mov.f32 %f4650, 0fA7C234C5;
fma.rn.f32 %f5921, %f4645, %f4650, %f4649;
abs.f32 %f1895, %f5410;
setp.ltu.f32 %p1370, %f1895, 0f47CE4780;
@%p1370 bra $L__BB0_1624;
setp.eq.f32 %p1371, %f1895, 0f7F800000;
@%p1371 bra $L__BB0_1623;
bra.uni $L__BB0_1618;
$L__BB0_1623:
mov.f32 %f4653, 0f00000000;
mul.rn.f32 %f5921, %f5410, %f4653;
mov.u32 %r8402, 0;
bra.uni $L__BB0_1624;
$L__BB0_1618:
mov.b32 %r2123, %f5410;
shr.u32 %r6945, %r2123, 23;
and.b32 %r6946, %r6945, 255;
add.s32 %r2124, %r6946, -128;
shl.b32 %r6947, %r2123, 8;
or.b32 %r2125, %r6947, -2147483648;
shr.u32 %r2126, %r2124, 5;
mov.u64 %rd2735, 0;
mov.u32 %r8399, 0;
mov.u64 %rd2733, __cudart_i2opi_f;
mov.u64 %rd2734, %rd1;
$L__BB0_1619:
.pragma "nounroll";
ld.global.nc.u32 %r6948, [%rd2733];
mad.wide.u32 %rd2265, %r6948, %r2125, %rd2735;
shr.u64 %rd2735, %rd2265, 32;
st.local.u32 [%rd2734], %rd2265;
add.s64 %rd2734, %rd2734, 4;
add.s64 %rd2733, %rd2733, 4;
add.s32 %r8399, %r8399, 1;
setp.ne.s32 %p1372, %r8399, 6;
@%p1372 bra $L__BB0_1619;
st.local.u32 [%rd4], %rd2735;
mov.u32 %r6949, 4;
sub.s32 %r2129, %r6949, %r2126;
mov.u32 %r6950, 6;
sub.s32 %r6951, %r6950, %r2126;
mul.wide.s32 %rd2266, %r6951, 4;
add.s64 %rd2267, %rd1, %rd2266;
ld.local.u32 %r8400, [%rd2267];
ld.local.u32 %r8401, [%rd2267+-4];
and.b32 %r2132, %r2124, 31;
setp.eq.s32 %p1373, %r2132, 0;
@%p1373 bra $L__BB0_1622;
mov.u32 %r6952, 32;
sub.s32 %r6953, %r6952, %r2132;
shr.u32 %r6954, %r8401, %r6953;
shl.b32 %r6955, %r8400, %r2132;
add.s32 %r8400, %r6954, %r6955;
mul.wide.s32 %rd2268, %r2129, 4;
add.s64 %rd2269, %rd1, %rd2268;
ld.local.u32 %r6956, [%rd2269];
shr.u32 %r6957, %r6956, %r6953;
shl.b32 %r6958, %r8401, %r2132;
add.s32 %r8401, %r6957, %r6958;
$L__BB0_1622:
and.b32 %r6959, %r2123, -2147483648;
shr.u32 %r6960, %r8401, 30;
shl.b32 %r6961, %r8400, 2;
or.b32 %r6962, %r6960, %r6961;
shr.u32 %r6963, %r6962, 31;
shr.u32 %r6964, %r8400, 30;
add.s32 %r6965, %r6963, %r6964;
neg.s32 %r6966, %r6965;
setp.eq.s32 %p1374, %r6959, 0;
selp.b32 %r8402, %r6965, %r6966, %p1374;
setp.ne.s32 %p1375, %r6963, 0;
xor.b32 %r6967, %r6959, -2147483648;
selp.b32 %r6968, %r6967, %r6959, %p1375;
selp.b32 %r6969, -1, 0, %p1375;
xor.b32 %r6970, %r6962, %r6969;
shl.b32 %r6971, %r8401, 2;
xor.b32 %r6972, %r6971, %r6969;
cvt.u64.u32 %rd2270, %r6970;
cvt.u64.u32 %rd2271, %r6972;
bfi.b64 %rd2272, %rd2270, %rd2271, 32, 32;
cvt.rn.f64.s64 %fd217, %rd2272;
mul.f64 %fd218, %fd217, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4651, %fd218;
setp.eq.s32 %p1376, %r6968, 0;
neg.f32 %f4652, %f4651;
selp.f32 %f5921, %f4651, %f4652, %p1376;
$L__BB0_1624:
and.b32 %r2139, %r8402, 1;
setp.eq.s32 %p1377, %r2139, 0;
selp.f32 %f1899, %f5921, 0f3F800000, %p1377;
mul.rn.f32 %f1900, %f5921, %f5921;
mov.f32 %f5922, 0fB94D4153;
@%p1377 bra $L__BB0_1626;
mov.f32 %f4655, 0fBAB607ED;
mov.f32 %f4656, 0f37CBAC00;
fma.rn.f32 %f5922, %f4656, %f1900, %f4655;
$L__BB0_1626:
selp.f32 %f4657, 0f3C0885E4, 0f3D2AAABB, %p1377;
fma.rn.f32 %f4658, %f5922, %f1900, %f4657;
selp.f32 %f4659, 0fBE2AAAA8, 0fBEFFFFFF, %p1377;
fma.rn.f32 %f4660, %f4658, %f1900, %f4659;
mov.f32 %f4661, 0f00000000;
fma.rn.f32 %f4662, %f1900, %f1899, %f4661;
fma.rn.f32 %f5281, %f4660, %f4662, %f1899;
and.b32 %r6974, %r8402, 2;
setp.eq.s32 %p1379, %r6974, 0;
@%p1379 bra $L__BB0_1628;
mov.f32 %f4664, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f4664, %f4661;
$L__BB0_1628:
setp.lt.s32 %p30, %r11, %r2121;
@%p1369 bra $L__BB0_1641;
mul.f32 %f4665, %f5402, 0f3F22F983;
cvt.rni.s32.f32 %r8406, %f4665;
cvt.rn.f32.s32 %f4666, %r8406;
mov.f32 %f4667, 0fBFC90FDA;
fma.rn.f32 %f4668, %f4666, %f4667, %f5402;
mov.f32 %f4669, 0fB3A22168;
fma.rn.f32 %f4670, %f4666, %f4669, %f4668;
mov.f32 %f4671, 0fA7C234C5;
fma.rn.f32 %f5925, %f4666, %f4671, %f4670;
abs.f32 %f1908, %f5402;
setp.ltu.f32 %p1381, %f1908, 0f47CE4780;
@%p1381 bra $L__BB0_1637;
setp.eq.f32 %p1382, %f1908, 0f7F800000;
@%p1382 bra $L__BB0_1636;
bra.uni $L__BB0_1631;
$L__BB0_1636:
mov.f32 %f4674, 0f00000000;
mul.rn.f32 %f5925, %f5402, %f4674;
mov.u32 %r8406, 0;
bra.uni $L__BB0_1637;
$L__BB0_1631:
mov.b32 %r2141, %f5402;
shr.u32 %r6976, %r2141, 23;
and.b32 %r6977, %r6976, 255;
add.s32 %r2142, %r6977, -128;
shl.b32 %r6978, %r2141, 8;
or.b32 %r2143, %r6978, -2147483648;
shr.u32 %r2144, %r2142, 5;
mov.u64 %rd2738, 0;
mov.u32 %r8403, 0;
mov.u64 %rd2736, __cudart_i2opi_f;
mov.u64 %rd2737, %rd1;
$L__BB0_1632:
.pragma "nounroll";
ld.global.nc.u32 %r6979, [%rd2736];
mad.wide.u32 %rd2275, %r6979, %r2143, %rd2738;
shr.u64 %rd2738, %rd2275, 32;
st.local.u32 [%rd2737], %rd2275;
add.s64 %rd2737, %rd2737, 4;
add.s64 %rd2736, %rd2736, 4;
add.s32 %r8403, %r8403, 1;
setp.ne.s32 %p1383, %r8403, 6;
@%p1383 bra $L__BB0_1632;
st.local.u32 [%rd4], %rd2738;
mov.u32 %r6980, 4;
sub.s32 %r2147, %r6980, %r2144;
mov.u32 %r6981, 6;
sub.s32 %r6982, %r6981, %r2144;
mul.wide.s32 %rd2276, %r6982, 4;
add.s64 %rd2277, %rd1, %rd2276;
ld.local.u32 %r8404, [%rd2277];
ld.local.u32 %r8405, [%rd2277+-4];
and.b32 %r2150, %r2142, 31;
setp.eq.s32 %p1384, %r2150, 0;
@%p1384 bra $L__BB0_1635;
mov.u32 %r6983, 32;
sub.s32 %r6984, %r6983, %r2150;
shr.u32 %r6985, %r8405, %r6984;
shl.b32 %r6986, %r8404, %r2150;
add.s32 %r8404, %r6985, %r6986;
mul.wide.s32 %rd2278, %r2147, 4;
add.s64 %rd2279, %rd1, %rd2278;
ld.local.u32 %r6987, [%rd2279];
shr.u32 %r6988, %r6987, %r6984;
shl.b32 %r6989, %r8405, %r2150;
add.s32 %r8405, %r6988, %r6989;
$L__BB0_1635:
and.b32 %r6990, %r2141, -2147483648;
shr.u32 %r6991, %r8405, 30;
shl.b32 %r6992, %r8404, 2;
or.b32 %r6993, %r6991, %r6992;
shr.u32 %r6994, %r6993, 31;
shr.u32 %r6995, %r8404, 30;
add.s32 %r6996, %r6994, %r6995;
neg.s32 %r6997, %r6996;
setp.eq.s32 %p1385, %r6990, 0;
selp.b32 %r8406, %r6996, %r6997, %p1385;
setp.ne.s32 %p1386, %r6994, 0;
xor.b32 %r6998, %r6990, -2147483648;
selp.b32 %r6999, %r6998, %r6990, %p1386;
selp.b32 %r7000, -1, 0, %p1386;
xor.b32 %r7001, %r6993, %r7000;
shl.b32 %r7002, %r8405, 2;
xor.b32 %r7003, %r7002, %r7000;
cvt.u64.u32 %rd2280, %r7001;
cvt.u64.u32 %rd2281, %r7003;
bfi.b64 %rd2282, %rd2280, %rd2281, 32, 32;
cvt.rn.f64.s64 %fd219, %rd2282;
mul.f64 %fd220, %fd219, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4672, %fd220;
setp.eq.s32 %p1387, %r6999, 0;
neg.f32 %f4673, %f4672;
selp.f32 %f5925, %f4672, %f4673, %p1387;
$L__BB0_1637:
add.s32 %r2157, %r8406, 1;
and.b32 %r2158, %r2157, 1;
setp.eq.s32 %p1388, %r2158, 0;
selp.f32 %f1912, %f5925, 0f3F800000, %p1388;
mul.rn.f32 %f1913, %f5925, %f5925;
mov.f32 %f5926, 0fB94D4153;
@%p1388 bra $L__BB0_1639;
mov.f32 %f4676, 0fBAB607ED;
mov.f32 %f4677, 0f37CBAC00;
fma.rn.f32 %f5926, %f4677, %f1913, %f4676;
$L__BB0_1639:
selp.f32 %f4678, 0f3C0885E4, 0f3D2AAABB, %p1388;
fma.rn.f32 %f4679, %f5926, %f1913, %f4678;
selp.f32 %f4680, 0fBE2AAAA8, 0fBEFFFFFF, %p1388;
fma.rn.f32 %f4681, %f4679, %f1913, %f4680;
mov.f32 %f4682, 0f00000000;
fma.rn.f32 %f4683, %f1913, %f1912, %f4682;
fma.rn.f32 %f5283, %f4681, %f4683, %f1912;
and.b32 %r7005, %r2157, 2;
setp.eq.s32 %p1390, %r7005, 0;
@%p1390 bra $L__BB0_1641;
mov.f32 %f4685, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f4685, %f4682;
$L__BB0_1641:
selp.f32 %f1920, %f5283, %f5284, %p30;
selp.f32 %f1921, %f5281, %f5282, %p30;
@%p1369 bra $L__BB0_1643;
add.f32 %f5991, %f1921, %f1920;
mov.f32 %f5282, %f5281;
mov.f32 %f5284, %f5283;
$L__BB0_1643:
@%p1203 bra $L__BB0_1865;
shl.b32 %r7006, %r12, 5;
mov.u32 %r7007, -32;
sub.s32 %r2159, %r7007, %r7006;
setp.ge.s32 %p1394, %r11, %r2159;
@%p1394 bra $L__BB0_1657;
mul.f32 %f4688, %f5409, 0f3F22F983;
cvt.rni.s32.f32 %r8410, %f4688;
cvt.rn.f32.s32 %f4689, %r8410;
mov.f32 %f4690, 0fBFC90FDA;
fma.rn.f32 %f4691, %f4689, %f4690, %f5409;
mov.f32 %f4692, 0fB3A22168;
fma.rn.f32 %f4693, %f4689, %f4692, %f4691;
mov.f32 %f4694, 0fA7C234C5;
fma.rn.f32 %f5934, %f4689, %f4694, %f4693;
abs.f32 %f1929, %f5409;
setp.ltu.f32 %p1395, %f1929, 0f47CE4780;
@%p1395 bra $L__BB0_1653;
setp.eq.f32 %p1396, %f1929, 0f7F800000;
@%p1396 bra $L__BB0_1652;
bra.uni $L__BB0_1647;
$L__BB0_1652:
mov.f32 %f4697, 0f00000000;
mul.rn.f32 %f5934, %f5409, %f4697;
mov.u32 %r8410, 0;
bra.uni $L__BB0_1653;
$L__BB0_1647:
mov.b32 %r2161, %f5409;
shr.u32 %r7009, %r2161, 23;
and.b32 %r7010, %r7009, 255;
add.s32 %r2162, %r7010, -128;
shl.b32 %r7011, %r2161, 8;
or.b32 %r2163, %r7011, -2147483648;
shr.u32 %r2164, %r2162, 5;
mov.u64 %rd2741, 0;
mov.u32 %r8407, 0;
mov.u64 %rd2739, __cudart_i2opi_f;
mov.u64 %rd2740, %rd1;
$L__BB0_1648:
.pragma "nounroll";
ld.global.nc.u32 %r7012, [%rd2739];
mad.wide.u32 %rd2285, %r7012, %r2163, %rd2741;
shr.u64 %rd2741, %rd2285, 32;
st.local.u32 [%rd2740], %rd2285;
add.s64 %rd2740, %rd2740, 4;
add.s64 %rd2739, %rd2739, 4;
add.s32 %r8407, %r8407, 1;
setp.ne.s32 %p1397, %r8407, 6;
@%p1397 bra $L__BB0_1648;
st.local.u32 [%rd4], %rd2741;
mov.u32 %r7013, 4;
sub.s32 %r2167, %r7013, %r2164;
mov.u32 %r7014, 6;
sub.s32 %r7015, %r7014, %r2164;
mul.wide.s32 %rd2286, %r7015, 4;
add.s64 %rd2287, %rd1, %rd2286;
ld.local.u32 %r8408, [%rd2287];
ld.local.u32 %r8409, [%rd2287+-4];
and.b32 %r2170, %r2162, 31;
setp.eq.s32 %p1398, %r2170, 0;
@%p1398 bra $L__BB0_1651;
mov.u32 %r7016, 32;
sub.s32 %r7017, %r7016, %r2170;
shr.u32 %r7018, %r8409, %r7017;
shl.b32 %r7019, %r8408, %r2170;
add.s32 %r8408, %r7018, %r7019;
mul.wide.s32 %rd2288, %r2167, 4;
add.s64 %rd2289, %rd1, %rd2288;
ld.local.u32 %r7020, [%rd2289];
shr.u32 %r7021, %r7020, %r7017;
shl.b32 %r7022, %r8409, %r2170;
add.s32 %r8409, %r7021, %r7022;
$L__BB0_1651:
and.b32 %r7023, %r2161, -2147483648;
shr.u32 %r7024, %r8409, 30;
shl.b32 %r7025, %r8408, 2;
or.b32 %r7026, %r7024, %r7025;
shr.u32 %r7027, %r7026, 31;
shr.u32 %r7028, %r8408, 30;
add.s32 %r7029, %r7027, %r7028;
neg.s32 %r7030, %r7029;
setp.eq.s32 %p1399, %r7023, 0;
selp.b32 %r8410, %r7029, %r7030, %p1399;
setp.ne.s32 %p1400, %r7027, 0;
xor.b32 %r7031, %r7023, -2147483648;
selp.b32 %r7032, %r7031, %r7023, %p1400;
selp.b32 %r7033, -1, 0, %p1400;
xor.b32 %r7034, %r7026, %r7033;
shl.b32 %r7035, %r8409, 2;
xor.b32 %r7036, %r7035, %r7033;
cvt.u64.u32 %rd2290, %r7034;
cvt.u64.u32 %rd2291, %r7036;
bfi.b64 %rd2292, %rd2290, %rd2291, 32, 32;
cvt.rn.f64.s64 %fd221, %rd2292;
mul.f64 %fd222, %fd221, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4695, %fd222;
setp.eq.s32 %p1401, %r7032, 0;
neg.f32 %f4696, %f4695;
selp.f32 %f5934, %f4695, %f4696, %p1401;
$L__BB0_1653:
and.b32 %r2177, %r8410, 1;
setp.eq.s32 %p1402, %r2177, 0;
selp.f32 %f1933, %f5934, 0f3F800000, %p1402;
mul.rn.f32 %f1934, %f5934, %f5934;
mov.f32 %f5935, 0fB94D4153;
@%p1402 bra $L__BB0_1655;
mov.f32 %f4699, 0fBAB607ED;
mov.f32 %f4700, 0f37CBAC00;
fma.rn.f32 %f5935, %f4700, %f1934, %f4699;
$L__BB0_1655:
selp.f32 %f4701, 0f3C0885E4, 0f3D2AAABB, %p1402;
fma.rn.f32 %f4702, %f5935, %f1934, %f4701;
selp.f32 %f4703, 0fBE2AAAA8, 0fBEFFFFFF, %p1402;
fma.rn.f32 %f4704, %f4702, %f1934, %f4703;
mov.f32 %f4705, 0f00000000;
fma.rn.f32 %f4706, %f1934, %f1933, %f4705;
fma.rn.f32 %f5281, %f4704, %f4706, %f1933;
and.b32 %r7038, %r8410, 2;
setp.eq.s32 %p1404, %r7038, 0;
@%p1404 bra $L__BB0_1657;
mov.f32 %f4708, 0fBF800000;
fma.rn.f32 %f5281, %f5281, %f4708, %f4705;
$L__BB0_1657:
setp.lt.s32 %p1406, %r11, %r2159;
selp.f32 %f1941, %f5281, %f5282, %p1406;
@%p1394 bra $L__BB0_1670;
mul.f32 %f4709, %f5401, 0f3F22F983;
cvt.rni.s32.f32 %r8414, %f4709;
cvt.rn.f32.s32 %f4710, %r8414;
mov.f32 %f4711, 0fBFC90FDA;
fma.rn.f32 %f4712, %f4710, %f4711, %f5401;
mov.f32 %f4713, 0fB3A22168;
fma.rn.f32 %f4714, %f4710, %f4713, %f4712;
mov.f32 %f4715, 0fA7C234C5;
fma.rn.f32 %f5938, %f4710, %f4715, %f4714;
abs.f32 %f1943, %f5401;
setp.ltu.f32 %p1407, %f1943, 0f47CE4780;
@%p1407 bra $L__BB0_1666;
setp.eq.f32 %p1408, %f1943, 0f7F800000;
@%p1408 bra $L__BB0_1665;
bra.uni $L__BB0_1660;
$L__BB0_1665:
mov.f32 %f4718, 0f00000000;
mul.rn.f32 %f5938, %f5401, %f4718;
mov.u32 %r8414, 0;
bra.uni $L__BB0_1666;
$L__BB0_1660:
mov.b32 %r2179, %f5401;
shr.u32 %r7040, %r2179, 23;
and.b32 %r7041, %r7040, 255;
add.s32 %r2180, %r7041, -128;
shl.b32 %r7042, %r2179, 8;
or.b32 %r2181, %r7042, -2147483648;
shr.u32 %r2182, %r2180, 5;
mov.u64 %rd2744, 0;
mov.u32 %r8411, 0;
mov.u64 %rd2742, __cudart_i2opi_f;
mov.u64 %rd2743, %rd1;
$L__BB0_1661:
.pragma "nounroll";
ld.global.nc.u32 %r7043, [%rd2742];
mad.wide.u32 %rd2295, %r7043, %r2181, %rd2744;
shr.u64 %rd2744, %rd2295, 32;
st.local.u32 [%rd2743], %rd2295;
add.s64 %rd2743, %rd2743, 4;
add.s64 %rd2742, %rd2742, 4;
add.s32 %r8411, %r8411, 1;
setp.ne.s32 %p1409, %r8411, 6;
@%p1409 bra $L__BB0_1661;
st.local.u32 [%rd4], %rd2744;
mov.u32 %r7044, 4;
sub.s32 %r2185, %r7044, %r2182;
mov.u32 %r7045, 6;
sub.s32 %r7046, %r7045, %r2182;
mul.wide.s32 %rd2296, %r7046, 4;
add.s64 %rd2297, %rd1, %rd2296;
ld.local.u32 %r8412, [%rd2297];
ld.local.u32 %r8413, [%rd2297+-4];
and.b32 %r2188, %r2180, 31;
setp.eq.s32 %p1410, %r2188, 0;
@%p1410 bra $L__BB0_1664;
mov.u32 %r7047, 32;
sub.s32 %r7048, %r7047, %r2188;
shr.u32 %r7049, %r8413, %r7048;
shl.b32 %r7050, %r8412, %r2188;
add.s32 %r8412, %r7049, %r7050;
mul.wide.s32 %rd2298, %r2185, 4;
add.s64 %rd2299, %rd1, %rd2298;
ld.local.u32 %r7051, [%rd2299];
shr.u32 %r7052, %r7051, %r7048;
shl.b32 %r7053, %r8413, %r2188;
add.s32 %r8413, %r7052, %r7053;
$L__BB0_1664:
and.b32 %r7054, %r2179, -2147483648;
shr.u32 %r7055, %r8413, 30;
shl.b32 %r7056, %r8412, 2;
or.b32 %r7057, %r7055, %r7056;
shr.u32 %r7058, %r7057, 31;
shr.u32 %r7059, %r8412, 30;
add.s32 %r7060, %r7058, %r7059;
neg.s32 %r7061, %r7060;
setp.eq.s32 %p1411, %r7054, 0;
selp.b32 %r8414, %r7060, %r7061, %p1411;
setp.ne.s32 %p1412, %r7058, 0;
xor.b32 %r7062, %r7054, -2147483648;
selp.b32 %r7063, %r7062, %r7054, %p1412;
selp.b32 %r7064, -1, 0, %p1412;
xor.b32 %r7065, %r7057, %r7064;
shl.b32 %r7066, %r8413, 2;
xor.b32 %r7067, %r7066, %r7064;
cvt.u64.u32 %rd2300, %r7065;
cvt.u64.u32 %rd2301, %r7067;
bfi.b64 %rd2302, %rd2300, %rd2301, 32, 32;
cvt.rn.f64.s64 %fd223, %rd2302;
mul.f64 %fd224, %fd223, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f4716, %fd224;
setp.eq.s32 %p1413, %r7063, 0;
neg.f32 %f4717, %f4716;
selp.f32 %f5938, %f4716, %f4717, %p1413;
$L__BB0_1666:
add.s32 %r2195, %r8414, 1;
and.b32 %r2196, %r2195, 1;
setp.eq.s32 %p1414, %r2196, 0;
selp.f32 %f1947, %f5938, 0f3F800000, %p1414;
mul.rn.f32 %f1948, %f5938, %f5938;
mov.f32 %f5939, 0fB94D4153;
@%p1414 bra $L__BB0_1668;
mov.f32 %f4720, 0fBAB607ED;
mov.f32 %f4721, 0f37CBAC00;
fma.rn.f32 %f5939, %f4721, %f1948, %f4720;
$L__BB0_1668:
selp.f32 %f4722, 0f3C0885E4, 0f3D2AAABB, %p1414;
fma.rn.f32 %f4723, %f5939, %f1948, %f4722;
selp.f32 %f4724, 0fBE2AAAA8, 0fBEFFFFFF, %p1414;
fma.rn.f32 %f4725, %f4723, %f1948, %f4724;
mov.f32 %f4726, 0f00000000;
fma.rn.f32 %f4727, %f1948, %f1947, %f4726;
fma.rn.f32 %f5283, %f4725, %f4727, %f1947;
and.b32 %r7069, %r2195, 2;
setp.eq.s32 %p1416, %r7069, 0;
@%p1416 bra $L__BB0_1670;
mov.f32 %f4729, 0fBF800000;
fma.rn.f32 %f5283, %f5283, %f4729, %f4726;
$L__BB0_1670:
selp.f32 %f1955, %f5283, %f5284, %p1406;
@%p1394 bra $L__BB0_1865;
add.f32 %f5990, %f1941, %f1955;
$L__BB0_1865:
shl.b32 %r2493, %r12, 1;
setp.lt.s32 %p1579, %r2493, 3;
and.pred %p1581, %p33, %p1579;
@%p1581 bra $L__BB0_1868;
bra.uni $L__BB0_1866;
$L__BB0_1868:
add.f32 %f5081, %f5400, 0f00000000;
add.f32 %f5082, %f5081, %f5399;
add.f32 %f5083, %f5082, %f5398;
add.f32 %f5084, %f5083, %f5397;
add.f32 %f5085, %f5084, %f5396;
add.f32 %f5086, %f5085, %f5395;
add.f32 %f5087, %f5086, %f5394;
add.f32 %f5998, %f5087, %f5393;
bra.uni $L__BB0_1869;
$L__BB0_1866:
setp.lt.s32 %p1582, %r2493, 15;
shl.b32 %r7628, %r12, 6;
neg.s32 %r7629, %r7628;
setp.lt.s32 %p1583, %r11, %r7629;
and.pred %p1584, %p1582, %p1583;
add.f32 %f5067, %f5400, 0f00000000;
selp.f32 %f5068, %f5067, 0f00000000, %p1584;
mov.u32 %r7630, -32;
sub.s32 %r2494, %r7630, %r7628;
setp.lt.s32 %p1585, %r11, %r2494;
add.f32 %f5069, %f5068, %f5399;
and.pred %p1586, %p1582, %p1585;
selp.f32 %f5070, %f5069, %f5068, %p1586;
add.f32 %f5071, %f5070, %f5398;
setp.lt.s32 %p1587, %r2493, 14;
and.pred %p1588, %p1587, %p1583;
selp.f32 %f5072, %f5071, %f5070, %p1588;
add.f32 %f5073, %f5072, %f5397;
and.pred %p1589, %p1587, %p1585;
selp.f32 %f5074, %f5073, %f5072, %p1589;
add.s32 %r7631, %r2493, 2;
setp.lt.s32 %p1590, %r7631, 15;
add.f32 %f5075, %f5074, %f5396;
and.pred %p1591, %p1590, %p1583;
selp.f32 %f5076, %f5075, %f5074, %p1591;
add.f32 %f5077, %f5076, %f5395;
and.pred %p1592, %p1590, %p1585;
selp.f32 %f5078, %f5077, %f5076, %p1592;
add.s32 %r7632, %r2493, 3;
setp.gt.s32 %p1593, %r7632, 14;
setp.lt.s32 %p1594, %r7632, 15;
add.f32 %f5079, %f5078, %f5394;
and.pred %p1595, %p1594, %p1583;
selp.f32 %f5998, %f5079, %f5078, %p1595;
@%p1593 bra $L__BB0_1869;
add.f32 %f5080, %f5998, %f5393;
selp.f32 %f5998, %f5080, %f5998, %p1585;
$L__BB0_1869:
setp.lt.s32 %p1597, %r2493, 2;
and.pred %p1599, %p33, %p1597;
@%p1599 bra $L__BB0_1872;
bra.uni $L__BB0_1870;
$L__BB0_1872:
add.f32 %f5102, %f5998, %f5599;
add.f32 %f5103, %f5102, %f5598;
add.f32 %f5104, %f5103, %f5597;
add.f32 %f5105, %f5104, %f5596;
add.f32 %f5106, %f5105, %f5595;
add.f32 %f5107, %f5106, %f5594;
add.f32 %f5108, %f5107, %f5593;
add.f32 %f5999, %f5108, %f5592;
bra.uni $L__BB0_1873;
$L__BB0_1870:
add.s32 %r7633, %r2493, 4;
setp.lt.s32 %p1600, %r7633, 15;
shl.b32 %r7634, %r12, 6;
neg.s32 %r7635, %r7634;
setp.lt.s32 %p1601, %r11, %r7635;
and.pred %p1602, %p1600, %p1601;
add.f32 %f5088, %f5998, %f5599;
selp.f32 %f5089, %f5088, %f5998, %p1602;
mov.u32 %r7636, -32;
sub.s32 %r2495, %r7636, %r7634;
setp.lt.s32 %p1603, %r11, %r2495;
add.f32 %f5090, %f5089, %f5598;
and.pred %p1604, %p1600, %p1603;
selp.f32 %f5091, %f5090, %f5089, %p1604;
add.s32 %r7637, %r2493, 5;
setp.lt.s32 %p1605, %r7637, 15;
add.f32 %f5092, %f5091, %f5597;
and.pred %p1606, %p1605, %p1601;
selp.f32 %f5093, %f5092, %f5091, %p1606;
add.f32 %f5094, %f5093, %f5596;
and.pred %p1607, %p1605, %p1603;
selp.f32 %f5095, %f5094, %f5093, %p1607;
add.s32 %r7638, %r2493, 6;
setp.lt.s32 %p1608, %r7638, 15;
add.f32 %f5096, %f5095, %f5595;
and.pred %p1609, %p1608, %p1601;
selp.f32 %f5097, %f5096, %f5095, %p1609;
add.f32 %f5098, %f5097, %f5594;
and.pred %p1610, %p1608, %p1603;
selp.f32 %f5099, %f5098, %f5097, %p1610;
add.s32 %r7639, %r2493, 7;
setp.gt.s32 %p1611, %r7639, 14;
setp.lt.s32 %p1612, %r7639, 15;
add.f32 %f5100, %f5099, %f5593;
and.pred %p1613, %p1612, %p1601;
selp.f32 %f5999, %f5100, %f5099, %p1613;
@%p1611 bra $L__BB0_1873;
add.f32 %f5101, %f5999, %f5592;
selp.f32 %f5999, %f5101, %f5999, %p1603;
$L__BB0_1873:
setp.lt.s32 %p1615, %r2493, 1;
and.pred %p1617, %p33, %p1615;
@%p1617 bra $L__BB0_1876;
bra.uni $L__BB0_1874;
$L__BB0_1876:
add.f32 %f5123, %f5999, %f5798;
add.f32 %f5124, %f5123, %f5797;
add.f32 %f5125, %f5124, %f5796;
add.f32 %f5126, %f5125, %f5795;
add.f32 %f5127, %f5126, %f5794;
add.f32 %f5128, %f5127, %f5793;
add.f32 %f5129, %f5128, %f5792;
add.f32 %f6000, %f5129, %f5791;
bra.uni $L__BB0_1877;
$L__BB0_1874:
add.s32 %r7640, %r2493, 8;
setp.lt.s32 %p1618, %r7640, 15;
shl.b32 %r7641, %r12, 6;
neg.s32 %r7642, %r7641;
setp.lt.s32 %p1619, %r11, %r7642;
and.pred %p1620, %p1618, %p1619;
add.f32 %f5109, %f5999, %f5798;
selp.f32 %f5110, %f5109, %f5999, %p1620;
mov.u32 %r7643, -32;
sub.s32 %r2496, %r7643, %r7641;
setp.lt.s32 %p1621, %r11, %r2496;
add.f32 %f5111, %f5110, %f5797;
and.pred %p1622, %p1618, %p1621;
selp.f32 %f5112, %f5111, %f5110, %p1622;
add.s32 %r7644, %r2493, 9;
setp.lt.s32 %p1623, %r7644, 15;
add.f32 %f5113, %f5112, %f5796;
and.pred %p1624, %p1623, %p1619;
selp.f32 %f5114, %f5113, %f5112, %p1624;
add.f32 %f5115, %f5114, %f5795;
and.pred %p1625, %p1623, %p1621;
selp.f32 %f5116, %f5115, %f5114, %p1625;
add.s32 %r7645, %r2493, 10;
setp.lt.s32 %p1626, %r7645, 15;
add.f32 %f5117, %f5116, %f5794;
and.pred %p1627, %p1626, %p1619;
selp.f32 %f5118, %f5117, %f5116, %p1627;
add.f32 %f5119, %f5118, %f5793;
and.pred %p1628, %p1626, %p1621;
selp.f32 %f5120, %f5119, %f5118, %p1628;
add.s32 %r7646, %r2493, 11;
setp.gt.s32 %p1629, %r7646, 14;
setp.lt.s32 %p1630, %r7646, 15;
add.f32 %f5121, %f5120, %f5792;
and.pred %p1631, %p1630, %p1619;
selp.f32 %f6000, %f5121, %f5120, %p1631;
@%p1629 bra $L__BB0_1877;
add.f32 %f5122, %f6000, %f5791;
selp.f32 %f6000, %f5122, %f6000, %p1621;
$L__BB0_1877:
setp.lt.s32 %p1633, %r2493, 0;
and.pred %p1635, %p33, %p1633;
@%p1635 bra $L__BB0_1880;
bra.uni $L__BB0_1878;
$L__BB0_1880:
add.f32 %f5144, %f6000, %f5997;
add.f32 %f5145, %f5144, %f5996;
add.f32 %f5146, %f5145, %f5995;
add.f32 %f5147, %f5146, %f5994;
add.f32 %f5148, %f5147, %f5993;
add.f32 %f5149, %f5148, %f5992;
add.f32 %f5150, %f5149, %f5991;
add.f32 %f6001, %f5150, %f5990;
bra.uni $L__BB0_1881;
$L__BB0_1878:
add.s32 %r7647, %r2493, 12;
setp.lt.s32 %p1636, %r7647, 15;
shl.b32 %r7648, %r12, 6;
neg.s32 %r7649, %r7648;
setp.lt.s32 %p1637, %r11, %r7649;
and.pred %p1638, %p1636, %p1637;
add.f32 %f5130, %f6000, %f5997;
selp.f32 %f5131, %f5130, %f6000, %p1638;
mov.u32 %r7650, -32;
sub.s32 %r2497, %r7650, %r7648;
setp.lt.s32 %p1639, %r11, %r2497;
add.f32 %f5132, %f5131, %f5996;
and.pred %p1640, %p1636, %p1639;
selp.f32 %f5133, %f5132, %f5131, %p1640;
add.s32 %r7651, %r2493, 13;
setp.lt.s32 %p1641, %r7651, 15;
add.f32 %f5134, %f5133, %f5995;
and.pred %p1642, %p1641, %p1637;
selp.f32 %f5135, %f5134, %f5133, %p1642;
add.f32 %f5136, %f5135, %f5994;
and.pred %p1643, %p1641, %p1639;
selp.f32 %f5137, %f5136, %f5135, %p1643;
add.s32 %r7652, %r2493, 14;
setp.lt.s32 %p1644, %r7652, 15;
add.f32 %f5138, %f5137, %f5993;
and.pred %p1645, %p1644, %p1637;
selp.f32 %f5139, %f5138, %f5137, %p1645;
add.f32 %f5140, %f5139, %f5992;
and.pred %p1646, %p1644, %p1639;
selp.f32 %f5141, %f5140, %f5139, %p1646;
add.s32 %r7653, %r2493, 15;
setp.gt.s32 %p1647, %r7653, 14;
setp.lt.s32 %p1648, %r7653, 15;
add.f32 %f5142, %f5141, %f5991;
and.pred %p1649, %p1648, %p1637;
selp.f32 %f6001, %f5142, %f5141, %p1649;
@%p1647 bra $L__BB0_1881;
add.f32 %f5143, %f6001, %f5990;
selp.f32 %f6001, %f5143, %f6001, %p1639;
$L__BB0_1881:
shl.b32 %r2498, %r12, 2;
mov.u32 %r7654, %tid.z;
or.b32 %r2499, %r1, %r7654;
mov.u32 %r7655, %ntid.z;
mov.u32 %r2500, %ntid.x;
mul.lo.s32 %r2501, %r2500, %r7655;
mul.lo.s32 %r7656, %r7654, %r2500;
add.s32 %r2502, %r7656, %r1;
mov.u32 %r2503, %tid.y;
mul.lo.s32 %r2504, %r2503, %r2500;
add.s32 %r7657, %r2504, %r1;
mov.u32 %r2505, %ntid.y;
mad.lo.s32 %r2506, %r7656, %r2505, %r7657;
mul.wide.u32 %rd2489, %r2506, 4;
mov.u64 %rd2490, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_160115arrayE;
add.s64 %rd581, %rd2490, %rd2489;
st.shared.f32 [%rd581], %f6001;
bar.sync 0;
clz.b32 %r7658, %r2501;
mov.u32 %r7659, 31;
sub.s32 %r7660, %r7659, %r7658;
mov.u32 %r7661, 1;
shl.b32 %r8480, %r7661, %r7660;
setp.ge.u32 %p1651, %r2502, %r8480;
add.s32 %r2508, %r8480, %r2502;
setp.ge.u32 %p1652, %r2508, %r2501;
or.pred %p1653, %p1651, %p1652;
@%p1653 bra $L__BB0_1885;
add.s32 %r8479, %r8480, %r2506;
setp.lt.u32 %p1654, %r2505, 2;
@%p1654 bra $L__BB0_1884;
rem.u32 %r7662, %r2508, %r2500;
add.s32 %r7663, %r7662, %r2504;
sub.s32 %r7664, %r2508, %r7662;
mad.lo.s32 %r8479, %r7664, %r2505, %r7663;
$L__BB0_1884:
mul.wide.s32 %rd2491, %r8479, 4;
add.s64 %rd2493, %rd2490, %rd2491;
ld.shared.f32 %f5151, [%rd581];
ld.shared.f32 %f5152, [%rd2493];
add.f32 %f5153, %f5152, %f5151;
st.shared.f32 [%rd581], %f5153;
$L__BB0_1885:
bar.sync 0;
setp.lt.s32 %p1655, %r8480, 4;
@%p1655 bra $L__BB0_1891;
$L__BB0_1886:
shr.u32 %r2513, %r8480, 1;
setp.ge.u32 %p1656, %r2502, %r2513;
@%p1656 bra $L__BB0_1890;
setp.lt.u32 %p1657, %r2505, 2;
add.s32 %r8481, %r2513, %r2506;
@%p1657 bra $L__BB0_1889;
add.s32 %r7665, %r2513, %r2502;
rem.u32 %r7666, %r7665, %r2500;
add.s32 %r7667, %r7666, %r2504;
sub.s32 %r7668, %r7665, %r7666;
mad.lo.s32 %r8481, %r7668, %r2505, %r7667;
$L__BB0_1889:
mul.wide.s32 %rd2494, %r8481, 4;
add.s64 %rd2496, %rd2490, %rd2494;
ld.shared.f32 %f5154, [%rd581];
ld.shared.f32 %f5155, [%rd2496];
add.f32 %f5156, %f5155, %f5154;
st.shared.f32 [%rd581], %f5156;
$L__BB0_1890:
bar.sync 0;
setp.gt.u32 %p1658, %r8480, 7;
mov.u32 %r8480, %r2513;
@%p1658 bra $L__BB0_1886;
$L__BB0_1891:
setp.ne.s32 %p1659, %r2499, 0;
mov.f32 %f6002, 0f00000000;
@%p1659 bra $L__BB0_1894;
ld.shared.f32 %f5158, [%rd581];
add.f32 %f6002, %f5158, 0f00000000;
setp.lt.u32 %p1660, %r2501, 2;
@%p1660 bra $L__BB0_1894;
add.s32 %r7669, %r2506, 1;
mul.wide.u32 %rd2497, %r7669, 4;
add.s64 %rd2499, %rd2490, %rd2497;
ld.shared.f32 %f5159, [%rd2499];
add.f32 %f6002, %f6002, %f5159;
$L__BB0_1894:
bar.sync 0;
mul.wide.s32 %rd2500, %r2503, 4;
add.s64 %rd582, %rd2490, %rd2500;
setp.eq.s32 %p1661, %r2499, 0;
@%p1661 bra $L__BB0_1895;
bra.uni $L__BB0_1896;
$L__BB0_1895:
st.shared.f32 [%rd582], %f6002;
$L__BB0_1896:
ld.param.u64 %rd2504, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_368_cu_6bbe2216_1601111nvfuser_368ENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_2];
mov.u32 %r7784, %ctaid.x;
bar.sync 0;
ld.shared.f32 %f2200, [%rd582];
bar.sync 0;
setp.lt.s32 %p1663, %r2498, 3;
and.pred %p1664, %p33, %p1663;
mad.lo.s32 %r7671, %r7784, 63, %r1;
shl.b32 %r7672, %r12, 7;
add.s32 %r7673, %r7671, %r7672;
cvta.to.global.u64 %rd2502, %rd2504;
mul.wide.s32 %rd2503, %r7673, 4;
add.s64 %rd583, %rd2502, %rd2503;
@%p1664 bra $L__BB0_1925;
bra.uni $L__BB0_1897;
$L__BB0_1925:
add.f32 %f5165, %f2200, %f5400;
st.global.f32 [%rd583], %f5165;
add.f32 %f5166, %f2200, %f5399;
st.global.f32 [%rd583+128], %f5166;
add.f32 %f5167, %f2200, %f5398;
st.global.f32 [%rd583+10584], %f5167;
add.f32 %f5168, %f2200, %f5397;
st.global.f32 [%rd583+10712], %f5168;
add.f32 %f5169, %f2200, %f5396;
st.global.f32 [%rd583+21168], %f5169;
add.f32 %f5170, %f2200, %f5395;
st.global.f32 [%rd583+21296], %f5170;
add.f32 %f5171, %f2200, %f5394;
st.global.f32 [%rd583+31752], %f5171;
add.f32 %f5172, %f2200, %f5393;
st.global.f32 [%rd583+31880], %f5172;
bra.uni $L__BB0_1926;
$L__BB0_1897:
setp.gt.s32 %p1665, %r2498, 14;
@%p1665 bra $L__BB0_1899;
add.f32 %f6034, %f2200, %f5400;
add.f32 %f6033, %f2200, %f5399;
$L__BB0_1899:
setp.gt.s32 %p1666, %r2498, 13;
@%p1666 bra $L__BB0_1901;
add.f32 %f6016, %f2200, %f5398;
add.f32 %f6015, %f2200, %f5397;
$L__BB0_1901:
setp.gt.s32 %p1667, %r2498, 12;
@%p1667 bra $L__BB0_1903;
add.f32 %f6014, %f2200, %f5396;
add.f32 %f6013, %f2200, %f5395;
$L__BB0_1903:
setp.gt.s32 %p1668, %r2498, 11;
@%p1668 bra $L__BB0_1905;
add.f32 %f6012, %f2200, %f5394;
add.f32 %f6011, %f2200, %f5393;
$L__BB0_1905:
@%p1665 bra $L__BB0_1910;
neg.s32 %r7674, %r7672;
setp.ge.s32 %p1670, %r11, %r7674;
@%p1670 bra $L__BB0_1908;
st.global.f32 [%rd583], %f6034;
$L__BB0_1908:
mov.u32 %r7675, -32;
sub.s32 %r7676, %r7675, %r7672;
setp.ge.s32 %p1671, %r11, %r7676;
@%p1671 bra $L__BB0_1910;
st.global.f32 [%rd583+128], %f6033;
$L__BB0_1910:
@%p1666 bra $L__BB0_1915;
neg.s32 %r7677, %r7672;
setp.ge.s32 %p1673, %r11, %r7677;
@%p1673 bra $L__BB0_1913;
st.global.f32 [%rd583+10584], %f6016;
$L__BB0_1913:
mov.u32 %r7678, -32;
sub.s32 %r7679, %r7678, %r7672;
setp.ge.s32 %p1674, %r11, %r7679;
@%p1674 bra $L__BB0_1915;
st.global.f32 [%rd583+10712], %f6015;
$L__BB0_1915:
@%p1667 bra $L__BB0_1920;
neg.s32 %r7680, %r7672;
setp.ge.s32 %p1676, %r11, %r7680;
@%p1676 bra $L__BB0_1918;
st.global.f32 [%rd583+21168], %f6014;
$L__BB0_1918:
mov.u32 %r7681, -32;
sub.s32 %r7682, %r7681, %r7672;
setp.ge.s32 %p1677, %r11, %r7682;
@%p1677 bra $L__BB0_1920;
st.global.f32 [%rd583+21296], %f6013;
$L__BB0_1920:
@%p1668 bra $L__BB0_1926;
neg.s32 %r7683, %r7672;
setp.ge.s32 %p1679, %r11, %r7683;
@%p1679 bra $L__BB0_1923;
st.global.f32 [%rd583+31752], %f6012;
$L__BB0_1923:
mov.u32 %r7684, -32;
sub.s32 %r7685, %r7684, %r7672;
setp.ge.s32 %p1680, %r11, %r7685;
@%p1680 bra $L__BB0_1926;
st.global.f32 [%rd583+31880], %f6011;
$L__BB0_1926:
setp.lt.s32 %p1681, %r2498, 2;
and.pred %p1683, %p33, %p1681;
@%p1683 bra $L__BB0_1955;
bra.uni $L__BB0_1927;
$L__BB0_1955:
add.f32 %f5181, %f2200, %f5599;
st.global.f32 [%rd583+42336], %f5181;
add.f32 %f5182, %f2200, %f5598;
st.global.f32 [%rd583+42464], %f5182;
add.f32 %f5183, %f2200, %f5597;
st.global.f32 [%rd583+52920], %f5183;
add.f32 %f5184, %f2200, %f5596;
st.global.f32 [%rd583+53048], %f5184;
add.f32 %f5185, %f2200, %f5595;
st.global.f32 [%rd583+63504], %f5185;
add.f32 %f5186, %f2200, %f5594;
st.global.f32 [%rd583+63632], %f5186;
add.f32 %f5187, %f2200, %f5593;
st.global.f32 [%rd583+74088], %f5187;
add.f32 %f5188, %f2200, %f5592;
st.global.f32 [%rd583+74216], %f5188;
bra.uni $L__BB0_1956;
$L__BB0_1927:
add.s32 %r2521, %r2498, 4;
setp.gt.s32 %p1684, %r2521, 14;
@%p1684 bra $L__BB0_1929;
neg.s32 %r7687, %r7672;
setp.lt.s32 %p1685, %r11, %r7687;
add.f32 %f5173, %f2200, %f5599;
selp.f32 %f6034, %f5173, %f6034, %p1685;
mov.u32 %r7688, -32;
sub.s32 %r7689, %r7688, %r7672;
setp.lt.s32 %p1686, %r11, %r7689;
add.f32 %f5174, %f2200, %f5598;
selp.f32 %f6033, %f5174, %f6033, %p1686;
$L__BB0_1929:
add.s32 %r2522, %r2498, 5;
setp.gt.s32 %p1687, %r2522, 14;
@%p1687 bra $L__BB0_1931;
neg.s32 %r7691, %r7672;
setp.lt.s32 %p1688, %r11, %r7691;
add.f32 %f5175, %f2200, %f5597;
selp.f32 %f6016, %f5175, %f6016, %p1688;
mov.u32 %r7692, -32;
sub.s32 %r7693, %r7692, %r7672;
setp.lt.s32 %p1689, %r11, %r7693;
add.f32 %f5176, %f2200, %f5596;
selp.f32 %f6015, %f5176, %f6015, %p1689;
$L__BB0_1931:
add.s32 %r2523, %r2498, 6;
setp.gt.s32 %p1690, %r2523, 14;
@%p1690 bra $L__BB0_1933;
neg.s32 %r7695, %r7672;
setp.lt.s32 %p1691, %r11, %r7695;
add.f32 %f5177, %f2200, %f5595;
selp.f32 %f6014, %f5177, %f6014, %p1691;
mov.u32 %r7696, -32;
sub.s32 %r7697, %r7696, %r7672;
setp.lt.s32 %p1692, %r11, %r7697;
add.f32 %f5178, %f2200, %f5594;
selp.f32 %f6013, %f5178, %f6013, %p1692;
$L__BB0_1933:
add.s32 %r2524, %r2498, 7;
setp.gt.s32 %p1693, %r2524, 14;
@%p1693 bra $L__BB0_1935;
neg.s32 %r7699, %r7672;
setp.lt.s32 %p1694, %r11, %r7699;
add.f32 %f5179, %f2200, %f5593;
selp.f32 %f6012, %f5179, %f6012, %p1694;
mov.u32 %r7700, -32;
sub.s32 %r7701, %r7700, %r7672;
setp.lt.s32 %p1695, %r11, %r7701;
add.f32 %f5180, %f2200, %f5592;
selp.f32 %f6011, %f5180, %f6011, %p1695;
$L__BB0_1935:
@%p1684 bra $L__BB0_1940;
neg.s32 %r7702, %r7672;
setp.ge.s32 %p1697, %r11, %r7702;
@%p1697 bra $L__BB0_1938;
st.global.f32 [%rd583+42336], %f6034;
$L__BB0_1938:
mov.u32 %r7703, -32;
sub.s32 %r7704, %r7703, %r7672;
setp.ge.s32 %p1698, %r11, %r7704;
@%p1698 bra $L__BB0_1940;
st.global.f32 [%rd583+42464], %f6033;
$L__BB0_1940:
@%p1687 bra $L__BB0_1945;
neg.s32 %r7705, %r7672;
setp.ge.s32 %p1700, %r11, %r7705;
@%p1700 bra $L__BB0_1943;
st.global.f32 [%rd583+52920], %f6016;
$L__BB0_1943:
mov.u32 %r7706, -32;
sub.s32 %r7707, %r7706, %r7672;
setp.ge.s32 %p1701, %r11, %r7707;
@%p1701 bra $L__BB0_1945;
st.global.f32 [%rd583+53048], %f6015;
$L__BB0_1945:
@%p1690 bra $L__BB0_1950;
neg.s32 %r7708, %r7672;
setp.ge.s32 %p1703, %r11, %r7708;
@%p1703 bra $L__BB0_1948;
st.global.f32 [%rd583+63504], %f6014;
$L__BB0_1948:
mov.u32 %r7709, -32;
sub.s32 %r7710, %r7709, %r7672;
setp.ge.s32 %p1704, %r11, %r7710;
@%p1704 bra $L__BB0_1950;
st.global.f32 [%rd583+63632], %f6013;
$L__BB0_1950:
@%p1693 bra $L__BB0_1956;
neg.s32 %r7711, %r7672;
setp.ge.s32 %p1706, %r11, %r7711;
@%p1706 bra $L__BB0_1953;
st.global.f32 [%rd583+74088], %f6012;
$L__BB0_1953:
mov.u32 %r7712, -32;
sub.s32 %r7713, %r7712, %r7672;
setp.ge.s32 %p1707, %r11, %r7713;
@%p1707 bra $L__BB0_1956;
st.global.f32 [%rd583+74216], %f6011;
$L__BB0_1956:
setp.lt.s32 %p1708, %r2498, 1;
and.pred %p1710, %p33, %p1708;
@%p1710 bra $L__BB0_1985;
bra.uni $L__BB0_1957;
$L__BB0_1985:
add.f32 %f5197, %f2200, %f5798;
st.global.f32 [%rd583+84672], %f5197;
add.f32 %f5198, %f2200, %f5797;
st.global.f32 [%rd583+84800], %f5198;
add.f32 %f5199, %f2200, %f5796;
st.global.f32 [%rd583+95256], %f5199;
add.f32 %f5200, %f2200, %f5795;
st.global.f32 [%rd583+95384], %f5200;
add.f32 %f5201, %f2200, %f5794;
st.global.f32 [%rd583+105840], %f5201;
add.f32 %f5202, %f2200, %f5793;
st.global.f32 [%rd583+105968], %f5202;
add.f32 %f5203, %f2200, %f5792;
st.global.f32 [%rd583+116424], %f5203;
add.f32 %f5204, %f2200, %f5791;
st.global.f32 [%rd583+116552], %f5204;
bra.uni $L__BB0_1986;
$L__BB0_1957:
add.s32 %r2529, %r2498, 8;
setp.gt.s32 %p1711, %r2529, 14;
@%p1711 bra $L__BB0_1959;
neg.s32 %r7715, %r7672;
setp.lt.s32 %p1712, %r11, %r7715;
add.f32 %f5189, %f2200, %f5798;
selp.f32 %f6034, %f5189, %f6034, %p1712;
mov.u32 %r7716, -32;
sub.s32 %r7717, %r7716, %r7672;
setp.lt.s32 %p1713, %r11, %r7717;
add.f32 %f5190, %f2200, %f5797;
selp.f32 %f6033, %f5190, %f6033, %p1713;
$L__BB0_1959:
add.s32 %r2530, %r2498, 9;
setp.gt.s32 %p1714, %r2530, 14;
@%p1714 bra $L__BB0_1961;
neg.s32 %r7719, %r7672;
setp.lt.s32 %p1715, %r11, %r7719;
add.f32 %f5191, %f2200, %f5796;
selp.f32 %f6016, %f5191, %f6016, %p1715;
mov.u32 %r7720, -32;
sub.s32 %r7721, %r7720, %r7672;
setp.lt.s32 %p1716, %r11, %r7721;
add.f32 %f5192, %f2200, %f5795;
selp.f32 %f6015, %f5192, %f6015, %p1716;
$L__BB0_1961:
add.s32 %r2531, %r2498, 10;
setp.gt.s32 %p1717, %r2531, 14;
@%p1717 bra $L__BB0_1963;
neg.s32 %r7723, %r7672;
setp.lt.s32 %p1718, %r11, %r7723;
add.f32 %f5193, %f2200, %f5794;
selp.f32 %f6014, %f5193, %f6014, %p1718;
mov.u32 %r7724, -32;
sub.s32 %r7725, %r7724, %r7672;
setp.lt.s32 %p1719, %r11, %r7725;
add.f32 %f5194, %f2200, %f5793;
selp.f32 %f6013, %f5194, %f6013, %p1719;
$L__BB0_1963:
add.s32 %r2532, %r2498, 11;
setp.gt.s32 %p1720, %r2532, 14;
@%p1720 bra $L__BB0_1965;
neg.s32 %r7727, %r7672;
setp.lt.s32 %p1721, %r11, %r7727;
add.f32 %f5195, %f2200, %f5792;
selp.f32 %f6012, %f5195, %f6012, %p1721;
mov.u32 %r7728, -32;
sub.s32 %r7729, %r7728, %r7672;
setp.lt.s32 %p1722, %r11, %r7729;
add.f32 %f5196, %f2200, %f5791;
selp.f32 %f6011, %f5196, %f6011, %p1722;
$L__BB0_1965:
@%p1711 bra $L__BB0_1970;
neg.s32 %r7730, %r7672;
setp.ge.s32 %p1724, %r11, %r7730;
@%p1724 bra $L__BB0_1968;
st.global.f32 [%rd583+84672], %f6034;
$L__BB0_1968:
mov.u32 %r7731, -32;
sub.s32 %r7732, %r7731, %r7672;
setp.ge.s32 %p1725, %r11, %r7732;
@%p1725 bra $L__BB0_1970;
st.global.f32 [%rd583+84800], %f6033;
$L__BB0_1970:
@%p1714 bra $L__BB0_1975;
neg.s32 %r7733, %r7672;
setp.ge.s32 %p1727, %r11, %r7733;
@%p1727 bra $L__BB0_1973;
st.global.f32 [%rd583+95256], %f6016;
$L__BB0_1973:
mov.u32 %r7734, -32;
sub.s32 %r7735, %r7734, %r7672;
setp.ge.s32 %p1728, %r11, %r7735;
@%p1728 bra $L__BB0_1975;
st.global.f32 [%rd583+95384], %f6015;
$L__BB0_1975:
@%p1717 bra $L__BB0_1980;
neg.s32 %r7736, %r7672;
setp.ge.s32 %p1730, %r11, %r7736;
@%p1730 bra $L__BB0_1978;
st.global.f32 [%rd583+105840], %f6014;
$L__BB0_1978:
mov.u32 %r7737, -32;
sub.s32 %r7738, %r7737, %r7672;
setp.ge.s32 %p1731, %r11, %r7738;
@%p1731 bra $L__BB0_1980;
st.global.f32 [%rd583+105968], %f6013;
$L__BB0_1980:
@%p1720 bra $L__BB0_1986;
neg.s32 %r7739, %r7672;
setp.ge.s32 %p1733, %r11, %r7739;
@%p1733 bra $L__BB0_1983;
st.global.f32 [%rd583+116424], %f6012;
$L__BB0_1983:
mov.u32 %r7740, -32;
sub.s32 %r7741, %r7740, %r7672;
setp.ge.s32 %p1734, %r11, %r7741;
@%p1734 bra $L__BB0_1986;
st.global.f32 [%rd583+116552], %f6011;
$L__BB0_1986:
setp.lt.s32 %p1735, %r2498, 0;
and.pred %p1737, %p33, %p1735;
@%p1737 bra $L__BB0_2015;
bra.uni $L__BB0_1987;
$L__BB0_2015:
add.f32 %f5213, %f2200, %f5997;
st.global.f32 [%rd583+127008], %f5213;
add.f32 %f5214, %f2200, %f5996;
st.global.f32 [%rd583+127136], %f5214;
add.f32 %f5215, %f2200, %f5995;
st.global.f32 [%rd583+137592], %f5215;
add.f32 %f5216, %f2200, %f5994;
st.global.f32 [%rd583+137720], %f5216;
add.f32 %f5217, %f2200, %f5993;
st.global.f32 [%rd583+148176], %f5217;
add.f32 %f5218, %f2200, %f5992;
st.global.f32 [%rd583+148304], %f5218;
add.f32 %f5219, %f2200, %f5991;
st.global.f32 [%rd583+158760], %f5219;
add.f32 %f5220, %f2200, %f5990;
st.global.f32 [%rd583+158888], %f5220;
bra.uni $L__BB0_2016;
$L__BB0_1987:
add.s32 %r2537, %r2498, 12;
setp.gt.s32 %p1738, %r2537, 14;
@%p1738 bra $L__BB0_1989;
neg.s32 %r7743, %r7672;
setp.lt.s32 %p1739, %r11, %r7743;
add.f32 %f5205, %f2200, %f5997;
selp.f32 %f6034, %f5205, %f6034, %p1739;
mov.u32 %r7744, -32;
sub.s32 %r7745, %r7744, %r7672;
setp.lt.s32 %p1740, %r11, %r7745;
add.f32 %f5206, %f2200, %f5996;
selp.f32 %f6033, %f5206, %f6033, %p1740;
$L__BB0_1989:
add.s32 %r2538, %r2498, 13;
setp.gt.s32 %p1741, %r2538, 14;
@%p1741 bra $L__BB0_1991;
neg.s32 %r7747, %r7672;
setp.lt.s32 %p1742, %r11, %r7747;
add.f32 %f5207, %f2200, %f5995;
selp.f32 %f6016, %f5207, %f6016, %p1742;
mov.u32 %r7748, -32;
sub.s32 %r7749, %r7748, %r7672;
setp.lt.s32 %p1743, %r11, %r7749;
add.f32 %f5208, %f2200, %f5994;
selp.f32 %f6015, %f5208, %f6015, %p1743;
$L__BB0_1991:
add.s32 %r2539, %r2498, 14;
setp.gt.s32 %p1744, %r2539, 14;
@%p1744 bra $L__BB0_1993;
neg.s32 %r7751, %r7672;
setp.lt.s32 %p1745, %r11, %r7751;
add.f32 %f5209, %f2200, %f5993;
selp.f32 %f6014, %f5209, %f6014, %p1745;
mov.u32 %r7752, -32;
sub.s32 %r7753, %r7752, %r7672;
setp.lt.s32 %p1746, %r11, %r7753;
add.f32 %f5210, %f2200, %f5992;
selp.f32 %f6013, %f5210, %f6013, %p1746;
$L__BB0_1993:
add.s32 %r2540, %r2498, 15;
setp.gt.s32 %p1747, %r2540, 14;
@%p1747 bra $L__BB0_1995;
neg.s32 %r7755, %r7672;
setp.lt.s32 %p1748, %r11, %r7755;
add.f32 %f5211, %f2200, %f5991;
selp.f32 %f6012, %f5211, %f6012, %p1748;
mov.u32 %r7756, -32;
sub.s32 %r7757, %r7756, %r7672;
setp.lt.s32 %p1749, %r11, %r7757;
add.f32 %f5212, %f2200, %f5990;
selp.f32 %f6011, %f5212, %f6011, %p1749;
$L__BB0_1995:
@%p1738 bra $L__BB0_2000;
neg.s32 %r7758, %r7672;
setp.ge.s32 %p1751, %r11, %r7758;
@%p1751 bra $L__BB0_1998;
st.global.f32 [%rd583+127008], %f6034;
$L__BB0_1998:
mov.u32 %r7759, -32;
sub.s32 %r7760, %r7759, %r7672;
setp.ge.s32 %p1752, %r11, %r7760;
@%p1752 bra $L__BB0_2000;
st.global.f32 [%rd583+127136], %f6033;
$L__BB0_2000:
@%p1741 bra $L__BB0_2005;
neg.s32 %r7761, %r7672;
setp.ge.s32 %p1754, %r11, %r7761;
@%p1754 bra $L__BB0_2003;
st.global.f32 [%rd583+137592], %f6016;
$L__BB0_2003:
mov.u32 %r7762, -32;
sub.s32 %r7763, %r7762, %r7672;
setp.ge.s32 %p1755, %r11, %r7763;
@%p1755 bra $L__BB0_2005;
st.global.f32 [%rd583+137720], %f6015;
$L__BB0_2005:
@%p1744 bra $L__BB0_2010;
neg.s32 %r7764, %r7672;
setp.ge.s32 %p1757, %r11, %r7764;
@%p1757 bra $L__BB0_2008;
st.global.f32 [%rd583+148176], %f6014;
$L__BB0_2008:
mov.u32 %r7765, -32;
sub.s32 %r7766, %r7765, %r7672;
setp.ge.s32 %p1758, %r11, %r7766;
@%p1758 bra $L__BB0_2010;
st.global.f32 [%rd583+148304], %f6013;
$L__BB0_2010:
@%p1747 bra $L__BB0_2016;
neg.s32 %r7767, %r7672;
setp.ge.s32 %p1760, %r11, %r7767;
@%p1760 bra $L__BB0_2013;
st.global.f32 [%rd583+158760], %f6012;
$L__BB0_2013:
mov.u32 %r7768, -32;
sub.s32 %r7769, %r7768, %r7672;
setp.ge.s32 %p1761, %r11, %r7769;
@%p1761 bra $L__BB0_2016;
st.global.f32 [%rd583+158888], %f6011;
$L__BB0_2016:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -24,20692 +24,20457 @@
)
{
.local .align 4 .b8 __local_depot0[28];
.reg .b64 %SP;
.reg .b64 %SPL;
- .reg .pred %p<1830>;
- .reg .f32 %f<5935>;
- .reg .b32 %r<8741>;
+ .reg .pred %p<1789>;
+ .reg .f32 %f<6059>;
+ .reg .b32 %r<8482>;
.reg .f64 %fd<257>;
- .reg .b64 %rd<2796>;
+ .reg .b64 %rd<2793>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s;
mov.u64 %SPL, __local_depot0;
- ld.param.v2.u32 {%r2601, %r2602}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+24];
- ld.param.v2.u32 {%r2603, %r2604}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+32];
- ld.param.v2.u32 {%r2605, %r2606}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+40];
- ld.param.v2.u32 {%r2611, %r2612}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+24];
- ld.param.v2.u32 {%r2613, %r2614}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+32];
- ld.param.v2.u32 {%r2615, %r2616}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+40];
- ld.param.u64 %rd609, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1];
- ld.param.u64 %rd608, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0];
- ld.param.u64 %rd610, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_2];
+ ld.param.v2.u32 {%r2575, %r2576}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+24];
+ ld.param.v2.u32 {%r2577, %r2578}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+32];
+ ld.param.v2.u32 {%r2579, %r2580}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0+40];
+ ld.param.v2.u32 {%r2585, %r2586}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+24];
+ ld.param.v2.u32 {%r2587, %r2588}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+32];
+ ld.param.v2.u32 {%r2589, %r2590}, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1+40];
+ ld.param.u64 %rd585, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_1];
+ ld.param.u64 %rd584, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_0];
add.u64 %rd1, %SPL, 0;
- cvta.to.global.u64 %rd2, %rd608;
- cvta.to.global.u64 %rd3, %rd609;
- cvta.to.global.u64 %rd4, %rd610;
+ cvta.to.global.u64 %rd2, %rd584;
+ cvta.to.global.u64 %rd3, %rd585;
mov.u32 %r1, %tid.x;
setp.ne.s32 %p31, %r1, 0;
@%p31 bra $L__BB0_2;
- mov.u32 %r2617, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r2617;
+ mov.u32 %r2597, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s], %r2597;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd612, _ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r2618, [%rd612], %r1;
+ mov.u64 %rd588, _ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s;
+ atom.shared.min.s32 %r2598, [%rd588], %r1;
+ add.s32 %r11, %r1, -63;
ld.shared.u32 %r12, [_ZZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEEE14nvfuser_zero_s];
- mov.u32 %r13, %tid.z;
- add.s32 %r14, %r1, -63;
- add.s64 %rd5, %rd1, 24;
- setp.gt.s32 %p32, %r1, 30;
- @%p32 bra $L__BB0_4;
-
- shl.b32 %r2619, %r12, 2;
- neg.s32 %r2620, %r2619;
- add.s32 %r2621, %r13, -12;
- setp.lt.s32 %p33, %r2621, %r2620;
- @%p33 bra $L__BB0_283;
- bra.uni $L__BB0_4;
-
-$L__BB0_283:
- mov.u32 %r3359, %ctaid.x;
- shl.b32 %r3360, %r12, 5;
- add.s32 %r3361, %r3360, %r1;
- mul.hi.s32 %r3362, %r3361, -1840700269;
- add.s32 %r3363, %r3362, %r3361;
- shr.u32 %r3364, %r3363, 31;
- shr.s32 %r3365, %r3363, 2;
- add.s32 %r3366, %r3365, %r3364;
- mul.lo.s32 %r3367, %r3366, 7;
- sub.s32 %r3368, %r3361, %r3367;
- mul.lo.s32 %r3369, %r13, %r2613;
- mad.lo.s32 %r3370, %r2614, %r3359, %r3369;
- mad.lo.s32 %r3371, %r3366, %r2615, %r3370;
- mad.lo.s32 %r3372, %r3368, %r2616, %r3371;
- mul.wide.s32 %rd869, %r3372, 4;
- add.s64 %rd870, %rd3, %rd869;
- ld.global.f32 %f296, [%rd870];
- add.s32 %r3373, %r3361, 32;
- mul.hi.s32 %r3374, %r3373, -1840700269;
- add.s32 %r3375, %r3374, %r3373;
- shr.u32 %r3376, %r3375, 31;
- shr.s32 %r3377, %r3375, 2;
- add.s32 %r3378, %r3377, %r3376;
- mul.lo.s32 %r3379, %r3378, 7;
- sub.s32 %r3380, %r3373, %r3379;
- mad.lo.s32 %r3381, %r3378, %r2615, %r3370;
- mad.lo.s32 %r3382, %r3380, %r2616, %r3381;
- mul.wide.s32 %rd871, %r3382, 4;
- add.s64 %rd872, %rd3, %rd871;
- ld.global.f32 %f297, [%rd872];
- mul.wide.s32 %rd873, %r2613, 4;
- add.s64 %rd874, %rd870, %rd873;
- ld.global.f32 %f298, [%rd874];
- add.s64 %rd875, %rd872, %rd873;
- ld.global.f32 %f299, [%rd875];
- add.s64 %rd876, %rd874, %rd873;
- ld.global.f32 %f300, [%rd876];
- add.s64 %rd877, %rd875, %rd873;
- ld.global.f32 %f301, [%rd877];
- add.s64 %rd878, %rd876, %rd873;
- ld.global.f32 %f302, [%rd878];
- add.s64 %rd879, %rd877, %rd873;
- ld.global.f32 %f303, [%rd879];
- mul.lo.s32 %r3383, %r2604, %r3359;
- mul.hi.s32 %r3384, %r3361, 954437177;
- shr.u32 %r3385, %r3384, 31;
- shr.s32 %r3386, %r3384, 1;
- add.s32 %r3387, %r3386, %r3385;
- mul.lo.s32 %r3388, %r3387, %r2605;
- mul.lo.s32 %r3389, %r3387, 9;
- sub.s32 %r3390, %r3361, %r3389;
- mul.lo.s32 %r3391, %r3390, %r2606;
- mul.lo.s32 %r3392, %r13, %r2603;
- add.s32 %r3393, %r3383, %r3392;
- add.s32 %r3394, %r3393, %r3388;
- add.s32 %r3395, %r3394, %r3391;
- mul.wide.s32 %rd880, %r3395, 4;
- add.s64 %rd881, %rd2, %rd880;
- ld.global.f32 %f304, [%rd881];
- mul.hi.s32 %r3396, %r3373, 954437177;
- shr.u32 %r3397, %r3396, 31;
- shr.s32 %r3398, %r3396, 1;
- add.s32 %r3399, %r3398, %r3397;
- mul.lo.s32 %r3400, %r3399, %r2605;
- mul.lo.s32 %r3401, %r3399, 9;
- sub.s32 %r3402, %r3373, %r3401;
- mul.lo.s32 %r3403, %r3402, %r2606;
- add.s32 %r3404, %r3393, %r3400;
- add.s32 %r3405, %r3404, %r3403;
- mul.wide.s32 %rd882, %r3405, 4;
- add.s64 %rd883, %rd2, %rd882;
- ld.global.f32 %f305, [%rd883];
- mul.wide.s32 %rd884, %r2603, 4;
- add.s64 %rd885, %rd881, %rd884;
- ld.global.f32 %f306, [%rd885];
- add.s64 %rd886, %rd883, %rd884;
- ld.global.f32 %f307, [%rd886];
- add.s64 %rd887, %rd885, %rd884;
- ld.global.f32 %f308, [%rd887];
- add.s64 %rd888, %rd886, %rd884;
- ld.global.f32 %f309, [%rd888];
- add.s32 %r3406, %r2602, %r3383;
- add.s32 %r3407, %r3406, %r3392;
- add.s32 %r3408, %r3407, %r3388;
- add.s32 %r3409, %r3408, %r3391;
- mul.wide.s32 %rd889, %r3409, 4;
- add.s64 %rd890, %rd2, %rd889;
- ld.global.f32 %f310, [%rd890];
- add.s32 %r3410, %r3407, %r3400;
- add.s32 %r3411, %r3410, %r3403;
- mul.wide.s32 %rd891, %r3411, 4;
- add.s64 %rd892, %rd2, %rd891;
- ld.global.f32 %f311, [%rd892];
- mul.f32 %f2610, %f304, 0f3F22F983;
- cvt.rni.s32.f32 %r8293, %f2610;
- cvt.rn.f32.s32 %f2611, %r8293;
- mov.f32 %f2612, 0fBFC90FDA;
- fma.rn.f32 %f2613, %f2611, %f2612, %f304;
- mov.f32 %f2614, 0fB3A22168;
- fma.rn.f32 %f2615, %f2611, %f2614, %f2613;
- mov.f32 %f2616, 0fA7C234C5;
- fma.rn.f32 %f5277, %f2611, %f2616, %f2615;
- abs.f32 %f313, %f304;
- setp.ltu.f32 %p264, %f313, 0f47CE4780;
- @%p264 bra $L__BB0_291;
-
- setp.eq.f32 %p265, %f313, 0f7F800000;
- @%p265 bra $L__BB0_290;
- bra.uni $L__BB0_285;
-
-$L__BB0_290:
- mov.f32 %f2619, 0f00000000;
- mul.rn.f32 %f5277, %f304, %f2619;
- mov.u32 %r8293, 0;
- bra.uni $L__BB0_291;
-
-$L__BB0_4:
- mov.u32 %r15, %ctaid.x;
- add.s32 %r2622, %r13, -15;
- neg.s32 %r16, %r12;
- setp.ge.s32 %p34, %r2622, %r16;
- mul.lo.s32 %r2623, %r13, %r2613;
- mad.lo.s32 %r17, %r2614, %r15, %r2623;
- @%p34 bra $L__BB0_7;
+ setp.lt.s32 %p32, %r12, 3;
+ setp.lt.s32 %p33, %r1, 31;
+ and.pred %p34, %p33, %p32;
+ mov.u32 %r13, %ctaid.x;
+ mul.lo.s32 %r14, %r2588, %r13;
+ add.s64 %rd4, %rd1, 24;
+ @%p34 bra $L__BB0_274;
+ bra.uni $L__BB0_3;
+
+$L__BB0_274:
+ shl.b32 %r3303, %r12, 5;
+ add.s32 %r3304, %r3303, %r1;
+ mul.hi.s32 %r3305, %r3304, -1840700269;
+ add.s32 %r3306, %r3305, %r3304;
+ shr.u32 %r3307, %r3306, 31;
+ shr.s32 %r3308, %r3306, 2;
+ add.s32 %r3309, %r3308, %r3307;
+ mad.lo.s32 %r3310, %r3309, %r2589, %r14;
+ mul.lo.s32 %r3311, %r3309, 7;
+ sub.s32 %r3312, %r3304, %r3311;
+ mad.lo.s32 %r3313, %r3312, %r2590, %r3310;
+ mul.wide.s32 %rd845, %r3313, 4;
+ add.s64 %rd846, %rd3, %rd845;
+ ld.global.f32 %f304, [%rd846];
+ add.s32 %r3314, %r3304, 32;
+ mul.hi.s32 %r3315, %r3314, -1840700269;
+ add.s32 %r3316, %r3315, %r3314;
+ shr.u32 %r3317, %r3316, 31;
+ shr.s32 %r3318, %r3316, 2;
+ add.s32 %r3319, %r3318, %r3317;
+ mad.lo.s32 %r3320, %r3319, %r2589, %r14;
+ mul.lo.s32 %r3321, %r3319, 7;
+ sub.s32 %r3322, %r3314, %r3321;
+ mad.lo.s32 %r3323, %r3322, %r2590, %r3320;
+ mul.wide.s32 %rd847, %r3323, 4;
+ add.s64 %rd848, %rd3, %rd847;
+ ld.global.f32 %f305, [%rd848];
+ mul.wide.s32 %rd849, %r2587, 4;
+ add.s64 %rd850, %rd846, %rd849;
+ ld.global.f32 %f306, [%rd850];
+ add.s64 %rd851, %rd848, %rd849;
+ ld.global.f32 %f307, [%rd851];
+ add.s64 %rd852, %rd850, %rd849;
+ ld.global.f32 %f308, [%rd852];
+ add.s64 %rd853, %rd851, %rd849;
+ ld.global.f32 %f309, [%rd853];
+ add.s64 %rd854, %rd852, %rd849;
+ ld.global.f32 %f310, [%rd854];
+ add.s64 %rd855, %rd853, %rd849;
+ ld.global.f32 %f311, [%rd855];
+ mul.hi.s32 %r3324, %r3304, 954437177;
+ shr.u32 %r3325, %r3324, 31;
+ shr.s32 %r3326, %r3324, 1;
+ add.s32 %r3327, %r3326, %r3325;
+ mul.lo.s32 %r3328, %r3327, %r2579;
+ mul.lo.s32 %r3329, %r2578, %r13;
+ add.s32 %r3330, %r3329, %r3328;
+ mul.lo.s32 %r3331, %r3327, 9;
+ sub.s32 %r3332, %r3304, %r3331;
+ mul.lo.s32 %r3333, %r3332, %r2580;
+ add.s32 %r3334, %r3330, %r3333;
+ mul.wide.s32 %rd856, %r3334, 4;
+ add.s64 %rd857, %rd2, %rd856;
+ ld.global.f32 %f312, [%rd857];
+ mul.hi.s32 %r3335, %r3314, 954437177;
+ shr.u32 %r3336, %r3335, 31;
+ shr.s32 %r3337, %r3335, 1;
+ add.s32 %r3338, %r3337, %r3336;
+ mul.lo.s32 %r3339, %r3338, %r2579;
+ add.s32 %r3340, %r3329, %r3339;
+ mul.lo.s32 %r3341, %r3338, 9;
+ sub.s32 %r3342, %r3314, %r3341;
+ mul.lo.s32 %r3343, %r3342, %r2580;
+ add.s32 %r3344, %r3340, %r3343;
+ mul.wide.s32 %rd858, %r3344, 4;
+ add.s64 %rd859, %rd2, %rd858;
+ ld.global.f32 %f313, [%rd859];
+ mul.wide.s32 %rd860, %r2577, 4;
+ add.s64 %rd861, %rd857, %rd860;
+ ld.global.f32 %f314, [%rd861];
+ add.s64 %rd862, %rd859, %rd860;
+ ld.global.f32 %f315, [%rd862];
+ add.s64 %rd863, %rd861, %rd860;
+ ld.global.f32 %f316, [%rd863];
+ add.s64 %rd864, %rd862, %rd860;
+ ld.global.f32 %f317, [%rd864];
+ add.s32 %r3345, %r2576, %r3329;
+ add.s32 %r3346, %r3345, %r3328;
+ add.s32 %r3347, %r3346, %r3333;
+ mul.wide.s32 %rd865, %r3347, 4;
+ add.s64 %rd866, %rd2, %rd865;
+ ld.global.f32 %f318, [%rd866];
+ add.s32 %r3348, %r3345, %r3339;
+ add.s32 %r3349, %r3348, %r3343;
+ mul.wide.s32 %rd867, %r3349, 4;
+ add.s64 %rd868, %rd2, %rd867;
+ ld.global.f32 %f319, [%rd868];
+ mul.f32 %f2666, %f312, 0f3F22F983;
+ cvt.rni.s32.f32 %r8034, %f2666;
+ cvt.rn.f32.s32 %f2667, %r8034;
+ mov.f32 %f2668, 0fBFC90FDA;
+ fma.rn.f32 %f2669, %f2667, %f2668, %f312;
+ mov.f32 %f2670, 0fB3A22168;
+ fma.rn.f32 %f2671, %f2667, %f2670, %f2669;
+ mov.f32 %f2672, 0fA7C234C5;
+ fma.rn.f32 %f5345, %f2667, %f2672, %f2671;
+ abs.f32 %f321, %f312;
+ setp.ltu.f32 %p257, %f321, 0f47CE4780;
+ @%p257 bra $L__BB0_282;
+
+ setp.eq.f32 %p258, %f321, 0f7F800000;
+ @%p258 bra $L__BB0_281;
+ bra.uni $L__BB0_276;
+
+$L__BB0_281:
+ mov.f32 %f2675, 0f00000000;
+ mul.rn.f32 %f5345, %f312, %f2675;
+ mov.u32 %r8034, 0;
+ bra.uni $L__BB0_282;
+
+$L__BB0_3:
+ setp.gt.s32 %p35, %r12, 14;
+ @%p35 bra $L__BB0_8;
+
+ shl.b32 %r15, %r12, 5;
+ neg.s32 %r2599, %r15;
+ setp.ge.s32 %p36, %r11, %r2599;
+ @%p36 bra $L__BB0_6;
+
+ add.s32 %r2600, %r15, %r1;
+ mul.hi.s32 %r2601, %r2600, -1840700269;
+ add.s32 %r2602, %r2601, %r2600;
+ shr.u32 %r2603, %r2602, 31;
+ shr.s32 %r2604, %r2602, 2;
+ add.s32 %r2605, %r2604, %r2603;
+ mad.lo.s32 %r2606, %r2605, %r2589, %r14;
+ mul.lo.s32 %r2607, %r2605, 7;
+ sub.s32 %r2608, %r2600, %r2607;
+ mad.lo.s32 %r2609, %r2608, %r2590, %r2606;
+ mul.wide.s32 %rd589, %r2609, 4;
+ add.s64 %rd590, %rd3, %rd589;
+ ld.global.f32 %f5607, [%rd590];
+
+$L__BB0_6:
+ shl.b32 %r7788, %r12, 5;
+ mov.u32 %r2610, -32;
+ sub.s32 %r2611, %r2610, %r7788;
+ setp.ge.s32 %p37, %r11, %r2611;
+ @%p37 bra $L__BB0_8;
+
+ shl.b32 %r7790, %r12, 5;
+ add.s32 %r2612, %r7790, %r1;
+ add.s32 %r2613, %r2612, 32;
+ mul.hi.s32 %r2614, %r2613, -1840700269;
+ add.s32 %r2615, %r2614, %r2613;
+ shr.u32 %r2616, %r2615, 31;
+ shr.s32 %r2617, %r2615, 2;
+ add.s32 %r2618, %r2617, %r2616;
+ mad.lo.s32 %r2619, %r2618, %r2589, %r14;
+ mul.lo.s32 %r2620, %r2618, 7;
+ sub.s32 %r2621, %r2613, %r2620;
+ mad.lo.s32 %r2622, %r2621, %r2590, %r2619;
+ mul.wide.s32 %rd591, %r2622, 4;
+ add.s64 %rd592, %rd3, %rd591;
+ ld.global.f32 %f5606, [%rd592];
+
+$L__BB0_8:
+ add.s32 %r16, %r12, 1;
+ setp.gt.s32 %p38, %r16, 14;
+ add.s32 %r17, %r14, %r2587;
+ @%p38 bra $L__BB0_13;
shl.b32 %r18, %r12, 5;
- neg.s32 %r2624, %r18;
- setp.ge.s32 %p35, %r14, %r2624;
- @%p35 bra $L__BB0_7;
-
- add.s32 %r2625, %r18, %r1;
- mul.hi.s32 %r2626, %r2625, -1840700269;
- add.s32 %r2627, %r2626, %r2625;
- shr.u32 %r2628, %r2627, 31;
- shr.s32 %r2629, %r2627, 2;
- add.s32 %r2630, %r2629, %r2628;
- mul.lo.s32 %r2631, %r2630, 7;
- sub.s32 %r2632, %r2625, %r2631;
- mad.lo.s32 %r2633, %r2630, %r2615, %r17;
- mad.lo.s32 %r2634, %r2632, %r2616, %r2633;
- mul.wide.s32 %rd613, %r2634, 4;
- add.s64 %rd614, %rd3, %rd613;
- ld.global.f32 %f5531, [%rd614];
-
-$L__BB0_7:
- @%p34 bra $L__BB0_10;
-
- shl.b32 %r19, %r12, 5;
- mov.u32 %r2636, -32;
- sub.s32 %r2637, %r2636, %r19;
- setp.ge.s32 %p37, %r14, %r2637;
- @%p37 bra $L__BB0_10;
-
- add.s32 %r2638, %r19, %r1;
- add.s32 %r2639, %r2638, 32;
- mul.hi.s32 %r2640, %r2639, -1840700269;
- add.s32 %r2641, %r2640, %r2639;
- shr.u32 %r2642, %r2641, 31;
- shr.s32 %r2643, %r2641, 2;
- add.s32 %r2644, %r2643, %r2642;
- mul.lo.s32 %r2645, %r2644, 7;
- sub.s32 %r2646, %r2639, %r2645;
- mad.lo.s32 %r2647, %r2644, %r2615, %r17;
- mad.lo.s32 %r2648, %r2646, %r2616, %r2647;
- mul.wide.s32 %rd615, %r2648, 4;
- add.s64 %rd616, %rd3, %rd615;
- ld.global.f32 %f5339, [%rd616];
-
-$L__BB0_10:
- add.s32 %r8185, %r13, -15;
- not.b32 %r20, %r12;
- setp.ge.s32 %p38, %r8185, %r20;
- add.s32 %r21, %r17, %r2613;
- @%p38 bra $L__BB0_13;
-
- shl.b32 %r22, %r12, 5;
- neg.s32 %r2650, %r22;
- setp.ge.s32 %p39, %r14, %r2650;
- @%p39 bra $L__BB0_13;
-
- add.s32 %r2651, %r22, %r1;
- mul.hi.s32 %r2652, %r2651, -1840700269;
+ neg.s32 %r2623, %r18;
+ setp.ge.s32 %p39, %r11, %r2623;
+ @%p39 bra $L__BB0_11;
+
+ add.s32 %r2624, %r18, %r1;
+ mul.hi.s32 %r2625, %r2624, -1840700269;
+ add.s32 %r2626, %r2625, %r2624;
+ shr.u32 %r2627, %r2626, 31;
+ shr.s32 %r2628, %r2626, 2;
+ add.s32 %r2629, %r2628, %r2627;
+ mad.lo.s32 %r2630, %r2629, %r2589, %r17;
+ mul.lo.s32 %r2631, %r2629, 7;
+ sub.s32 %r2632, %r2624, %r2631;
+ mad.lo.s32 %r2633, %r2632, %r2590, %r2630;
+ mul.wide.s32 %rd593, %r2633, 4;
+ add.s64 %rd594, %rd3, %rd593;
+ ld.global.f32 %f5406, [%rd594];
+
+$L__BB0_11:
+ shl.b32 %r7791, %r12, 5;
+ mov.u32 %r2634, -32;
+ sub.s32 %r2635, %r2634, %r7791;
+ setp.ge.s32 %p40, %r11, %r2635;
+ @%p40 bra $L__BB0_13;
+
+ shl.b32 %r7792, %r12, 5;
+ add.s32 %r2636, %r7792, %r1;
+ add.s32 %r2637, %r2636, 32;
+ mul.hi.s32 %r2638, %r2637, -1840700269;
+ add.s32 %r2639, %r2638, %r2637;
+ shr.u32 %r2640, %r2639, 31;
+ shr.s32 %r2641, %r2639, 2;
+ add.s32 %r2642, %r2641, %r2640;
+ mad.lo.s32 %r2643, %r2642, %r2589, %r17;
+ mul.lo.s32 %r2644, %r2642, 7;
+ sub.s32 %r2645, %r2637, %r2644;
+ mad.lo.s32 %r2646, %r2645, %r2590, %r2643;
+ mul.wide.s32 %rd595, %r2646, 4;
+ add.s64 %rd596, %rd3, %rd595;
+ ld.global.f32 %f5405, [%rd596];
+
+$L__BB0_13:
+ add.s32 %r19, %r12, 2;
+ setp.gt.s32 %p41, %r19, 14;
+ add.s32 %r20, %r17, %r2587;
+ @%p41 bra $L__BB0_18;
+
+ shl.b32 %r21, %r12, 5;
+ neg.s32 %r2647, %r21;
+ setp.ge.s32 %p42, %r11, %r2647;
+ @%p42 bra $L__BB0_16;
+
+ add.s32 %r2648, %r21, %r1;
+ mul.hi.s32 %r2649, %r2648, -1840700269;
+ add.s32 %r2650, %r2649, %r2648;
+ shr.u32 %r2651, %r2650, 31;
+ shr.s32 %r2652, %r2650, 2;
add.s32 %r2653, %r2652, %r2651;
- shr.u32 %r2654, %r2653, 31;
- shr.s32 %r2655, %r2653, 2;
- add.s32 %r2656, %r2655, %r2654;
- mul.lo.s32 %r2657, %r2656, 7;
- sub.s32 %r2658, %r2651, %r2657;
- mad.lo.s32 %r2659, %r2656, %r2615, %r21;
- mad.lo.s32 %r2660, %r2658, %r2616, %r2659;
- mul.wide.s32 %rd617, %r2660, 4;
- add.s64 %rd618, %rd3, %rd617;
- ld.global.f32 %f5338, [%rd618];
-
-$L__BB0_13:
- not.b32 %r8225, %r12;
- add.s32 %r8224, %r13, -15;
- setp.ge.s32 %p1829, %r8224, %r8225;
- @%p1829 bra $L__BB0_16;
-
- shl.b32 %r23, %r12, 5;
- mov.u32 %r2662, -32;
- sub.s32 %r2663, %r2662, %r23;
- setp.ge.s32 %p41, %r14, %r2663;
- @%p41 bra $L__BB0_16;
-
- add.s32 %r2664, %r23, %r1;
- add.s32 %r2665, %r2664, 32;
- mul.hi.s32 %r2666, %r2665, -1840700269;
- add.s32 %r2667, %r2666, %r2665;
- shr.u32 %r2668, %r2667, 31;
- shr.s32 %r2669, %r2667, 2;
- add.s32 %r2670, %r2669, %r2668;
- mul.lo.s32 %r2671, %r2670, 7;
- sub.s32 %r2672, %r2665, %r2671;
- mad.lo.s32 %r2673, %r2670, %r2615, %r21;
- mad.lo.s32 %r2674, %r2672, %r2616, %r2673;
- mul.wide.s32 %rd619, %r2674, 4;
- add.s64 %rd620, %rd3, %rd619;
- ld.global.f32 %f5337, [%rd620];
+ mad.lo.s32 %r2654, %r2653, %r2589, %r20;
+ mul.lo.s32 %r2655, %r2653, 7;
+ sub.s32 %r2656, %r2648, %r2655;
+ mad.lo.s32 %r2657, %r2656, %r2590, %r2654;
+ mul.wide.s32 %rd597, %r2657, 4;
+ add.s64 %rd598, %rd3, %rd597;
+ ld.global.f32 %f5404, [%rd598];
$L__BB0_16:
- add.s32 %r8186, %r13, -15;
- mov.u32 %r2676, -2;
- sub.s32 %r24, %r2676, %r12;
- setp.ge.s32 %p42, %r8186, %r24;
- add.s32 %r25, %r21, %r2613;
- @%p42 bra $L__BB0_19;
+ shl.b32 %r7793, %r12, 5;
+ mov.u32 %r2658, -32;
+ sub.s32 %r2659, %r2658, %r7793;
+ setp.ge.s32 %p43, %r11, %r2659;
+ @%p43 bra $L__BB0_18;
+
+ shl.b32 %r7795, %r12, 5;
+ add.s32 %r2660, %r7795, %r1;
+ add.s32 %r2661, %r2660, 32;
+ mul.hi.s32 %r2662, %r2661, -1840700269;
+ add.s32 %r2663, %r2662, %r2661;
+ shr.u32 %r2664, %r2663, 31;
+ shr.s32 %r2665, %r2663, 2;
+ add.s32 %r2666, %r2665, %r2664;
+ mad.lo.s32 %r2667, %r2666, %r2589, %r20;
+ mul.lo.s32 %r2668, %r2666, 7;
+ sub.s32 %r2669, %r2661, %r2668;
+ mad.lo.s32 %r2670, %r2669, %r2590, %r2667;
+ mul.wide.s32 %rd599, %r2670, 4;
+ add.s64 %rd600, %rd3, %rd599;
+ ld.global.f32 %f5403, [%rd600];
+
+$L__BB0_18:
+ add.s32 %r22, %r12, 3;
+ setp.gt.s32 %p44, %r22, 14;
+ add.s32 %r23, %r20, %r2587;
+ @%p44 bra $L__BB0_23;
+
+ shl.b32 %r24, %r12, 5;
+ neg.s32 %r2671, %r24;
+ setp.ge.s32 %p45, %r11, %r2671;
+ @%p45 bra $L__BB0_21;
+
+ add.s32 %r2672, %r24, %r1;
+ mul.hi.s32 %r2673, %r2672, -1840700269;
+ add.s32 %r2674, %r2673, %r2672;
+ shr.u32 %r2675, %r2674, 31;
+ shr.s32 %r2676, %r2674, 2;
+ add.s32 %r2677, %r2676, %r2675;
+ mad.lo.s32 %r2678, %r2677, %r2589, %r23;
+ mul.lo.s32 %r2679, %r2677, 7;
+ sub.s32 %r2680, %r2672, %r2679;
+ mad.lo.s32 %r2681, %r2680, %r2590, %r2678;
+ mul.wide.s32 %rd601, %r2681, 4;
+ add.s64 %rd602, %rd3, %rd601;
+ ld.global.f32 %f5402, [%rd602];
+
+$L__BB0_21:
+ shl.b32 %r7796, %r12, 5;
+ mov.u32 %r2682, -32;
+ sub.s32 %r2683, %r2682, %r7796;
+ setp.ge.s32 %p46, %r11, %r2683;
+ @%p46 bra $L__BB0_23;
+
+ shl.b32 %r7798, %r12, 5;
+ add.s32 %r2684, %r7798, %r1;
+ add.s32 %r2685, %r2684, 32;
+ mul.hi.s32 %r2686, %r2685, -1840700269;
+ add.s32 %r2687, %r2686, %r2685;
+ shr.u32 %r2688, %r2687, 31;
+ shr.s32 %r2689, %r2687, 2;
+ add.s32 %r2690, %r2689, %r2688;
+ mad.lo.s32 %r2691, %r2690, %r2589, %r23;
+ mul.lo.s32 %r2692, %r2690, 7;
+ sub.s32 %r2693, %r2685, %r2692;
+ mad.lo.s32 %r2694, %r2693, %r2590, %r2691;
+ mul.wide.s32 %rd603, %r2694, 4;
+ add.s64 %rd604, %rd3, %rd603;
+ ld.global.f32 %f5401, [%rd604];
+
+$L__BB0_23:
+ setp.gt.s32 %p1767, %r12, 14;
+ @%p1767 bra $L__BB0_28;
shl.b32 %r26, %r12, 5;
- neg.s32 %r2677, %r26;
- setp.ge.s32 %p43, %r14, %r2677;
- @%p43 bra $L__BB0_19;
-
- add.s32 %r2678, %r26, %r1;
- mul.hi.s32 %r2679, %r2678, -1840700269;
- add.s32 %r2680, %r2679, %r2678;
- shr.u32 %r2681, %r2680, 31;
- shr.s32 %r2682, %r2680, 2;
- add.s32 %r2683, %r2682, %r2681;
- mul.lo.s32 %r2684, %r2683, 7;
- sub.s32 %r2685, %r2678, %r2684;
- mad.lo.s32 %r2686, %r2683, %r2615, %r25;
- mad.lo.s32 %r2687, %r2685, %r2616, %r2686;
- mul.wide.s32 %rd621, %r2687, 4;
- add.s64 %rd622, %rd3, %rd621;
- ld.global.f32 %f5336, [%rd622];
-
-$L__BB0_19:
- mov.u32 %r8220, -2;
- sub.s32 %r8219, %r8220, %r12;
- add.s32 %r8218, %r13, -15;
- setp.ge.s32 %p1827, %r8218, %r8219;
- @%p1827 bra $L__BB0_22;
-
- shl.b32 %r27, %r12, 5;
- mov.u32 %r2689, -32;
- sub.s32 %r2690, %r2689, %r27;
- setp.ge.s32 %p45, %r14, %r2690;
- @%p45 bra $L__BB0_22;
-
- add.s32 %r2691, %r27, %r1;
- add.s32 %r2692, %r2691, 32;
- mul.hi.s32 %r2693, %r2692, -1840700269;
- add.s32 %r2694, %r2693, %r2692;
- shr.u32 %r2695, %r2694, 31;
- shr.s32 %r2696, %r2694, 2;
- add.s32 %r2697, %r2696, %r2695;
- mul.lo.s32 %r2698, %r2697, 7;
- sub.s32 %r2699, %r2692, %r2698;
- mad.lo.s32 %r2700, %r2697, %r2615, %r25;
- mad.lo.s32 %r2701, %r2699, %r2616, %r2700;
- mul.wide.s32 %rd623, %r2701, 4;
- add.s64 %rd624, %rd3, %rd623;
- ld.global.f32 %f5335, [%rd624];
-
-$L__BB0_22:
- add.s32 %r8051, %r13, -15;
- mov.u32 %r2703, -3;
- sub.s32 %r28, %r2703, %r12;
- setp.ge.s32 %p46, %r8051, %r28;
- add.s32 %r29, %r25, %r2613;
- @%p46 bra $L__BB0_25;
+ neg.s32 %r2695, %r26;
+ setp.ge.s32 %p48, %r11, %r2695;
+ @%p48 bra $L__BB0_26;
+
+ mov.u32 %r7966, %ctaid.x;
+ mul.lo.s32 %r7965, %r2578, %r7966;
+ add.s32 %r2696, %r26, %r1;
+ mul.hi.s32 %r2697, %r2696, 954437177;
+ shr.u32 %r2698, %r2697, 31;
+ shr.s32 %r2699, %r2697, 1;
+ add.s32 %r2700, %r2699, %r2698;
+ mad.lo.s32 %r2701, %r2700, %r2579, %r7965;
+ mul.lo.s32 %r2702, %r2700, 9;
+ sub.s32 %r2703, %r2696, %r2702;
+ mad.lo.s32 %r2704, %r2703, %r2580, %r2701;
+ mul.wide.s32 %rd605, %r2704, 4;
+ add.s64 %rd606, %rd2, %rd605;
+ ld.global.f32 %f5416, [%rd606];
+
+$L__BB0_26:
+ shl.b32 %r7799, %r12, 5;
+ mov.u32 %r2705, -32;
+ sub.s32 %r2706, %r2705, %r7799;
+ setp.ge.s32 %p49, %r11, %r2706;
+ @%p49 bra $L__BB0_28;
+
+ mov.u32 %r7942, %ctaid.x;
+ mul.lo.s32 %r7941, %r2578, %r7942;
+ shl.b32 %r7800, %r12, 5;
+ add.s32 %r2707, %r7800, %r1;
+ add.s32 %r2708, %r2707, 32;
+ mul.hi.s32 %r2709, %r2708, 954437177;
+ shr.u32 %r2710, %r2709, 31;
+ shr.s32 %r2711, %r2709, 1;
+ add.s32 %r2712, %r2711, %r2710;
+ mad.lo.s32 %r2713, %r2712, %r2579, %r7941;
+ mul.lo.s32 %r2714, %r2712, 9;
+ sub.s32 %r2715, %r2708, %r2714;
+ mad.lo.s32 %r2716, %r2715, %r2580, %r2713;
+ mul.wide.s32 %rd607, %r2716, 4;
+ add.s64 %rd608, %rd2, %rd607;
+ ld.global.f32 %f5415, [%rd608];
+
+$L__BB0_28:
+ mov.u32 %r7940, %ctaid.x;
+ mul.lo.s32 %r7939, %r2578, %r7940;
+ add.s32 %r7938, %r12, 1;
+ setp.gt.s32 %p1785, %r7938, 14;
+ add.s32 %r27, %r7939, %r2577;
+ @%p1785 bra $L__BB0_33;
+
+ shl.b32 %r28, %r12, 5;
+ neg.s32 %r2717, %r28;
+ setp.ge.s32 %p51, %r11, %r2717;
+ @%p51 bra $L__BB0_31;
+
+ shl.b32 %r7807, %r12, 5;
+ add.s32 %r2718, %r7807, %r1;
+ mul.hi.s32 %r2719, %r2718, 954437177;
+ shr.u32 %r2720, %r2719, 31;
+ shr.s32 %r2721, %r2719, 1;
+ add.s32 %r2722, %r2721, %r2720;
+ mad.lo.s32 %r2723, %r2722, %r2579, %r27;
+ mul.lo.s32 %r2724, %r2722, 9;
+ sub.s32 %r2725, %r2718, %r2724;
+ mad.lo.s32 %r2726, %r2725, %r2580, %r2723;
+ mul.wide.s32 %rd609, %r2726, 4;
+ add.s64 %rd610, %rd2, %rd609;
+ ld.global.f32 %f5414, [%rd610];
+
+$L__BB0_31:
+ shl.b32 %r7801, %r12, 5;
+ mov.u32 %r2727, -32;
+ sub.s32 %r2728, %r2727, %r7801;
+ setp.ge.s32 %p52, %r11, %r2728;
+ @%p52 bra $L__BB0_33;
+
+ shl.b32 %r7806, %r12, 5;
+ add.s32 %r2729, %r7806, %r1;
+ add.s32 %r2730, %r2729, 32;
+ mul.hi.s32 %r2731, %r2730, 954437177;
+ shr.u32 %r2732, %r2731, 31;
+ shr.s32 %r2733, %r2731, 1;
+ add.s32 %r2734, %r2733, %r2732;
+ mad.lo.s32 %r2735, %r2734, %r2579, %r27;
+ mul.lo.s32 %r2736, %r2734, 9;
+ sub.s32 %r2737, %r2730, %r2736;
+ mad.lo.s32 %r2738, %r2737, %r2580, %r2735;
+ mul.wide.s32 %rd611, %r2738, 4;
+ add.s64 %rd612, %rd2, %rd611;
+ ld.global.f32 %f5413, [%rd612];
+
+$L__BB0_33:
+ add.s32 %r7794, %r12, 2;
+ setp.gt.s32 %p1769, %r7794, 14;
+ @%p1769 bra $L__BB0_38;
shl.b32 %r30, %r12, 5;
- neg.s32 %r2704, %r30;
- setp.ge.s32 %p47, %r14, %r2704;
- @%p47 bra $L__BB0_25;
-
- add.s32 %r2705, %r30, %r1;
- mul.hi.s32 %r2706, %r2705, -1840700269;
- add.s32 %r2707, %r2706, %r2705;
- shr.u32 %r2708, %r2707, 31;
- shr.s32 %r2709, %r2707, 2;
- add.s32 %r2710, %r2709, %r2708;
- mul.lo.s32 %r2711, %r2710, 7;
- sub.s32 %r2712, %r2705, %r2711;
- mad.lo.s32 %r2713, %r2710, %r2615, %r29;
- mad.lo.s32 %r2714, %r2712, %r2616, %r2713;
- mul.wide.s32 %rd625, %r2714, 4;
- add.s64 %rd626, %rd3, %rd625;
- ld.global.f32 %f5334, [%rd626];
-
-$L__BB0_25:
- mov.u32 %r8223, -3;
- sub.s32 %r8222, %r8223, %r12;
- add.s32 %r8221, %r13, -15;
- setp.ge.s32 %p1828, %r8221, %r8222;
- @%p1828 bra $L__BB0_28;
-
- shl.b32 %r31, %r12, 5;
- mov.u32 %r2716, -32;
- sub.s32 %r2717, %r2716, %r31;
- setp.ge.s32 %p49, %r14, %r2717;
- @%p49 bra $L__BB0_28;
-
- add.s32 %r2718, %r31, %r1;
- add.s32 %r2719, %r2718, 32;
- mul.hi.s32 %r2720, %r2719, -1840700269;
- add.s32 %r2721, %r2720, %r2719;
- shr.u32 %r2722, %r2721, 31;
- shr.s32 %r2723, %r2721, 2;
- add.s32 %r2724, %r2723, %r2722;
- mul.lo.s32 %r2725, %r2724, 7;
- sub.s32 %r2726, %r2719, %r2725;
- mad.lo.s32 %r2727, %r2724, %r2615, %r29;
- mad.lo.s32 %r2728, %r2726, %r2616, %r2727;
- mul.wide.s32 %rd627, %r2728, 4;
- add.s64 %rd628, %rd3, %rd627;
- ld.global.f32 %f5333, [%rd628];
-
-$L__BB0_28:
- neg.s32 %r8215, %r12;
- add.s32 %r8214, %r13, -15;
- setp.ge.s32 %p1825, %r8214, %r8215;
- mov.u32 %r8052, %ctaid.x;
- mul.lo.s32 %r32, %r13, %r2603;
- mul.lo.s32 %r33, %r2604, %r8052;
- add.s32 %r34, %r33, %r32;
- @%p1825 bra $L__BB0_31;
-
- shl.b32 %r35, %r12, 5;
- neg.s32 %r2730, %r35;
- setp.ge.s32 %p51, %r14, %r2730;
- @%p51 bra $L__BB0_31;
-
- add.s32 %r2731, %r35, %r1;
- mul.hi.s32 %r2732, %r2731, 954437177;
- shr.u32 %r2733, %r2732, 31;
- shr.s32 %r2734, %r2732, 1;
- add.s32 %r2735, %r2734, %r2733;
- mul.lo.s32 %r2736, %r2735, 9;
- sub.s32 %r2737, %r2731, %r2736;
- mad.lo.s32 %r2738, %r2735, %r2605, %r34;
- mad.lo.s32 %r2739, %r2737, %r2606, %r2738;
- mul.wide.s32 %rd629, %r2739, 4;
- add.s64 %rd630, %rd2, %rd629;
- ld.global.f32 %f5348, [%rd630];
-
-$L__BB0_31:
- neg.s32 %r8194, %r12;
- add.s32 %r8193, %r13, -15;
- setp.ge.s32 %p1820, %r8193, %r8194;
- @%p1820 bra $L__BB0_34;
-
- shl.b32 %r36, %r12, 5;
- mov.u32 %r2741, -32;
- sub.s32 %r2742, %r2741, %r36;
- setp.ge.s32 %p53, %r14, %r2742;
- @%p53 bra $L__BB0_34;
-
- add.s32 %r2743, %r36, %r1;
- add.s32 %r2744, %r2743, 32;
- mul.hi.s32 %r2745, %r2744, 954437177;
- shr.u32 %r2746, %r2745, 31;
- shr.s32 %r2747, %r2745, 1;
- add.s32 %r2748, %r2747, %r2746;
- mul.lo.s32 %r2749, %r2748, 9;
- sub.s32 %r2750, %r2744, %r2749;
- mad.lo.s32 %r2751, %r2748, %r2605, %r34;
- mad.lo.s32 %r2752, %r2750, %r2606, %r2751;
- mul.wide.s32 %rd631, %r2752, 4;
- add.s64 %rd632, %rd2, %rd631;
- ld.global.f32 %f5347, [%rd632];
-
-$L__BB0_34:
- not.b32 %r8217, %r12;
- add.s32 %r8216, %r13, -15;
- setp.ge.s32 %p1826, %r8216, %r8217;
- add.s32 %r37, %r34, %r2603;
- @%p1826 bra $L__BB0_37;
-
- shl.b32 %r38, %r12, 5;
- neg.s32 %r2754, %r38;
- setp.ge.s32 %p55, %r14, %r2754;
- @%p55 bra $L__BB0_37;
-
- add.s32 %r2755, %r38, %r1;
- mul.hi.s32 %r2756, %r2755, 954437177;
- shr.u32 %r2757, %r2756, 31;
- shr.s32 %r2758, %r2756, 1;
- add.s32 %r2759, %r2758, %r2757;
- mul.lo.s32 %r2760, %r2759, 9;
- sub.s32 %r2761, %r2755, %r2760;
- mad.lo.s32 %r2762, %r2759, %r2605, %r37;
- mad.lo.s32 %r2763, %r2761, %r2606, %r2762;
- mul.wide.s32 %rd633, %r2763, 4;
- add.s64 %rd634, %rd2, %rd633;
- ld.global.f32 %f5346, [%rd634];
-
-$L__BB0_37:
- not.b32 %r8196, %r12;
- add.s32 %r8195, %r13, -15;
- setp.ge.s32 %p1821, %r8195, %r8196;
- @%p1821 bra $L__BB0_40;
-
- shl.b32 %r39, %r12, 5;
- mov.u32 %r2765, -32;
- sub.s32 %r2766, %r2765, %r39;
- setp.ge.s32 %p57, %r14, %r2766;
- @%p57 bra $L__BB0_40;
-
- add.s32 %r2767, %r39, %r1;
- add.s32 %r2768, %r2767, 32;
- mul.hi.s32 %r2769, %r2768, 954437177;
- shr.u32 %r2770, %r2769, 31;
- shr.s32 %r2771, %r2769, 1;
- add.s32 %r2772, %r2771, %r2770;
- mul.lo.s32 %r2773, %r2772, 9;
- sub.s32 %r2774, %r2768, %r2773;
- mad.lo.s32 %r2775, %r2772, %r2605, %r37;
- mad.lo.s32 %r2776, %r2774, %r2606, %r2775;
- mul.wide.s32 %rd635, %r2776, 4;
- add.s64 %rd636, %rd2, %rd635;
- ld.global.f32 %f5345, [%rd636];
-
-$L__BB0_40:
- mov.u32 %r8189, -2;
- sub.s32 %r8188, %r8189, %r12;
- add.s32 %r8187, %r13, -15;
- setp.ge.s32 %p1818, %r8187, %r8188;
- add.s32 %r40, %r37, %r2603;
- @%p1818 bra $L__BB0_43;
-
- shl.b32 %r41, %r12, 5;
- neg.s32 %r2778, %r41;
- setp.ge.s32 %p59, %r14, %r2778;
- @%p59 bra $L__BB0_43;
-
- add.s32 %r2779, %r41, %r1;
- mul.hi.s32 %r2780, %r2779, 954437177;
- shr.u32 %r2781, %r2780, 31;
- shr.s32 %r2782, %r2780, 1;
- add.s32 %r2783, %r2782, %r2781;
- mul.lo.s32 %r2784, %r2783, 9;
- sub.s32 %r2785, %r2779, %r2784;
- mad.lo.s32 %r2786, %r2783, %r2605, %r40;
- mad.lo.s32 %r2787, %r2785, %r2606, %r2786;
- mul.wide.s32 %rd637, %r2787, 4;
- add.s64 %rd638, %rd2, %rd637;
- ld.global.f32 %f5344, [%rd638];
+ neg.s32 %r2739, %r30;
+ setp.ge.s32 %p54, %r11, %r2739;
+ @%p54 bra $L__BB0_36;
+
+ mov.u32 %r7946, %ctaid.x;
+ mul.lo.s32 %r7945, %r2578, %r7946;
+ add.s32 %r7944, %r7945, %r2577;
+ add.s32 %r7943, %r7944, %r2577;
+ shl.b32 %r7810, %r12, 5;
+ add.s32 %r2740, %r7810, %r1;
+ mul.hi.s32 %r2741, %r2740, 954437177;
+ shr.u32 %r2742, %r2741, 31;
+ shr.s32 %r2743, %r2741, 1;
+ add.s32 %r2744, %r2743, %r2742;
+ mad.lo.s32 %r2745, %r2744, %r2579, %r7943;
+ mul.lo.s32 %r2746, %r2744, 9;
+ sub.s32 %r2747, %r2740, %r2746;
+ mad.lo.s32 %r2748, %r2747, %r2580, %r2745;
+ mul.wide.s32 %rd613, %r2748, 4;
+ add.s64 %rd614, %rd2, %rd613;
+ ld.global.f32 %f5412, [%rd614];
+
+$L__BB0_36:
+ shl.b32 %r7808, %r12, 5;
+ mov.u32 %r2749, -32;
+ sub.s32 %r2750, %r2749, %r7808;
+ setp.ge.s32 %p55, %r11, %r2750;
+ @%p55 bra $L__BB0_38;
+
+ mov.u32 %r7814, %ctaid.x;
+ mul.lo.s32 %r7813, %r2578, %r7814;
+ add.s32 %r7812, %r7813, %r2577;
+ add.s32 %r7811, %r7812, %r2577;
+ shl.b32 %r7809, %r12, 5;
+ add.s32 %r2751, %r7809, %r1;
+ add.s32 %r2752, %r2751, 32;
+ mul.hi.s32 %r2753, %r2752, 954437177;
+ shr.u32 %r2754, %r2753, 31;
+ shr.s32 %r2755, %r2753, 1;
+ add.s32 %r2756, %r2755, %r2754;
+ mad.lo.s32 %r2757, %r2756, %r2579, %r7811;
+ mul.lo.s32 %r2758, %r2756, 9;
+ sub.s32 %r2759, %r2752, %r2758;
+ mad.lo.s32 %r2760, %r2759, %r2580, %r2757;
+ mul.wide.s32 %rd615, %r2760, 4;
+ add.s64 %rd616, %rd2, %rd615;
+ ld.global.f32 %f5411, [%rd616];
+
+$L__BB0_38:
+ add.s32 %r7797, %r12, 3;
+ setp.gt.s32 %p1770, %r7797, 14;
+ @%p1770 bra $L__BB0_43;
+
+ shl.b32 %r32, %r12, 5;
+ neg.s32 %r2761, %r32;
+ setp.ge.s32 %p57, %r11, %r2761;
+ @%p57 bra $L__BB0_41;
+
+ mov.u32 %r7949, %ctaid.x;
+ mul.lo.s32 %r7948, %r2578, %r7949;
+ add.s32 %r7947, %r2576, %r7948;
+ shl.b32 %r7817, %r12, 5;
+ add.s32 %r2762, %r7817, %r1;
+ mul.hi.s32 %r2763, %r2762, 954437177;
+ shr.u32 %r2764, %r2763, 31;
+ shr.s32 %r2765, %r2763, 1;
+ add.s32 %r2766, %r2765, %r2764;
+ mad.lo.s32 %r2767, %r2766, %r2579, %r7947;
+ mul.lo.s32 %r2768, %r2766, 9;
+ sub.s32 %r2769, %r2762, %r2768;
+ mad.lo.s32 %r2770, %r2769, %r2580, %r2767;
+ mul.wide.s32 %rd617, %r2770, 4;
+ add.s64 %rd618, %rd2, %rd617;
+ ld.global.f32 %f5410, [%rd618];
+
+$L__BB0_41:
+ shl.b32 %r7815, %r12, 5;
+ mov.u32 %r2771, -32;
+ sub.s32 %r2772, %r2771, %r7815;
+ setp.ge.s32 %p58, %r11, %r2772;
+ @%p58 bra $L__BB0_43;
+
+ mov.u32 %r7820, %ctaid.x;
+ mul.lo.s32 %r7819, %r2578, %r7820;
+ add.s32 %r7818, %r2576, %r7819;
+ shl.b32 %r7816, %r12, 5;
+ add.s32 %r2773, %r7816, %r1;
+ add.s32 %r2774, %r2773, 32;
+ mul.hi.s32 %r2775, %r2774, 954437177;
+ shr.u32 %r2776, %r2775, 31;
+ shr.s32 %r2777, %r2775, 1;
+ add.s32 %r2778, %r2777, %r2776;
+ mad.lo.s32 %r2779, %r2778, %r2579, %r7818;
+ mul.lo.s32 %r2780, %r2778, 9;
+ sub.s32 %r2781, %r2774, %r2780;
+ mad.lo.s32 %r2782, %r2781, %r2580, %r2779;
+ mul.wide.s32 %rd619, %r2782, 4;
+ add.s64 %rd620, %rd2, %rd619;
+ ld.global.f32 %f5409, [%rd620];
$L__BB0_43:
- mov.u32 %r8060, -2;
- sub.s32 %r8059, %r8060, %r12;
- add.s32 %r8058, %r13, -15;
- setp.ge.s32 %p1802, %r8058, %r8059;
- @%p1802 bra $L__BB0_46;
-
- shl.b32 %r42, %r12, 5;
- mov.u32 %r2789, -32;
- sub.s32 %r2790, %r2789, %r42;
- setp.ge.s32 %p61, %r14, %r2790;
- @%p61 bra $L__BB0_46;
-
- add.s32 %r2791, %r42, %r1;
- add.s32 %r2792, %r2791, 32;
- mul.hi.s32 %r2793, %r2792, 954437177;
- shr.u32 %r2794, %r2793, 31;
- shr.s32 %r2795, %r2793, 1;
- add.s32 %r2796, %r2795, %r2794;
- mul.lo.s32 %r2797, %r2796, 9;
- sub.s32 %r2798, %r2792, %r2797;
- mad.lo.s32 %r2799, %r2796, %r2605, %r40;
- mad.lo.s32 %r2800, %r2798, %r2606, %r2799;
- mul.wide.s32 %rd639, %r2800, 4;
- add.s64 %rd640, %rd2, %rd639;
- ld.global.f32 %f5343, [%rd640];
-
-$L__BB0_46:
- mov.u32 %r8192, -3;
- sub.s32 %r8191, %r8192, %r12;
- add.s32 %r8190, %r13, -15;
- setp.ge.s32 %p1819, %r8190, %r8191;
- mul.lo.s32 %r8055, %r13, %r2603;
- mov.u32 %r8054, %ctaid.x;
- mul.lo.s32 %r8053, %r2604, %r8054;
- add.s32 %r2802, %r2602, %r8053;
- add.s32 %r43, %r2802, %r8055;
- @%p1819 bra $L__BB0_49;
-
- shl.b32 %r44, %r12, 5;
- neg.s32 %r2803, %r44;
- setp.ge.s32 %p63, %r14, %r2803;
- @%p63 bra $L__BB0_49;
-
- add.s32 %r2804, %r44, %r1;
- mul.hi.s32 %r2805, %r2804, 954437177;
- shr.u32 %r2806, %r2805, 31;
- shr.s32 %r2807, %r2805, 1;
- add.s32 %r2808, %r2807, %r2806;
- mul.lo.s32 %r2809, %r2808, 9;
- sub.s32 %r2810, %r2804, %r2809;
- mad.lo.s32 %r2811, %r2808, %r2605, %r43;
- mad.lo.s32 %r2812, %r2810, %r2606, %r2811;
- mul.wide.s32 %rd641, %r2812, 4;
- add.s64 %rd642, %rd2, %rd641;
- ld.global.f32 %f5342, [%rd642];
-
-$L__BB0_49:
- mov.u32 %r8063, -3;
- sub.s32 %r8062, %r8063, %r12;
- add.s32 %r8061, %r13, -15;
- setp.ge.s32 %p1803, %r8061, %r8062;
- @%p1803 bra $L__BB0_52;
-
- shl.b32 %r45, %r12, 5;
- mov.u32 %r2814, -32;
- sub.s32 %r2815, %r2814, %r45;
- setp.ge.s32 %p65, %r14, %r2815;
- @%p65 bra $L__BB0_52;
-
- add.s32 %r2816, %r45, %r1;
- add.s32 %r2817, %r2816, 32;
- mul.hi.s32 %r2818, %r2817, 954437177;
- shr.u32 %r2819, %r2818, 31;
- shr.s32 %r2820, %r2818, 1;
- add.s32 %r2821, %r2820, %r2819;
- mul.lo.s32 %r2822, %r2821, 9;
- sub.s32 %r2823, %r2817, %r2822;
- mad.lo.s32 %r2824, %r2821, %r2605, %r43;
- mad.lo.s32 %r2825, %r2823, %r2606, %r2824;
- mul.wide.s32 %rd643, %r2825, 4;
- add.s64 %rd644, %rd2, %rd643;
- ld.global.f32 %f5341, [%rd644];
+ setp.gt.s32 %p1768, %r12, 14;
+ @%p1768 bra $L__BB0_71;
+
+ shl.b32 %r2783, %r12, 5;
+ neg.s32 %r2784, %r2783;
+ setp.ge.s32 %p60, %r11, %r2784;
+ @%p60 bra $L__BB0_57;
+
+ mul.f32 %f2315, %f5416, 0f3F22F983;
+ cvt.rni.s32.f32 %r7970, %f2315;
+ cvt.rn.f32.s32 %f2316, %r7970;
+ mov.f32 %f2317, 0fBFC90FDA;
+ fma.rn.f32 %f2318, %f2316, %f2317, %f5416;
+ mov.f32 %f2319, 0fB3A22168;
+ fma.rn.f32 %f2320, %f2316, %f2319, %f2318;
+ mov.f32 %f2321, 0fA7C234C5;
+ fma.rn.f32 %f5248, %f2316, %f2321, %f2320;
+ abs.f32 %f42, %f5416;
+ setp.ltu.f32 %p61, %f42, 0f47CE4780;
+ @%p61 bra $L__BB0_53;
+
+ setp.eq.f32 %p62, %f42, 0f7F800000;
+ @%p62 bra $L__BB0_52;
+ bra.uni $L__BB0_47;
$L__BB0_52:
- neg.s32 %r8057, %r12;
- add.s32 %r8056, %r13, -15;
- setp.ge.s32 %p1801, %r8056, %r8057;
- @%p1801 bra $L__BB0_80;
-
- shl.b32 %r2827, %r12, 5;
- neg.s32 %r2828, %r2827;
- setp.ge.s32 %p67, %r14, %r2828;
- @%p67 bra $L__BB0_66;
-
- mul.f32 %f2259, %f5348, 0f3F22F983;
- cvt.rni.s32.f32 %r8229, %f2259;
- cvt.rn.f32.s32 %f2260, %r8229;
- mov.f32 %f2261, 0fBFC90FDA;
- fma.rn.f32 %f2262, %f2260, %f2261, %f5348;
- mov.f32 %f2263, 0fB3A22168;
- fma.rn.f32 %f2264, %f2260, %f2263, %f2262;
- mov.f32 %f2265, 0fA7C234C5;
- fma.rn.f32 %f5180, %f2260, %f2265, %f2264;
- abs.f32 %f34, %f5348;
- setp.ltu.f32 %p68, %f34, 0f47CE4780;
- @%p68 bra $L__BB0_62;
-
- setp.eq.f32 %p69, %f34, 0f7F800000;
- @%p69 bra $L__BB0_61;
- bra.uni $L__BB0_56;
-
-$L__BB0_61:
- mov.f32 %f2268, 0f00000000;
- mul.rn.f32 %f5180, %f5348, %f2268;
- mov.u32 %r8229, 0;
- bra.uni $L__BB0_62;
-
-$L__BB0_285:
- mov.b32 %r348, %f304;
- shr.u32 %r3413, %r348, 23;
+ mov.f32 %f2324, 0f00000000;
+ mul.rn.f32 %f5248, %f5416, %f2324;
+ mov.u32 %r7970, 0;
+ bra.uni $L__BB0_53;
+
+$L__BB0_276:
+ mov.b32 %r335, %f312;
+ shr.u32 %r3351, %r335, 23;
+ and.b32 %r3352, %r3351, 255;
+ add.s32 %r336, %r3352, -128;
+ shl.b32 %r3353, %r335, 8;
+ or.b32 %r337, %r3353, -2147483648;
+ shr.u32 %r338, %r336, 5;
+ mov.u64 %rd2537, 0;
+ mov.u32 %r8031, 0;
+ mov.u64 %rd872, __cudart_i2opi_f;
+ mov.u64 %rd2538, %rd2537;
+
+$L__BB0_277:
+ .pragma "nounroll";
+ shl.b64 %rd871, %rd2537, 2;
+ add.s64 %rd873, %rd872, %rd871;
+ ld.global.nc.u32 %r3354, [%rd873];
+ mad.wide.u32 %rd874, %r3354, %r337, %rd2538;
+ shr.u64 %rd2538, %rd874, 32;
+ add.s64 %rd875, %rd1, %rd871;
+ st.local.u32 [%rd875], %rd874;
+ add.s32 %r8031, %r8031, 1;
+ cvt.s64.s32 %rd2537, %r8031;
+ setp.ne.s32 %p259, %r8031, 6;
+ @%p259 bra $L__BB0_277;
+
+ st.local.u32 [%rd4], %rd2538;
+ mov.u32 %r3355, 4;
+ sub.s32 %r341, %r3355, %r338;
+ mov.u32 %r3356, 6;
+ sub.s32 %r3357, %r3356, %r338;
+ mul.wide.s32 %rd876, %r3357, 4;
+ add.s64 %rd877, %rd1, %rd876;
+ ld.local.u32 %r8032, [%rd877];
+ ld.local.u32 %r8033, [%rd877+-4];
+ and.b32 %r344, %r336, 31;
+ setp.eq.s32 %p260, %r344, 0;
+ @%p260 bra $L__BB0_280;
+
+ mov.u32 %r3358, 32;
+ sub.s32 %r3359, %r3358, %r344;
+ shr.u32 %r3360, %r8033, %r3359;
+ shl.b32 %r3361, %r8032, %r344;
+ add.s32 %r8032, %r3360, %r3361;
+ mul.wide.s32 %rd878, %r341, 4;
+ add.s64 %rd879, %rd1, %rd878;
+ ld.local.u32 %r3362, [%rd879];
+ shr.u32 %r3363, %r3362, %r3359;
+ shl.b32 %r3364, %r8033, %r344;
+ add.s32 %r8033, %r3363, %r3364;
+
+$L__BB0_280:
+ and.b32 %r3365, %r335, -2147483648;
+ shr.u32 %r3366, %r8033, 30;
+ shl.b32 %r3367, %r8032, 2;
+ or.b32 %r3368, %r3366, %r3367;
+ shr.u32 %r3369, %r3368, 31;
+ shr.u32 %r3370, %r8032, 30;
+ add.s32 %r3371, %r3369, %r3370;
+ neg.s32 %r3372, %r3371;
+ setp.eq.s32 %p261, %r3365, 0;
+ selp.b32 %r8034, %r3371, %r3372, %p261;
+ setp.ne.s32 %p262, %r3369, 0;
+ xor.b32 %r3373, %r3365, -2147483648;
+ selp.b32 %r3374, %r3373, %r3365, %p262;
+ selp.b32 %r3375, -1, 0, %p262;
+ xor.b32 %r3376, %r3368, %r3375;
+ shl.b32 %r3377, %r8033, 2;
+ xor.b32 %r3378, %r3377, %r3375;
+ cvt.u64.u32 %rd880, %r3376;
+ cvt.u64.u32 %rd881, %r3378;
+ bfi.b64 %rd882, %rd880, %rd881, 32, 32;
+ cvt.rn.f64.s64 %fd33, %rd882;
+ mul.f64 %fd34, %fd33, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2673, %fd34;
+ setp.eq.s32 %p263, %r3374, 0;
+ neg.f32 %f2674, %f2673;
+ selp.f32 %f5345, %f2673, %f2674, %p263;
+
+$L__BB0_282:
+ and.b32 %r351, %r8034, 1;
+ setp.eq.s32 %p264, %r351, 0;
+ selp.f32 %f325, %f5345, 0f3F800000, %p264;
+ mul.rn.f32 %f326, %f5345, %f5345;
+ mov.f32 %f5346, 0fB94D4153;
+ @%p264 bra $L__BB0_284;
+
+ mov.f32 %f2677, 0fBAB607ED;
+ mov.f32 %f2678, 0f37CBAC00;
+ fma.rn.f32 %f5346, %f2678, %f326, %f2677;
+
+$L__BB0_284:
+ selp.f32 %f2679, 0f3C0885E4, 0f3D2AAABB, %p264;
+ fma.rn.f32 %f2680, %f5346, %f326, %f2679;
+ selp.f32 %f2681, 0fBE2AAAA8, 0fBEFFFFFF, %p264;
+ fma.rn.f32 %f2682, %f2680, %f326, %f2681;
+ mov.f32 %f2683, 0f00000000;
+ fma.rn.f32 %f2684, %f326, %f325, %f2683;
+ fma.rn.f32 %f5347, %f2682, %f2684, %f325;
+ and.b32 %r3380, %r8034, 2;
+ setp.eq.s32 %p266, %r3380, 0;
+ @%p266 bra $L__BB0_286;
+
+ mov.f32 %f2686, 0fBF800000;
+ fma.rn.f32 %f5347, %f5347, %f2686, %f2683;
+
+$L__BB0_286:
+ mul.f32 %f2687, %f304, 0f3F22F983;
+ cvt.rni.s32.f32 %r8038, %f2687;
+ cvt.rn.f32.s32 %f2688, %r8038;
+ mov.f32 %f2689, 0fBFC90FDA;
+ fma.rn.f32 %f2690, %f2688, %f2689, %f304;
+ mov.f32 %f2691, 0fB3A22168;
+ fma.rn.f32 %f2692, %f2688, %f2691, %f2690;
+ mov.f32 %f2693, 0fA7C234C5;
+ fma.rn.f32 %f5348, %f2688, %f2693, %f2692;
+ abs.f32 %f333, %f304;
+ setp.ltu.f32 %p267, %f333, 0f47CE4780;
+ @%p267 bra $L__BB0_294;
+
+ setp.eq.f32 %p268, %f333, 0f7F800000;
+ @%p268 bra $L__BB0_293;
+ bra.uni $L__BB0_288;
+
+$L__BB0_293:
+ mov.f32 %f2696, 0f00000000;
+ mul.rn.f32 %f5348, %f304, %f2696;
+ mov.u32 %r8038, 0;
+ bra.uni $L__BB0_294;
+
+$L__BB0_288:
+ mov.b32 %r353, %f304;
+ shr.u32 %r3382, %r353, 23;
+ and.b32 %r3383, %r3382, 255;
+ add.s32 %r354, %r3383, -128;
+ shl.b32 %r3384, %r353, 8;
+ or.b32 %r355, %r3384, -2147483648;
+ shr.u32 %r356, %r354, 5;
+ mov.u64 %rd2539, 0;
+ mov.u32 %r8035, 0;
+ mov.u64 %rd886, __cudart_i2opi_f;
+ mov.u64 %rd2540, %rd2539;
+
+$L__BB0_289:
+ .pragma "nounroll";
+ shl.b64 %rd885, %rd2539, 2;
+ add.s64 %rd887, %rd886, %rd885;
+ ld.global.nc.u32 %r3385, [%rd887];
+ mad.wide.u32 %rd888, %r3385, %r355, %rd2540;
+ shr.u64 %rd2540, %rd888, 32;
+ add.s64 %rd889, %rd1, %rd885;
+ st.local.u32 [%rd889], %rd888;
+ add.s32 %r8035, %r8035, 1;
+ cvt.s64.s32 %rd2539, %r8035;
+ setp.ne.s32 %p269, %r8035, 6;
+ @%p269 bra $L__BB0_289;
+
+ st.local.u32 [%rd4], %rd2540;
+ mov.u32 %r3386, 4;
+ sub.s32 %r359, %r3386, %r356;
+ mov.u32 %r3387, 6;
+ sub.s32 %r3388, %r3387, %r356;
+ mul.wide.s32 %rd890, %r3388, 4;
+ add.s64 %rd891, %rd1, %rd890;
+ ld.local.u32 %r8036, [%rd891];
+ ld.local.u32 %r8037, [%rd891+-4];
+ and.b32 %r362, %r354, 31;
+ setp.eq.s32 %p270, %r362, 0;
+ @%p270 bra $L__BB0_292;
+
+ mov.u32 %r3389, 32;
+ sub.s32 %r3390, %r3389, %r362;
+ shr.u32 %r3391, %r8037, %r3390;
+ shl.b32 %r3392, %r8036, %r362;
+ add.s32 %r8036, %r3391, %r3392;
+ mul.wide.s32 %rd892, %r359, 4;
+ add.s64 %rd893, %rd1, %rd892;
+ ld.local.u32 %r3393, [%rd893];
+ shr.u32 %r3394, %r3393, %r3390;
+ shl.b32 %r3395, %r8037, %r362;
+ add.s32 %r8037, %r3394, %r3395;
+
+$L__BB0_292:
+ and.b32 %r3396, %r353, -2147483648;
+ shr.u32 %r3397, %r8037, 30;
+ shl.b32 %r3398, %r8036, 2;
+ or.b32 %r3399, %r3397, %r3398;
+ shr.u32 %r3400, %r3399, 31;
+ shr.u32 %r3401, %r8036, 30;
+ add.s32 %r3402, %r3400, %r3401;
+ neg.s32 %r3403, %r3402;
+ setp.eq.s32 %p271, %r3396, 0;
+ selp.b32 %r8038, %r3402, %r3403, %p271;
+ setp.ne.s32 %p272, %r3400, 0;
+ xor.b32 %r3404, %r3396, -2147483648;
+ selp.b32 %r3405, %r3404, %r3396, %p272;
+ selp.b32 %r3406, -1, 0, %p272;
+ xor.b32 %r3407, %r3399, %r3406;
+ shl.b32 %r3408, %r8037, 2;
+ xor.b32 %r3409, %r3408, %r3406;
+ cvt.u64.u32 %rd894, %r3407;
+ cvt.u64.u32 %rd895, %r3409;
+ bfi.b64 %rd896, %rd894, %rd895, 32, 32;
+ cvt.rn.f64.s64 %fd35, %rd896;
+ mul.f64 %fd36, %fd35, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2694, %fd36;
+ setp.eq.s32 %p273, %r3405, 0;
+ neg.f32 %f2695, %f2694;
+ selp.f32 %f5348, %f2694, %f2695, %p273;
+
+$L__BB0_294:
+ add.s32 %r369, %r8038, 1;
+ and.b32 %r370, %r369, 1;
+ setp.eq.s32 %p274, %r370, 0;
+ selp.f32 %f337, %f5348, 0f3F800000, %p274;
+ mul.rn.f32 %f338, %f5348, %f5348;
+ mov.f32 %f5349, 0fB94D4153;
+ @%p274 bra $L__BB0_296;
+
+ mov.f32 %f2698, 0fBAB607ED;
+ mov.f32 %f2699, 0f37CBAC00;
+ fma.rn.f32 %f5349, %f2699, %f338, %f2698;
+
+$L__BB0_296:
+ selp.f32 %f2700, 0f3C0885E4, 0f3D2AAABB, %p274;
+ fma.rn.f32 %f2701, %f5349, %f338, %f2700;
+ selp.f32 %f2702, 0fBE2AAAA8, 0fBEFFFFFF, %p274;
+ fma.rn.f32 %f2703, %f2701, %f338, %f2702;
+ mov.f32 %f2704, 0f00000000;
+ fma.rn.f32 %f2705, %f338, %f337, %f2704;
+ fma.rn.f32 %f5350, %f2703, %f2705, %f337;
+ and.b32 %r3411, %r369, 2;
+ setp.eq.s32 %p276, %r3411, 0;
+ @%p276 bra $L__BB0_298;
+
+ mov.f32 %f2707, 0fBF800000;
+ fma.rn.f32 %f5350, %f5350, %f2707, %f2704;
+
+$L__BB0_298:
+ add.f32 %f5400, %f5347, %f5350;
+ mul.f32 %f2708, %f313, 0f3F22F983;
+ cvt.rni.s32.f32 %r8042, %f2708;
+ cvt.rn.f32.s32 %f2709, %r8042;
+ mov.f32 %f2710, 0fBFC90FDA;
+ fma.rn.f32 %f2711, %f2709, %f2710, %f313;
+ mov.f32 %f2712, 0fB3A22168;
+ fma.rn.f32 %f2713, %f2709, %f2712, %f2711;
+ mov.f32 %f2714, 0fA7C234C5;
+ fma.rn.f32 %f5351, %f2709, %f2714, %f2713;
+ abs.f32 %f346, %f313;
+ setp.ltu.f32 %p277, %f346, 0f47CE4780;
+ @%p277 bra $L__BB0_306;
+
+ setp.eq.f32 %p278, %f346, 0f7F800000;
+ @%p278 bra $L__BB0_305;
+ bra.uni $L__BB0_300;
+
+$L__BB0_305:
+ mov.f32 %f2717, 0f00000000;
+ mul.rn.f32 %f5351, %f313, %f2717;
+ mov.u32 %r8042, 0;
+ bra.uni $L__BB0_306;
+
+$L__BB0_300:
+ mov.b32 %r372, %f313;
+ shr.u32 %r3413, %r372, 23;
and.b32 %r3414, %r3413, 255;
- add.s32 %r349, %r3414, -128;
- shl.b32 %r3415, %r348, 8;
- or.b32 %r350, %r3415, -2147483648;
- shr.u32 %r351, %r349, 5;
- mov.u64 %rd2530, 0;
- mov.u32 %r8290, 0;
- mov.u64 %rd896, __cudart_i2opi_f;
- mov.u64 %rd2531, %rd2530;
-
-$L__BB0_286:
+ add.s32 %r373, %r3414, -128;
+ shl.b32 %r3415, %r372, 8;
+ or.b32 %r374, %r3415, -2147483648;
+ shr.u32 %r375, %r373, 5;
+ mov.u64 %rd2541, 0;
+ mov.u32 %r8039, 0;
+ mov.u64 %rd900, __cudart_i2opi_f;
+ mov.u64 %rd2542, %rd2541;
+
+$L__BB0_301:
.pragma "nounroll";
- shl.b64 %rd895, %rd2530, 2;
- add.s64 %rd897, %rd896, %rd895;
- ld.global.nc.u32 %r3416, [%rd897];
- mad.wide.u32 %rd898, %r3416, %r350, %rd2531;
- shr.u64 %rd2531, %rd898, 32;
- add.s64 %rd899, %rd1, %rd895;
- st.local.u32 [%rd899], %rd898;
- add.s32 %r8290, %r8290, 1;
- cvt.s64.s32 %rd2530, %r8290;
- setp.ne.s32 %p266, %r8290, 6;
- @%p266 bra $L__BB0_286;
-
- st.local.u32 [%rd5], %rd2531;
+ shl.b64 %rd899, %rd2541, 2;
+ add.s64 %rd901, %rd900, %rd899;
+ ld.global.nc.u32 %r3416, [%rd901];
+ mad.wide.u32 %rd902, %r3416, %r374, %rd2542;
+ shr.u64 %rd2542, %rd902, 32;
+ add.s64 %rd903, %rd1, %rd899;
+ st.local.u32 [%rd903], %rd902;
+ add.s32 %r8039, %r8039, 1;
+ cvt.s64.s32 %rd2541, %r8039;
+ setp.ne.s32 %p279, %r8039, 6;
+ @%p279 bra $L__BB0_301;
+
+ st.local.u32 [%rd4], %rd2542;
mov.u32 %r3417, 4;
- sub.s32 %r354, %r3417, %r351;
+ sub.s32 %r378, %r3417, %r375;
mov.u32 %r3418, 6;
- sub.s32 %r3419, %r3418, %r351;
- mul.wide.s32 %rd900, %r3419, 4;
- add.s64 %rd901, %rd1, %rd900;
- ld.local.u32 %r8291, [%rd901];
- ld.local.u32 %r8292, [%rd901+-4];
- and.b32 %r357, %r349, 31;
- setp.eq.s32 %p267, %r357, 0;
- @%p267 bra $L__BB0_289;
+ sub.s32 %r3419, %r3418, %r375;
+ mul.wide.s32 %rd904, %r3419, 4;
+ add.s64 %rd905, %rd1, %rd904;
+ ld.local.u32 %r8040, [%rd905];
+ ld.local.u32 %r8041, [%rd905+-4];
+ and.b32 %r381, %r373, 31;
+ setp.eq.s32 %p280, %r381, 0;
+ @%p280 bra $L__BB0_304;
mov.u32 %r3420, 32;
- sub.s32 %r3421, %r3420, %r357;
- shr.u32 %r3422, %r8292, %r3421;
- shl.b32 %r3423, %r8291, %r357;
- add.s32 %r8291, %r3422, %r3423;
- mul.wide.s32 %rd902, %r354, 4;
- add.s64 %rd903, %rd1, %rd902;
- ld.local.u32 %r3424, [%rd903];
+ sub.s32 %r3421, %r3420, %r381;
+ shr.u32 %r3422, %r8041, %r3421;
+ shl.b32 %r3423, %r8040, %r381;
+ add.s32 %r8040, %r3422, %r3423;
+ mul.wide.s32 %rd906, %r378, 4;
+ add.s64 %rd907, %rd1, %rd906;
+ ld.local.u32 %r3424, [%rd907];
shr.u32 %r3425, %r3424, %r3421;
- shl.b32 %r3426, %r8292, %r357;
- add.s32 %r8292, %r3425, %r3426;
-
-$L__BB0_289:
- and.b32 %r3427, %r348, -2147483648;
- shr.u32 %r3428, %r8292, 30;
- shl.b32 %r3429, %r8291, 2;
+ shl.b32 %r3426, %r8041, %r381;
+ add.s32 %r8041, %r3425, %r3426;
+
+$L__BB0_304:
+ and.b32 %r3427, %r372, -2147483648;
+ shr.u32 %r3428, %r8041, 30;
+ shl.b32 %r3429, %r8040, 2;
or.b32 %r3430, %r3428, %r3429;
shr.u32 %r3431, %r3430, 31;
- shr.u32 %r3432, %r8291, 30;
+ shr.u32 %r3432, %r8040, 30;
add.s32 %r3433, %r3431, %r3432;
neg.s32 %r3434, %r3433;
- setp.eq.s32 %p268, %r3427, 0;
- selp.b32 %r8293, %r3433, %r3434, %p268;
- setp.ne.s32 %p269, %r3431, 0;
+ setp.eq.s32 %p281, %r3427, 0;
+ selp.b32 %r8042, %r3433, %r3434, %p281;
+ setp.ne.s32 %p282, %r3431, 0;
xor.b32 %r3435, %r3427, -2147483648;
- selp.b32 %r3436, %r3435, %r3427, %p269;
- selp.b32 %r3437, -1, 0, %p269;
+ selp.b32 %r3436, %r3435, %r3427, %p282;
+ selp.b32 %r3437, -1, 0, %p282;
xor.b32 %r3438, %r3430, %r3437;
- shl.b32 %r3439, %r8292, 2;
+ shl.b32 %r3439, %r8041, 2;
xor.b32 %r3440, %r3439, %r3437;
- cvt.u64.u32 %rd904, %r3438;
- cvt.u64.u32 %rd905, %r3440;
- bfi.b64 %rd906, %rd904, %rd905, 32, 32;
- cvt.rn.f64.s64 %fd33, %rd906;
- mul.f64 %fd34, %fd33, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2617, %fd34;
- setp.eq.s32 %p270, %r3436, 0;
- neg.f32 %f2618, %f2617;
- selp.f32 %f5277, %f2617, %f2618, %p270;
-
-$L__BB0_291:
- and.b32 %r364, %r8293, 1;
- setp.eq.s32 %p271, %r364, 0;
- selp.f32 %f317, %f5277, 0f3F800000, %p271;
- mul.rn.f32 %f318, %f5277, %f5277;
- mov.f32 %f5278, 0fB94D4153;
- @%p271 bra $L__BB0_293;
-
- mov.f32 %f2621, 0fBAB607ED;
- mov.f32 %f2622, 0f37CBAC00;
- fma.rn.f32 %f5278, %f2622, %f318, %f2621;
-
-$L__BB0_293:
- selp.f32 %f2623, 0f3C0885E4, 0f3D2AAABB, %p271;
- fma.rn.f32 %f2624, %f5278, %f318, %f2623;
- selp.f32 %f2625, 0fBE2AAAA8, 0fBEFFFFFF, %p271;
- fma.rn.f32 %f2626, %f2624, %f318, %f2625;
- mov.f32 %f2627, 0f00000000;
- fma.rn.f32 %f2628, %f318, %f317, %f2627;
- fma.rn.f32 %f5279, %f2626, %f2628, %f317;
- and.b32 %r3442, %r8293, 2;
- setp.eq.s32 %p273, %r3442, 0;
- @%p273 bra $L__BB0_295;
-
- mov.f32 %f2630, 0fBF800000;
- fma.rn.f32 %f5279, %f5279, %f2630, %f2627;
-
-$L__BB0_295:
- mul.f32 %f2631, %f296, 0f3F22F983;
- cvt.rni.s32.f32 %r8297, %f2631;
- cvt.rn.f32.s32 %f2632, %r8297;
- mov.f32 %f2633, 0fBFC90FDA;
- fma.rn.f32 %f2634, %f2632, %f2633, %f296;
- mov.f32 %f2635, 0fB3A22168;
- fma.rn.f32 %f2636, %f2632, %f2635, %f2634;
- mov.f32 %f2637, 0fA7C234C5;
- fma.rn.f32 %f5280, %f2632, %f2637, %f2636;
- abs.f32 %f325, %f296;
- setp.ltu.f32 %p274, %f325, 0f47CE4780;
- @%p274 bra $L__BB0_303;
-
- setp.eq.f32 %p275, %f325, 0f7F800000;
- @%p275 bra $L__BB0_302;
- bra.uni $L__BB0_297;
-
-$L__BB0_302:
- mov.f32 %f2640, 0f00000000;
- mul.rn.f32 %f5280, %f296, %f2640;
- mov.u32 %r8297, 0;
- bra.uni $L__BB0_303;
-
-$L__BB0_297:
- mov.b32 %r366, %f296;
- shr.u32 %r3444, %r366, 23;
+ cvt.u64.u32 %rd908, %r3438;
+ cvt.u64.u32 %rd909, %r3440;
+ bfi.b64 %rd910, %rd908, %rd909, 32, 32;
+ cvt.rn.f64.s64 %fd37, %rd910;
+ mul.f64 %fd38, %fd37, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2715, %fd38;
+ setp.eq.s32 %p283, %r3436, 0;
+ neg.f32 %f2716, %f2715;
+ selp.f32 %f5351, %f2715, %f2716, %p283;
+
+$L__BB0_306:
+ and.b32 %r388, %r8042, 1;
+ setp.eq.s32 %p284, %r388, 0;
+ selp.f32 %f350, %f5351, 0f3F800000, %p284;
+ mul.rn.f32 %f351, %f5351, %f5351;
+ mov.f32 %f5352, 0fB94D4153;
+ @%p284 bra $L__BB0_308;
+
+ mov.f32 %f2719, 0fBAB607ED;
+ mov.f32 %f2720, 0f37CBAC00;
+ fma.rn.f32 %f5352, %f2720, %f351, %f2719;
+
+$L__BB0_308:
+ selp.f32 %f2721, 0f3C0885E4, 0f3D2AAABB, %p284;
+ fma.rn.f32 %f2722, %f5352, %f351, %f2721;
+ selp.f32 %f2723, 0fBE2AAAA8, 0fBEFFFFFF, %p284;
+ fma.rn.f32 %f2724, %f2722, %f351, %f2723;
+ mov.f32 %f2725, 0f00000000;
+ fma.rn.f32 %f2726, %f351, %f350, %f2725;
+ fma.rn.f32 %f5353, %f2724, %f2726, %f350;
+ and.b32 %r3442, %r8042, 2;
+ setp.eq.s32 %p286, %r3442, 0;
+ @%p286 bra $L__BB0_310;
+
+ mov.f32 %f2728, 0fBF800000;
+ fma.rn.f32 %f5353, %f5353, %f2728, %f2725;
+
+$L__BB0_310:
+ mul.f32 %f2729, %f305, 0f3F22F983;
+ cvt.rni.s32.f32 %r8046, %f2729;
+ cvt.rn.f32.s32 %f2730, %r8046;
+ mov.f32 %f2731, 0fBFC90FDA;
+ fma.rn.f32 %f2732, %f2730, %f2731, %f305;
+ mov.f32 %f2733, 0fB3A22168;
+ fma.rn.f32 %f2734, %f2730, %f2733, %f2732;
+ mov.f32 %f2735, 0fA7C234C5;
+ fma.rn.f32 %f5354, %f2730, %f2735, %f2734;
+ abs.f32 %f358, %f305;
+ setp.ltu.f32 %p287, %f358, 0f47CE4780;
+ @%p287 bra $L__BB0_318;
+
+ setp.eq.f32 %p288, %f358, 0f7F800000;
+ @%p288 bra $L__BB0_317;
+ bra.uni $L__BB0_312;
+
+$L__BB0_317:
+ mov.f32 %f2738, 0f00000000;
+ mul.rn.f32 %f5354, %f305, %f2738;
+ mov.u32 %r8046, 0;
+ bra.uni $L__BB0_318;
+
+$L__BB0_312:
+ mov.b32 %r390, %f305;
+ shr.u32 %r3444, %r390, 23;
and.b32 %r3445, %r3444, 255;
- add.s32 %r367, %r3445, -128;
- shl.b32 %r3446, %r366, 8;
- or.b32 %r368, %r3446, -2147483648;
- shr.u32 %r369, %r367, 5;
- mov.u64 %rd2532, 0;
- mov.u32 %r8294, 0;
- mov.u64 %rd910, __cudart_i2opi_f;
- mov.u64 %rd2533, %rd2532;
-
-$L__BB0_298:
+ add.s32 %r391, %r3445, -128;
+ shl.b32 %r3446, %r390, 8;
+ or.b32 %r392, %r3446, -2147483648;
+ shr.u32 %r393, %r391, 5;
+ mov.u64 %rd2543, 0;
+ mov.u32 %r8043, 0;
+ mov.u64 %rd914, __cudart_i2opi_f;
+ mov.u64 %rd2544, %rd2543;
+
+$L__BB0_313:
.pragma "nounroll";
- shl.b64 %rd909, %rd2532, 2;
- add.s64 %rd911, %rd910, %rd909;
- ld.global.nc.u32 %r3447, [%rd911];
- mad.wide.u32 %rd912, %r3447, %r368, %rd2533;
- shr.u64 %rd2533, %rd912, 32;
- add.s64 %rd913, %rd1, %rd909;
- st.local.u32 [%rd913], %rd912;
- add.s32 %r8294, %r8294, 1;
- cvt.s64.s32 %rd2532, %r8294;
- setp.ne.s32 %p276, %r8294, 6;
- @%p276 bra $L__BB0_298;
-
- st.local.u32 [%rd5], %rd2533;
+ shl.b64 %rd913, %rd2543, 2;
+ add.s64 %rd915, %rd914, %rd913;
+ ld.global.nc.u32 %r3447, [%rd915];
+ mad.wide.u32 %rd916, %r3447, %r392, %rd2544;
+ shr.u64 %rd2544, %rd916, 32;
+ add.s64 %rd917, %rd1, %rd913;
+ st.local.u32 [%rd917], %rd916;
+ add.s32 %r8043, %r8043, 1;
+ cvt.s64.s32 %rd2543, %r8043;
+ setp.ne.s32 %p289, %r8043, 6;
+ @%p289 bra $L__BB0_313;
+
+ st.local.u32 [%rd4], %rd2544;
mov.u32 %r3448, 4;
- sub.s32 %r372, %r3448, %r369;
+ sub.s32 %r396, %r3448, %r393;
mov.u32 %r3449, 6;
- sub.s32 %r3450, %r3449, %r369;
- mul.wide.s32 %rd914, %r3450, 4;
- add.s64 %rd915, %rd1, %rd914;
- ld.local.u32 %r8295, [%rd915];
- ld.local.u32 %r8296, [%rd915+-4];
- and.b32 %r375, %r367, 31;
- setp.eq.s32 %p277, %r375, 0;
- @%p277 bra $L__BB0_301;
+ sub.s32 %r3450, %r3449, %r393;
+ mul.wide.s32 %rd918, %r3450, 4;
+ add.s64 %rd919, %rd1, %rd918;
+ ld.local.u32 %r8044, [%rd919];
+ ld.local.u32 %r8045, [%rd919+-4];
+ and.b32 %r399, %r391, 31;
+ setp.eq.s32 %p290, %r399, 0;
+ @%p290 bra $L__BB0_316;
mov.u32 %r3451, 32;
- sub.s32 %r3452, %r3451, %r375;
- shr.u32 %r3453, %r8296, %r3452;
- shl.b32 %r3454, %r8295, %r375;
- add.s32 %r8295, %r3453, %r3454;
- mul.wide.s32 %rd916, %r372, 4;
- add.s64 %rd917, %rd1, %rd916;
- ld.local.u32 %r3455, [%rd917];
+ sub.s32 %r3452, %r3451, %r399;
+ shr.u32 %r3453, %r8045, %r3452;
+ shl.b32 %r3454, %r8044, %r399;
+ add.s32 %r8044, %r3453, %r3454;
+ mul.wide.s32 %rd920, %r396, 4;
+ add.s64 %rd921, %rd1, %rd920;
+ ld.local.u32 %r3455, [%rd921];
shr.u32 %r3456, %r3455, %r3452;
- shl.b32 %r3457, %r8296, %r375;
- add.s32 %r8296, %r3456, %r3457;
-
-$L__BB0_301:
- and.b32 %r3458, %r366, -2147483648;
- shr.u32 %r3459, %r8296, 30;
- shl.b32 %r3460, %r8295, 2;
+ shl.b32 %r3457, %r8045, %r399;
+ add.s32 %r8045, %r3456, %r3457;
+
+$L__BB0_316:
+ and.b32 %r3458, %r390, -2147483648;
+ shr.u32 %r3459, %r8045, 30;
+ shl.b32 %r3460, %r8044, 2;
or.b32 %r3461, %r3459, %r3460;
shr.u32 %r3462, %r3461, 31;
- shr.u32 %r3463, %r8295, 30;
+ shr.u32 %r3463, %r8044, 30;
add.s32 %r3464, %r3462, %r3463;
neg.s32 %r3465, %r3464;
- setp.eq.s32 %p278, %r3458, 0;
- selp.b32 %r8297, %r3464, %r3465, %p278;
- setp.ne.s32 %p279, %r3462, 0;
+ setp.eq.s32 %p291, %r3458, 0;
+ selp.b32 %r8046, %r3464, %r3465, %p291;
+ setp.ne.s32 %p292, %r3462, 0;
xor.b32 %r3466, %r3458, -2147483648;
- selp.b32 %r3467, %r3466, %r3458, %p279;
- selp.b32 %r3468, -1, 0, %p279;
+ selp.b32 %r3467, %r3466, %r3458, %p292;
+ selp.b32 %r3468, -1, 0, %p292;
xor.b32 %r3469, %r3461, %r3468;
- shl.b32 %r3470, %r8296, 2;
+ shl.b32 %r3470, %r8045, 2;
xor.b32 %r3471, %r3470, %r3468;
- cvt.u64.u32 %rd918, %r3469;
- cvt.u64.u32 %rd919, %r3471;
- bfi.b64 %rd920, %rd918, %rd919, 32, 32;
- cvt.rn.f64.s64 %fd35, %rd920;
- mul.f64 %fd36, %fd35, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2638, %fd36;
- setp.eq.s32 %p280, %r3467, 0;
- neg.f32 %f2639, %f2638;
- selp.f32 %f5280, %f2638, %f2639, %p280;
-
-$L__BB0_303:
- add.s32 %r382, %r8297, 1;
- and.b32 %r383, %r382, 1;
- setp.eq.s32 %p281, %r383, 0;
- selp.f32 %f329, %f5280, 0f3F800000, %p281;
- mul.rn.f32 %f330, %f5280, %f5280;
- mov.f32 %f5281, 0fB94D4153;
- @%p281 bra $L__BB0_305;
-
- mov.f32 %f2642, 0fBAB607ED;
- mov.f32 %f2643, 0f37CBAC00;
- fma.rn.f32 %f5281, %f2643, %f330, %f2642;
-
-$L__BB0_305:
- selp.f32 %f2644, 0f3C0885E4, 0f3D2AAABB, %p281;
- fma.rn.f32 %f2645, %f5281, %f330, %f2644;
- selp.f32 %f2646, 0fBE2AAAA8, 0fBEFFFFFF, %p281;
- fma.rn.f32 %f2647, %f2645, %f330, %f2646;
- mov.f32 %f2648, 0f00000000;
- fma.rn.f32 %f2649, %f330, %f329, %f2648;
- fma.rn.f32 %f5282, %f2647, %f2649, %f329;
- and.b32 %r3473, %r382, 2;
- setp.eq.s32 %p283, %r3473, 0;
- @%p283 bra $L__BB0_307;
-
- mov.f32 %f2651, 0fBF800000;
- fma.rn.f32 %f5282, %f5282, %f2651, %f2648;
-
-$L__BB0_307:
- add.f32 %f5332, %f5279, %f5282;
- mul.f32 %f2652, %f305, 0f3F22F983;
- cvt.rni.s32.f32 %r8301, %f2652;
- cvt.rn.f32.s32 %f2653, %r8301;
- mov.f32 %f2654, 0fBFC90FDA;
- fma.rn.f32 %f2655, %f2653, %f2654, %f305;
- mov.f32 %f2656, 0fB3A22168;
- fma.rn.f32 %f2657, %f2653, %f2656, %f2655;
- mov.f32 %f2658, 0fA7C234C5;
- fma.rn.f32 %f5283, %f2653, %f2658, %f2657;
- abs.f32 %f338, %f305;
- setp.ltu.f32 %p284, %f338, 0f47CE4780;
- @%p284 bra $L__BB0_315;
-
- setp.eq.f32 %p285, %f338, 0f7F800000;
- @%p285 bra $L__BB0_314;
- bra.uni $L__BB0_309;
-
-$L__BB0_314:
- mov.f32 %f2661, 0f00000000;
- mul.rn.f32 %f5283, %f305, %f2661;
- mov.u32 %r8301, 0;
- bra.uni $L__BB0_315;
-
-$L__BB0_309:
- mov.b32 %r385, %f305;
- shr.u32 %r3475, %r385, 23;
+ cvt.u64.u32 %rd922, %r3469;
+ cvt.u64.u32 %rd923, %r3471;
+ bfi.b64 %rd924, %rd922, %rd923, 32, 32;
+ cvt.rn.f64.s64 %fd39, %rd924;
+ mul.f64 %fd40, %fd39, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2736, %fd40;
+ setp.eq.s32 %p293, %r3467, 0;
+ neg.f32 %f2737, %f2736;
+ selp.f32 %f5354, %f2736, %f2737, %p293;
+
+$L__BB0_318:
+ add.s32 %r406, %r8046, 1;
+ and.b32 %r407, %r406, 1;
+ setp.eq.s32 %p294, %r407, 0;
+ selp.f32 %f362, %f5354, 0f3F800000, %p294;
+ mul.rn.f32 %f363, %f5354, %f5354;
+ mov.f32 %f5355, 0fB94D4153;
+ @%p294 bra $L__BB0_320;
+
+ mov.f32 %f2740, 0fBAB607ED;
+ mov.f32 %f2741, 0f37CBAC00;
+ fma.rn.f32 %f5355, %f2741, %f363, %f2740;
+
+$L__BB0_320:
+ selp.f32 %f2742, 0f3C0885E4, 0f3D2AAABB, %p294;
+ fma.rn.f32 %f2743, %f5355, %f363, %f2742;
+ selp.f32 %f2744, 0fBE2AAAA8, 0fBEFFFFFF, %p294;
+ fma.rn.f32 %f2745, %f2743, %f363, %f2744;
+ mov.f32 %f2746, 0f00000000;
+ fma.rn.f32 %f2747, %f363, %f362, %f2746;
+ fma.rn.f32 %f5356, %f2745, %f2747, %f362;
+ and.b32 %r3473, %r406, 2;
+ setp.eq.s32 %p296, %r3473, 0;
+ @%p296 bra $L__BB0_322;
+
+ mov.f32 %f2749, 0fBF800000;
+ fma.rn.f32 %f5356, %f5356, %f2749, %f2746;
+
+$L__BB0_322:
+ add.f32 %f5399, %f5353, %f5356;
+ mul.f32 %f2750, %f314, 0f3F22F983;
+ cvt.rni.s32.f32 %r8050, %f2750;
+ cvt.rn.f32.s32 %f2751, %r8050;
+ mov.f32 %f2752, 0fBFC90FDA;
+ fma.rn.f32 %f2753, %f2751, %f2752, %f314;
+ mov.f32 %f2754, 0fB3A22168;
+ fma.rn.f32 %f2755, %f2751, %f2754, %f2753;
+ mov.f32 %f2756, 0fA7C234C5;
+ fma.rn.f32 %f5357, %f2751, %f2756, %f2755;
+ abs.f32 %f371, %f314;
+ setp.ltu.f32 %p297, %f371, 0f47CE4780;
+ @%p297 bra $L__BB0_330;
+
+ setp.eq.f32 %p298, %f371, 0f7F800000;
+ @%p298 bra $L__BB0_329;
+ bra.uni $L__BB0_324;
+
+$L__BB0_329:
+ mov.f32 %f2759, 0f00000000;
+ mul.rn.f32 %f5357, %f314, %f2759;
+ mov.u32 %r8050, 0;
+ bra.uni $L__BB0_330;
+
+$L__BB0_324:
+ mov.b32 %r409, %f314;
+ shr.u32 %r3475, %r409, 23;
and.b32 %r3476, %r3475, 255;
- add.s32 %r386, %r3476, -128;
- shl.b32 %r3477, %r385, 8;
- or.b32 %r387, %r3477, -2147483648;
- shr.u32 %r388, %r386, 5;
- mov.u64 %rd2534, 0;
- mov.u32 %r8298, 0;
- mov.u64 %rd924, __cudart_i2opi_f;
- mov.u64 %rd2535, %rd2534;
-
-$L__BB0_310:
+ add.s32 %r410, %r3476, -128;
+ shl.b32 %r3477, %r409, 8;
+ or.b32 %r411, %r3477, -2147483648;
+ shr.u32 %r412, %r410, 5;
+ mov.u64 %rd2545, 0;
+ mov.u32 %r8047, 0;
+ mov.u64 %rd928, __cudart_i2opi_f;
+ mov.u64 %rd2546, %rd2545;
+
+$L__BB0_325:
.pragma "nounroll";
- shl.b64 %rd923, %rd2534, 2;
- add.s64 %rd925, %rd924, %rd923;
- ld.global.nc.u32 %r3478, [%rd925];
- mad.wide.u32 %rd926, %r3478, %r387, %rd2535;
- shr.u64 %rd2535, %rd926, 32;
- add.s64 %rd927, %rd1, %rd923;
- st.local.u32 [%rd927], %rd926;
- add.s32 %r8298, %r8298, 1;
- cvt.s64.s32 %rd2534, %r8298;
- setp.ne.s32 %p286, %r8298, 6;
- @%p286 bra $L__BB0_310;
-
- st.local.u32 [%rd5], %rd2535;
+ shl.b64 %rd927, %rd2545, 2;
+ add.s64 %rd929, %rd928, %rd927;
+ ld.global.nc.u32 %r3478, [%rd929];
+ mad.wide.u32 %rd930, %r3478, %r411, %rd2546;
+ shr.u64 %rd2546, %rd930, 32;
+ add.s64 %rd931, %rd1, %rd927;
+ st.local.u32 [%rd931], %rd930;
+ add.s32 %r8047, %r8047, 1;
+ cvt.s64.s32 %rd2545, %r8047;
+ setp.ne.s32 %p299, %r8047, 6;
+ @%p299 bra $L__BB0_325;
+
+ st.local.u32 [%rd4], %rd2546;
mov.u32 %r3479, 4;
- sub.s32 %r391, %r3479, %r388;
+ sub.s32 %r415, %r3479, %r412;
mov.u32 %r3480, 6;
- sub.s32 %r3481, %r3480, %r388;
- mul.wide.s32 %rd928, %r3481, 4;
- add.s64 %rd929, %rd1, %rd928;
- ld.local.u32 %r8299, [%rd929];
- ld.local.u32 %r8300, [%rd929+-4];
- and.b32 %r394, %r386, 31;
- setp.eq.s32 %p287, %r394, 0;
- @%p287 bra $L__BB0_313;
+ sub.s32 %r3481, %r3480, %r412;
+ mul.wide.s32 %rd932, %r3481, 4;
+ add.s64 %rd933, %rd1, %rd932;
+ ld.local.u32 %r8048, [%rd933];
+ ld.local.u32 %r8049, [%rd933+-4];
+ and.b32 %r418, %r410, 31;
+ setp.eq.s32 %p300, %r418, 0;
+ @%p300 bra $L__BB0_328;
mov.u32 %r3482, 32;
- sub.s32 %r3483, %r3482, %r394;
- shr.u32 %r3484, %r8300, %r3483;
- shl.b32 %r3485, %r8299, %r394;
- add.s32 %r8299, %r3484, %r3485;
- mul.wide.s32 %rd930, %r391, 4;
- add.s64 %rd931, %rd1, %rd930;
- ld.local.u32 %r3486, [%rd931];
+ sub.s32 %r3483, %r3482, %r418;
+ shr.u32 %r3484, %r8049, %r3483;
+ shl.b32 %r3485, %r8048, %r418;
+ add.s32 %r8048, %r3484, %r3485;
+ mul.wide.s32 %rd934, %r415, 4;
+ add.s64 %rd935, %rd1, %rd934;
+ ld.local.u32 %r3486, [%rd935];
shr.u32 %r3487, %r3486, %r3483;
- shl.b32 %r3488, %r8300, %r394;
- add.s32 %r8300, %r3487, %r3488;
-
-$L__BB0_313:
- and.b32 %r3489, %r385, -2147483648;
- shr.u32 %r3490, %r8300, 30;
- shl.b32 %r3491, %r8299, 2;
+ shl.b32 %r3488, %r8049, %r418;
+ add.s32 %r8049, %r3487, %r3488;
+
+$L__BB0_328:
+ and.b32 %r3489, %r409, -2147483648;
+ shr.u32 %r3490, %r8049, 30;
+ shl.b32 %r3491, %r8048, 2;
or.b32 %r3492, %r3490, %r3491;
shr.u32 %r3493, %r3492, 31;
- shr.u32 %r3494, %r8299, 30;
+ shr.u32 %r3494, %r8048, 30;
add.s32 %r3495, %r3493, %r3494;
neg.s32 %r3496, %r3495;
- setp.eq.s32 %p288, %r3489, 0;
- selp.b32 %r8301, %r3495, %r3496, %p288;
- setp.ne.s32 %p289, %r3493, 0;
+ setp.eq.s32 %p301, %r3489, 0;
+ selp.b32 %r8050, %r3495, %r3496, %p301;
+ setp.ne.s32 %p302, %r3493, 0;
xor.b32 %r3497, %r3489, -2147483648;
- selp.b32 %r3498, %r3497, %r3489, %p289;
- selp.b32 %r3499, -1, 0, %p289;
+ selp.b32 %r3498, %r3497, %r3489, %p302;
+ selp.b32 %r3499, -1, 0, %p302;
xor.b32 %r3500, %r3492, %r3499;
- shl.b32 %r3501, %r8300, 2;
+ shl.b32 %r3501, %r8049, 2;
xor.b32 %r3502, %r3501, %r3499;
- cvt.u64.u32 %rd932, %r3500;
- cvt.u64.u32 %rd933, %r3502;
- bfi.b64 %rd934, %rd932, %rd933, 32, 32;
- cvt.rn.f64.s64 %fd37, %rd934;
- mul.f64 %fd38, %fd37, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2659, %fd38;
- setp.eq.s32 %p290, %r3498, 0;
- neg.f32 %f2660, %f2659;
- selp.f32 %f5283, %f2659, %f2660, %p290;
-
-$L__BB0_315:
- and.b32 %r401, %r8301, 1;
- setp.eq.s32 %p291, %r401, 0;
- selp.f32 %f342, %f5283, 0f3F800000, %p291;
- mul.rn.f32 %f343, %f5283, %f5283;
- mov.f32 %f5284, 0fB94D4153;
- @%p291 bra $L__BB0_317;
-
- mov.f32 %f2663, 0fBAB607ED;
- mov.f32 %f2664, 0f37CBAC00;
- fma.rn.f32 %f5284, %f2664, %f343, %f2663;
-
-$L__BB0_317:
- selp.f32 %f2665, 0f3C0885E4, 0f3D2AAABB, %p291;
- fma.rn.f32 %f2666, %f5284, %f343, %f2665;
- selp.f32 %f2667, 0fBE2AAAA8, 0fBEFFFFFF, %p291;
- fma.rn.f32 %f2668, %f2666, %f343, %f2667;
- mov.f32 %f2669, 0f00000000;
- fma.rn.f32 %f2670, %f343, %f342, %f2669;
- fma.rn.f32 %f5285, %f2668, %f2670, %f342;
- and.b32 %r3504, %r8301, 2;
- setp.eq.s32 %p293, %r3504, 0;
- @%p293 bra $L__BB0_319;
-
- mov.f32 %f2672, 0fBF800000;
- fma.rn.f32 %f5285, %f5285, %f2672, %f2669;
-
-$L__BB0_319:
- mul.f32 %f2673, %f297, 0f3F22F983;
- cvt.rni.s32.f32 %r8305, %f2673;
- cvt.rn.f32.s32 %f2674, %r8305;
- mov.f32 %f2675, 0fBFC90FDA;
- fma.rn.f32 %f2676, %f2674, %f2675, %f297;
- mov.f32 %f2677, 0fB3A22168;
- fma.rn.f32 %f2678, %f2674, %f2677, %f2676;
- mov.f32 %f2679, 0fA7C234C5;
- fma.rn.f32 %f5286, %f2674, %f2679, %f2678;
- abs.f32 %f350, %f297;
- setp.ltu.f32 %p294, %f350, 0f47CE4780;
- @%p294 bra $L__BB0_327;
-
- setp.eq.f32 %p295, %f350, 0f7F800000;
- @%p295 bra $L__BB0_326;
- bra.uni $L__BB0_321;
-
-$L__BB0_326:
- mov.f32 %f2682, 0f00000000;
- mul.rn.f32 %f5286, %f297, %f2682;
- mov.u32 %r8305, 0;
- bra.uni $L__BB0_327;
-
-$L__BB0_321:
- mov.b32 %r403, %f297;
- shr.u32 %r3506, %r403, 23;
+ cvt.u64.u32 %rd936, %r3500;
+ cvt.u64.u32 %rd937, %r3502;
+ bfi.b64 %rd938, %rd936, %rd937, 32, 32;
+ cvt.rn.f64.s64 %fd41, %rd938;
+ mul.f64 %fd42, %fd41, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2757, %fd42;
+ setp.eq.s32 %p303, %r3498, 0;
+ neg.f32 %f2758, %f2757;
+ selp.f32 %f5357, %f2757, %f2758, %p303;
+
+$L__BB0_330:
+ and.b32 %r425, %r8050, 1;
+ setp.eq.s32 %p304, %r425, 0;
+ selp.f32 %f375, %f5357, 0f3F800000, %p304;
+ mul.rn.f32 %f376, %f5357, %f5357;
+ mov.f32 %f5358, 0fB94D4153;
+ @%p304 bra $L__BB0_332;
+
+ mov.f32 %f2761, 0fBAB607ED;
+ mov.f32 %f2762, 0f37CBAC00;
+ fma.rn.f32 %f5358, %f2762, %f376, %f2761;
+
+$L__BB0_332:
+ selp.f32 %f2763, 0f3C0885E4, 0f3D2AAABB, %p304;
+ fma.rn.f32 %f2764, %f5358, %f376, %f2763;
+ selp.f32 %f2765, 0fBE2AAAA8, 0fBEFFFFFF, %p304;
+ fma.rn.f32 %f2766, %f2764, %f376, %f2765;
+ mov.f32 %f2767, 0f00000000;
+ fma.rn.f32 %f2768, %f376, %f375, %f2767;
+ fma.rn.f32 %f5359, %f2766, %f2768, %f375;
+ and.b32 %r3504, %r8050, 2;
+ setp.eq.s32 %p306, %r3504, 0;
+ @%p306 bra $L__BB0_334;
+
+ mov.f32 %f2770, 0fBF800000;
+ fma.rn.f32 %f5359, %f5359, %f2770, %f2767;
+
+$L__BB0_334:
+ mul.f32 %f2771, %f306, 0f3F22F983;
+ cvt.rni.s32.f32 %r8054, %f2771;
+ cvt.rn.f32.s32 %f2772, %r8054;
+ mov.f32 %f2773, 0fBFC90FDA;
+ fma.rn.f32 %f2774, %f2772, %f2773, %f306;
+ mov.f32 %f2775, 0fB3A22168;
+ fma.rn.f32 %f2776, %f2772, %f2775, %f2774;
+ mov.f32 %f2777, 0fA7C234C5;
+ fma.rn.f32 %f5360, %f2772, %f2777, %f2776;
+ abs.f32 %f383, %f306;
+ setp.ltu.f32 %p307, %f383, 0f47CE4780;
+ @%p307 bra $L__BB0_342;
+
+ setp.eq.f32 %p308, %f383, 0f7F800000;
+ @%p308 bra $L__BB0_341;
+ bra.uni $L__BB0_336;
+
+$L__BB0_341:
+ mov.f32 %f2780, 0f00000000;
+ mul.rn.f32 %f5360, %f306, %f2780;
+ mov.u32 %r8054, 0;
+ bra.uni $L__BB0_342;
+
+$L__BB0_336:
+ mov.b32 %r427, %f306;
+ shr.u32 %r3506, %r427, 23;
and.b32 %r3507, %r3506, 255;
- add.s32 %r404, %r3507, -128;
- shl.b32 %r3508, %r403, 8;
- or.b32 %r405, %r3508, -2147483648;
- shr.u32 %r406, %r404, 5;
- mov.u64 %rd2536, 0;
- mov.u32 %r8302, 0;
- mov.u64 %rd938, __cudart_i2opi_f;
- mov.u64 %rd2537, %rd2536;
-
-$L__BB0_322:
+ add.s32 %r428, %r3507, -128;
+ shl.b32 %r3508, %r427, 8;
+ or.b32 %r429, %r3508, -2147483648;
+ shr.u32 %r430, %r428, 5;
+ mov.u64 %rd2547, 0;
+ mov.u32 %r8051, 0;
+ mov.u64 %rd942, __cudart_i2opi_f;
+ mov.u64 %rd2548, %rd2547;
+
+$L__BB0_337:
.pragma "nounroll";
- shl.b64 %rd937, %rd2536, 2;
- add.s64 %rd939, %rd938, %rd937;
- ld.global.nc.u32 %r3509, [%rd939];
- mad.wide.u32 %rd940, %r3509, %r405, %rd2537;
- shr.u64 %rd2537, %rd940, 32;
- add.s64 %rd941, %rd1, %rd937;
- st.local.u32 [%rd941], %rd940;
- add.s32 %r8302, %r8302, 1;
- cvt.s64.s32 %rd2536, %r8302;
- setp.ne.s32 %p296, %r8302, 6;
- @%p296 bra $L__BB0_322;
-
- st.local.u32 [%rd5], %rd2537;
+ shl.b64 %rd941, %rd2547, 2;
+ add.s64 %rd943, %rd942, %rd941;
+ ld.global.nc.u32 %r3509, [%rd943];
+ mad.wide.u32 %rd944, %r3509, %r429, %rd2548;
+ shr.u64 %rd2548, %rd944, 32;
+ add.s64 %rd945, %rd1, %rd941;
+ st.local.u32 [%rd945], %rd944;
+ add.s32 %r8051, %r8051, 1;
+ cvt.s64.s32 %rd2547, %r8051;
+ setp.ne.s32 %p309, %r8051, 6;
+ @%p309 bra $L__BB0_337;
+
+ st.local.u32 [%rd4], %rd2548;
mov.u32 %r3510, 4;
- sub.s32 %r409, %r3510, %r406;
+ sub.s32 %r433, %r3510, %r430;
mov.u32 %r3511, 6;
- sub.s32 %r3512, %r3511, %r406;
- mul.wide.s32 %rd942, %r3512, 4;
- add.s64 %rd943, %rd1, %rd942;
- ld.local.u32 %r8303, [%rd943];
- ld.local.u32 %r8304, [%rd943+-4];
- and.b32 %r412, %r404, 31;
- setp.eq.s32 %p297, %r412, 0;
- @%p297 bra $L__BB0_325;
+ sub.s32 %r3512, %r3511, %r430;
+ mul.wide.s32 %rd946, %r3512, 4;
+ add.s64 %rd947, %rd1, %rd946;
+ ld.local.u32 %r8052, [%rd947];
+ ld.local.u32 %r8053, [%rd947+-4];
+ and.b32 %r436, %r428, 31;
+ setp.eq.s32 %p310, %r436, 0;
+ @%p310 bra $L__BB0_340;
mov.u32 %r3513, 32;
- sub.s32 %r3514, %r3513, %r412;
- shr.u32 %r3515, %r8304, %r3514;
- shl.b32 %r3516, %r8303, %r412;
- add.s32 %r8303, %r3515, %r3516;
- mul.wide.s32 %rd944, %r409, 4;
- add.s64 %rd945, %rd1, %rd944;
- ld.local.u32 %r3517, [%rd945];
+ sub.s32 %r3514, %r3513, %r436;
+ shr.u32 %r3515, %r8053, %r3514;
+ shl.b32 %r3516, %r8052, %r436;
+ add.s32 %r8052, %r3515, %r3516;
+ mul.wide.s32 %rd948, %r433, 4;
+ add.s64 %rd949, %rd1, %rd948;
+ ld.local.u32 %r3517, [%rd949];
shr.u32 %r3518, %r3517, %r3514;
- shl.b32 %r3519, %r8304, %r412;
- add.s32 %r8304, %r3518, %r3519;
-
-$L__BB0_325:
- and.b32 %r3520, %r403, -2147483648;
- shr.u32 %r3521, %r8304, 30;
- shl.b32 %r3522, %r8303, 2;
+ shl.b32 %r3519, %r8053, %r436;
+ add.s32 %r8053, %r3518, %r3519;
+
+$L__BB0_340:
+ and.b32 %r3520, %r427, -2147483648;
+ shr.u32 %r3521, %r8053, 30;
+ shl.b32 %r3522, %r8052, 2;
or.b32 %r3523, %r3521, %r3522;
shr.u32 %r3524, %r3523, 31;
- shr.u32 %r3525, %r8303, 30;
+ shr.u32 %r3525, %r8052, 30;
add.s32 %r3526, %r3524, %r3525;
neg.s32 %r3527, %r3526;
- setp.eq.s32 %p298, %r3520, 0;
- selp.b32 %r8305, %r3526, %r3527, %p298;
- setp.ne.s32 %p299, %r3524, 0;
+ setp.eq.s32 %p311, %r3520, 0;
+ selp.b32 %r8054, %r3526, %r3527, %p311;
+ setp.ne.s32 %p312, %r3524, 0;
xor.b32 %r3528, %r3520, -2147483648;
- selp.b32 %r3529, %r3528, %r3520, %p299;
- selp.b32 %r3530, -1, 0, %p299;
+ selp.b32 %r3529, %r3528, %r3520, %p312;
+ selp.b32 %r3530, -1, 0, %p312;
xor.b32 %r3531, %r3523, %r3530;
- shl.b32 %r3532, %r8304, 2;
+ shl.b32 %r3532, %r8053, 2;
xor.b32 %r3533, %r3532, %r3530;
- cvt.u64.u32 %rd946, %r3531;
- cvt.u64.u32 %rd947, %r3533;
- bfi.b64 %rd948, %rd946, %rd947, 32, 32;
- cvt.rn.f64.s64 %fd39, %rd948;
- mul.f64 %fd40, %fd39, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2680, %fd40;
- setp.eq.s32 %p300, %r3529, 0;
- neg.f32 %f2681, %f2680;
- selp.f32 %f5286, %f2680, %f2681, %p300;
-
-$L__BB0_327:
- add.s32 %r419, %r8305, 1;
- and.b32 %r420, %r419, 1;
- setp.eq.s32 %p301, %r420, 0;
- selp.f32 %f354, %f5286, 0f3F800000, %p301;
- mul.rn.f32 %f355, %f5286, %f5286;
- mov.f32 %f5287, 0fB94D4153;
- @%p301 bra $L__BB0_329;
-
- mov.f32 %f2684, 0fBAB607ED;
- mov.f32 %f2685, 0f37CBAC00;
- fma.rn.f32 %f5287, %f2685, %f355, %f2684;
-
-$L__BB0_329:
- selp.f32 %f2686, 0f3C0885E4, 0f3D2AAABB, %p301;
- fma.rn.f32 %f2687, %f5287, %f355, %f2686;
- selp.f32 %f2688, 0fBE2AAAA8, 0fBEFFFFFF, %p301;
- fma.rn.f32 %f2689, %f2687, %f355, %f2688;
- mov.f32 %f2690, 0f00000000;
- fma.rn.f32 %f2691, %f355, %f354, %f2690;
- fma.rn.f32 %f5288, %f2689, %f2691, %f354;
- and.b32 %r3535, %r419, 2;
- setp.eq.s32 %p303, %r3535, 0;
- @%p303 bra $L__BB0_331;
-
- mov.f32 %f2693, 0fBF800000;
- fma.rn.f32 %f5288, %f5288, %f2693, %f2690;
-
-$L__BB0_331:
- add.f32 %f5331, %f5285, %f5288;
- mul.f32 %f2694, %f306, 0f3F22F983;
- cvt.rni.s32.f32 %r8309, %f2694;
- cvt.rn.f32.s32 %f2695, %r8309;
- mov.f32 %f2696, 0fBFC90FDA;
- fma.rn.f32 %f2697, %f2695, %f2696, %f306;
- mov.f32 %f2698, 0fB3A22168;
- fma.rn.f32 %f2699, %f2695, %f2698, %f2697;
- mov.f32 %f2700, 0fA7C234C5;
- fma.rn.f32 %f5289, %f2695, %f2700, %f2699;
- abs.f32 %f363, %f306;
- setp.ltu.f32 %p304, %f363, 0f47CE4780;
- @%p304 bra $L__BB0_339;
-
- setp.eq.f32 %p305, %f363, 0f7F800000;
- @%p305 bra $L__BB0_338;
- bra.uni $L__BB0_333;
-
-$L__BB0_338:
- mov.f32 %f2703, 0f00000000;
- mul.rn.f32 %f5289, %f306, %f2703;
- mov.u32 %r8309, 0;
- bra.uni $L__BB0_339;
-
-$L__BB0_333:
- mov.b32 %r422, %f306;
- shr.u32 %r3537, %r422, 23;
+ cvt.u64.u32 %rd950, %r3531;
+ cvt.u64.u32 %rd951, %r3533;
+ bfi.b64 %rd952, %rd950, %rd951, 32, 32;
+ cvt.rn.f64.s64 %fd43, %rd952;
+ mul.f64 %fd44, %fd43, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2778, %fd44;
+ setp.eq.s32 %p313, %r3529, 0;
+ neg.f32 %f2779, %f2778;
+ selp.f32 %f5360, %f2778, %f2779, %p313;
+
+$L__BB0_342:
+ add.s32 %r443, %r8054, 1;
+ and.b32 %r444, %r443, 1;
+ setp.eq.s32 %p314, %r444, 0;
+ selp.f32 %f387, %f5360, 0f3F800000, %p314;
+ mul.rn.f32 %f388, %f5360, %f5360;
+ mov.f32 %f5361, 0fB94D4153;
+ @%p314 bra $L__BB0_344;
+
+ mov.f32 %f2782, 0fBAB607ED;
+ mov.f32 %f2783, 0f37CBAC00;
+ fma.rn.f32 %f5361, %f2783, %f388, %f2782;
+
+$L__BB0_344:
+ selp.f32 %f2784, 0f3C0885E4, 0f3D2AAABB, %p314;
+ fma.rn.f32 %f2785, %f5361, %f388, %f2784;
+ selp.f32 %f2786, 0fBE2AAAA8, 0fBEFFFFFF, %p314;
+ fma.rn.f32 %f2787, %f2785, %f388, %f2786;
+ mov.f32 %f2788, 0f00000000;
+ fma.rn.f32 %f2789, %f388, %f387, %f2788;
+ fma.rn.f32 %f5362, %f2787, %f2789, %f387;
+ and.b32 %r3535, %r443, 2;
+ setp.eq.s32 %p316, %r3535, 0;
+ @%p316 bra $L__BB0_346;
+
+ mov.f32 %f2791, 0fBF800000;
+ fma.rn.f32 %f5362, %f5362, %f2791, %f2788;
+
+$L__BB0_346:
+ add.f32 %f5398, %f5359, %f5362;
+ mul.f32 %f2792, %f315, 0f3F22F983;
+ cvt.rni.s32.f32 %r8058, %f2792;
+ cvt.rn.f32.s32 %f2793, %r8058;
+ mov.f32 %f2794, 0fBFC90FDA;
+ fma.rn.f32 %f2795, %f2793, %f2794, %f315;
+ mov.f32 %f2796, 0fB3A22168;
+ fma.rn.f32 %f2797, %f2793, %f2796, %f2795;
+ mov.f32 %f2798, 0fA7C234C5;
+ fma.rn.f32 %f5363, %f2793, %f2798, %f2797;
+ abs.f32 %f396, %f315;
+ setp.ltu.f32 %p317, %f396, 0f47CE4780;
+ @%p317 bra $L__BB0_354;
+
+ setp.eq.f32 %p318, %f396, 0f7F800000;
+ @%p318 bra $L__BB0_353;
+ bra.uni $L__BB0_348;
+
+$L__BB0_353:
+ mov.f32 %f2801, 0f00000000;
+ mul.rn.f32 %f5363, %f315, %f2801;
+ mov.u32 %r8058, 0;
+ bra.uni $L__BB0_354;
+
+$L__BB0_348:
+ mov.b32 %r446, %f315;
+ shr.u32 %r3537, %r446, 23;
and.b32 %r3538, %r3537, 255;
- add.s32 %r423, %r3538, -128;
- shl.b32 %r3539, %r422, 8;
- or.b32 %r424, %r3539, -2147483648;
- shr.u32 %r425, %r423, 5;
- mov.u64 %rd2538, 0;
- mov.u32 %r8306, 0;
- mov.u64 %rd952, __cudart_i2opi_f;
- mov.u64 %rd2539, %rd2538;
-
-$L__BB0_334:
+ add.s32 %r447, %r3538, -128;
+ shl.b32 %r3539, %r446, 8;
+ or.b32 %r448, %r3539, -2147483648;
+ shr.u32 %r449, %r447, 5;
+ mov.u64 %rd2549, 0;
+ mov.u32 %r8055, 0;
+ mov.u64 %rd956, __cudart_i2opi_f;
+ mov.u64 %rd2550, %rd2549;
+
+$L__BB0_349:
.pragma "nounroll";
- shl.b64 %rd951, %rd2538, 2;
- add.s64 %rd953, %rd952, %rd951;
- ld.global.nc.u32 %r3540, [%rd953];
- mad.wide.u32 %rd954, %r3540, %r424, %rd2539;
- shr.u64 %rd2539, %rd954, 32;
- add.s64 %rd955, %rd1, %rd951;
- st.local.u32 [%rd955], %rd954;
- add.s32 %r8306, %r8306, 1;
- cvt.s64.s32 %rd2538, %r8306;
- setp.ne.s32 %p306, %r8306, 6;
- @%p306 bra $L__BB0_334;
-
- st.local.u32 [%rd5], %rd2539;
+ shl.b64 %rd955, %rd2549, 2;
+ add.s64 %rd957, %rd956, %rd955;
+ ld.global.nc.u32 %r3540, [%rd957];
+ mad.wide.u32 %rd958, %r3540, %r448, %rd2550;
+ shr.u64 %rd2550, %rd958, 32;
+ add.s64 %rd959, %rd1, %rd955;
+ st.local.u32 [%rd959], %rd958;
+ add.s32 %r8055, %r8055, 1;
+ cvt.s64.s32 %rd2549, %r8055;
+ setp.ne.s32 %p319, %r8055, 6;
+ @%p319 bra $L__BB0_349;
+
+ st.local.u32 [%rd4], %rd2550;
mov.u32 %r3541, 4;
- sub.s32 %r428, %r3541, %r425;
+ sub.s32 %r452, %r3541, %r449;
mov.u32 %r3542, 6;
- sub.s32 %r3543, %r3542, %r425;
- mul.wide.s32 %rd956, %r3543, 4;
- add.s64 %rd957, %rd1, %rd956;
- ld.local.u32 %r8307, [%rd957];
- ld.local.u32 %r8308, [%rd957+-4];
- and.b32 %r431, %r423, 31;
- setp.eq.s32 %p307, %r431, 0;
- @%p307 bra $L__BB0_337;
+ sub.s32 %r3543, %r3542, %r449;
+ mul.wide.s32 %rd960, %r3543, 4;
+ add.s64 %rd961, %rd1, %rd960;
+ ld.local.u32 %r8056, [%rd961];
+ ld.local.u32 %r8057, [%rd961+-4];
+ and.b32 %r455, %r447, 31;
+ setp.eq.s32 %p320, %r455, 0;
+ @%p320 bra $L__BB0_352;
mov.u32 %r3544, 32;
- sub.s32 %r3545, %r3544, %r431;
- shr.u32 %r3546, %r8308, %r3545;
- shl.b32 %r3547, %r8307, %r431;
- add.s32 %r8307, %r3546, %r3547;
- mul.wide.s32 %rd958, %r428, 4;
- add.s64 %rd959, %rd1, %rd958;
- ld.local.u32 %r3548, [%rd959];
+ sub.s32 %r3545, %r3544, %r455;
+ shr.u32 %r3546, %r8057, %r3545;
+ shl.b32 %r3547, %r8056, %r455;
+ add.s32 %r8056, %r3546, %r3547;
+ mul.wide.s32 %rd962, %r452, 4;
+ add.s64 %rd963, %rd1, %rd962;
+ ld.local.u32 %r3548, [%rd963];
shr.u32 %r3549, %r3548, %r3545;
- shl.b32 %r3550, %r8308, %r431;
- add.s32 %r8308, %r3549, %r3550;
-
-$L__BB0_337:
- and.b32 %r3551, %r422, -2147483648;
- shr.u32 %r3552, %r8308, 30;
- shl.b32 %r3553, %r8307, 2;
+ shl.b32 %r3550, %r8057, %r455;
+ add.s32 %r8057, %r3549, %r3550;
+
+$L__BB0_352:
+ and.b32 %r3551, %r446, -2147483648;
+ shr.u32 %r3552, %r8057, 30;
+ shl.b32 %r3553, %r8056, 2;
or.b32 %r3554, %r3552, %r3553;
shr.u32 %r3555, %r3554, 31;
- shr.u32 %r3556, %r8307, 30;
+ shr.u32 %r3556, %r8056, 30;
add.s32 %r3557, %r3555, %r3556;
neg.s32 %r3558, %r3557;
- setp.eq.s32 %p308, %r3551, 0;
- selp.b32 %r8309, %r3557, %r3558, %p308;
- setp.ne.s32 %p309, %r3555, 0;
+ setp.eq.s32 %p321, %r3551, 0;
+ selp.b32 %r8058, %r3557, %r3558, %p321;
+ setp.ne.s32 %p322, %r3555, 0;
xor.b32 %r3559, %r3551, -2147483648;
- selp.b32 %r3560, %r3559, %r3551, %p309;
- selp.b32 %r3561, -1, 0, %p309;
+ selp.b32 %r3560, %r3559, %r3551, %p322;
+ selp.b32 %r3561, -1, 0, %p322;
xor.b32 %r3562, %r3554, %r3561;
- shl.b32 %r3563, %r8308, 2;
+ shl.b32 %r3563, %r8057, 2;
xor.b32 %r3564, %r3563, %r3561;
- cvt.u64.u32 %rd960, %r3562;
- cvt.u64.u32 %rd961, %r3564;
- bfi.b64 %rd962, %rd960, %rd961, 32, 32;
- cvt.rn.f64.s64 %fd41, %rd962;
- mul.f64 %fd42, %fd41, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2701, %fd42;
- setp.eq.s32 %p310, %r3560, 0;
- neg.f32 %f2702, %f2701;
- selp.f32 %f5289, %f2701, %f2702, %p310;
-
-$L__BB0_339:
- and.b32 %r438, %r8309, 1;
- setp.eq.s32 %p311, %r438, 0;
- selp.f32 %f367, %f5289, 0f3F800000, %p311;
- mul.rn.f32 %f368, %f5289, %f5289;
- mov.f32 %f5290, 0fB94D4153;
- @%p311 bra $L__BB0_341;
-
- mov.f32 %f2705, 0fBAB607ED;
- mov.f32 %f2706, 0f37CBAC00;
- fma.rn.f32 %f5290, %f2706, %f368, %f2705;
-
-$L__BB0_341:
- selp.f32 %f2707, 0f3C0885E4, 0f3D2AAABB, %p311;
- fma.rn.f32 %f2708, %f5290, %f368, %f2707;
- selp.f32 %f2709, 0fBE2AAAA8, 0fBEFFFFFF, %p311;
- fma.rn.f32 %f2710, %f2708, %f368, %f2709;
- mov.f32 %f2711, 0f00000000;
- fma.rn.f32 %f2712, %f368, %f367, %f2711;
- fma.rn.f32 %f5291, %f2710, %f2712, %f367;
- and.b32 %r3566, %r8309, 2;
- setp.eq.s32 %p313, %r3566, 0;
- @%p313 bra $L__BB0_343;
-
- mov.f32 %f2714, 0fBF800000;
- fma.rn.f32 %f5291, %f5291, %f2714, %f2711;
-
-$L__BB0_343:
- mul.f32 %f2715, %f298, 0f3F22F983;
- cvt.rni.s32.f32 %r8313, %f2715;
- cvt.rn.f32.s32 %f2716, %r8313;
- mov.f32 %f2717, 0fBFC90FDA;
- fma.rn.f32 %f2718, %f2716, %f2717, %f298;
- mov.f32 %f2719, 0fB3A22168;
- fma.rn.f32 %f2720, %f2716, %f2719, %f2718;
- mov.f32 %f2721, 0fA7C234C5;
- fma.rn.f32 %f5292, %f2716, %f2721, %f2720;
- abs.f32 %f375, %f298;
- setp.ltu.f32 %p314, %f375, 0f47CE4780;
- @%p314 bra $L__BB0_351;
-
- setp.eq.f32 %p315, %f375, 0f7F800000;
- @%p315 bra $L__BB0_350;
- bra.uni $L__BB0_345;
-
-$L__BB0_350:
- mov.f32 %f2724, 0f00000000;
- mul.rn.f32 %f5292, %f298, %f2724;
- mov.u32 %r8313, 0;
- bra.uni $L__BB0_351;
-
-$L__BB0_345:
- mov.b32 %r440, %f298;
- shr.u32 %r3568, %r440, 23;
+ cvt.u64.u32 %rd964, %r3562;
+ cvt.u64.u32 %rd965, %r3564;
+ bfi.b64 %rd966, %rd964, %rd965, 32, 32;
+ cvt.rn.f64.s64 %fd45, %rd966;
+ mul.f64 %fd46, %fd45, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2799, %fd46;
+ setp.eq.s32 %p323, %r3560, 0;
+ neg.f32 %f2800, %f2799;
+ selp.f32 %f5363, %f2799, %f2800, %p323;
+
+$L__BB0_354:
+ and.b32 %r462, %r8058, 1;
+ setp.eq.s32 %p324, %r462, 0;
+ selp.f32 %f400, %f5363, 0f3F800000, %p324;
+ mul.rn.f32 %f401, %f5363, %f5363;
+ mov.f32 %f5364, 0fB94D4153;
+ @%p324 bra $L__BB0_356;
+
+ mov.f32 %f2803, 0fBAB607ED;
+ mov.f32 %f2804, 0f37CBAC00;
+ fma.rn.f32 %f5364, %f2804, %f401, %f2803;
+
+$L__BB0_356:
+ selp.f32 %f2805, 0f3C0885E4, 0f3D2AAABB, %p324;
+ fma.rn.f32 %f2806, %f5364, %f401, %f2805;
+ selp.f32 %f2807, 0fBE2AAAA8, 0fBEFFFFFF, %p324;
+ fma.rn.f32 %f2808, %f2806, %f401, %f2807;
+ mov.f32 %f2809, 0f00000000;
+ fma.rn.f32 %f2810, %f401, %f400, %f2809;
+ fma.rn.f32 %f5365, %f2808, %f2810, %f400;
+ and.b32 %r3566, %r8058, 2;
+ setp.eq.s32 %p326, %r3566, 0;
+ @%p326 bra $L__BB0_358;
+
+ mov.f32 %f2812, 0fBF800000;
+ fma.rn.f32 %f5365, %f5365, %f2812, %f2809;
+
+$L__BB0_358:
+ mul.f32 %f2813, %f307, 0f3F22F983;
+ cvt.rni.s32.f32 %r8062, %f2813;
+ cvt.rn.f32.s32 %f2814, %r8062;
+ mov.f32 %f2815, 0fBFC90FDA;
+ fma.rn.f32 %f2816, %f2814, %f2815, %f307;
+ mov.f32 %f2817, 0fB3A22168;
+ fma.rn.f32 %f2818, %f2814, %f2817, %f2816;
+ mov.f32 %f2819, 0fA7C234C5;
+ fma.rn.f32 %f5366, %f2814, %f2819, %f2818;
+ abs.f32 %f408, %f307;
+ setp.ltu.f32 %p327, %f408, 0f47CE4780;
+ @%p327 bra $L__BB0_366;
+
+ setp.eq.f32 %p328, %f408, 0f7F800000;
+ @%p328 bra $L__BB0_365;
+ bra.uni $L__BB0_360;
+
+$L__BB0_365:
+ mov.f32 %f2822, 0f00000000;
+ mul.rn.f32 %f5366, %f307, %f2822;
+ mov.u32 %r8062, 0;
+ bra.uni $L__BB0_366;
+
+$L__BB0_360:
+ mov.b32 %r464, %f307;
+ shr.u32 %r3568, %r464, 23;
and.b32 %r3569, %r3568, 255;
- add.s32 %r441, %r3569, -128;
- shl.b32 %r3570, %r440, 8;
- or.b32 %r442, %r3570, -2147483648;
- shr.u32 %r443, %r441, 5;
- mov.u64 %rd2540, 0;
- mov.u32 %r8310, 0;
- mov.u64 %rd966, __cudart_i2opi_f;
- mov.u64 %rd2541, %rd2540;
-
-$L__BB0_346:
+ add.s32 %r465, %r3569, -128;
+ shl.b32 %r3570, %r464, 8;
+ or.b32 %r466, %r3570, -2147483648;
+ shr.u32 %r467, %r465, 5;
+ mov.u64 %rd2551, 0;
+ mov.u32 %r8059, 0;
+ mov.u64 %rd970, __cudart_i2opi_f;
+ mov.u64 %rd2552, %rd2551;
+
+$L__BB0_361:
.pragma "nounroll";
- shl.b64 %rd965, %rd2540, 2;
- add.s64 %rd967, %rd966, %rd965;
- ld.global.nc.u32 %r3571, [%rd967];
- mad.wide.u32 %rd968, %r3571, %r442, %rd2541;
- shr.u64 %rd2541, %rd968, 32;
- add.s64 %rd969, %rd1, %rd965;
- st.local.u32 [%rd969], %rd968;
- add.s32 %r8310, %r8310, 1;
- cvt.s64.s32 %rd2540, %r8310;
- setp.ne.s32 %p316, %r8310, 6;
- @%p316 bra $L__BB0_346;
-
- st.local.u32 [%rd5], %rd2541;
+ shl.b64 %rd969, %rd2551, 2;
+ add.s64 %rd971, %rd970, %rd969;
+ ld.global.nc.u32 %r3571, [%rd971];
+ mad.wide.u32 %rd972, %r3571, %r466, %rd2552;
+ shr.u64 %rd2552, %rd972, 32;
+ add.s64 %rd973, %rd1, %rd969;
+ st.local.u32 [%rd973], %rd972;
+ add.s32 %r8059, %r8059, 1;
+ cvt.s64.s32 %rd2551, %r8059;
+ setp.ne.s32 %p329, %r8059, 6;
+ @%p329 bra $L__BB0_361;
+
+ st.local.u32 [%rd4], %rd2552;
mov.u32 %r3572, 4;
- sub.s32 %r446, %r3572, %r443;
+ sub.s32 %r470, %r3572, %r467;
mov.u32 %r3573, 6;
- sub.s32 %r3574, %r3573, %r443;
- mul.wide.s32 %rd970, %r3574, 4;
- add.s64 %rd971, %rd1, %rd970;
- ld.local.u32 %r8311, [%rd971];
- ld.local.u32 %r8312, [%rd971+-4];
- and.b32 %r449, %r441, 31;
- setp.eq.s32 %p317, %r449, 0;
- @%p317 bra $L__BB0_349;
+ sub.s32 %r3574, %r3573, %r467;
+ mul.wide.s32 %rd974, %r3574, 4;
+ add.s64 %rd975, %rd1, %rd974;
+ ld.local.u32 %r8060, [%rd975];
+ ld.local.u32 %r8061, [%rd975+-4];
+ and.b32 %r473, %r465, 31;
+ setp.eq.s32 %p330, %r473, 0;
+ @%p330 bra $L__BB0_364;
mov.u32 %r3575, 32;
- sub.s32 %r3576, %r3575, %r449;
- shr.u32 %r3577, %r8312, %r3576;
- shl.b32 %r3578, %r8311, %r449;
- add.s32 %r8311, %r3577, %r3578;
- mul.wide.s32 %rd972, %r446, 4;
- add.s64 %rd973, %rd1, %rd972;
- ld.local.u32 %r3579, [%rd973];
+ sub.s32 %r3576, %r3575, %r473;
+ shr.u32 %r3577, %r8061, %r3576;
+ shl.b32 %r3578, %r8060, %r473;
+ add.s32 %r8060, %r3577, %r3578;
+ mul.wide.s32 %rd976, %r470, 4;
+ add.s64 %rd977, %rd1, %rd976;
+ ld.local.u32 %r3579, [%rd977];
shr.u32 %r3580, %r3579, %r3576;
- shl.b32 %r3581, %r8312, %r449;
- add.s32 %r8312, %r3580, %r3581;
-
-$L__BB0_349:
- and.b32 %r3582, %r440, -2147483648;
- shr.u32 %r3583, %r8312, 30;
- shl.b32 %r3584, %r8311, 2;
+ shl.b32 %r3581, %r8061, %r473;
+ add.s32 %r8061, %r3580, %r3581;
+
+$L__BB0_364:
+ and.b32 %r3582, %r464, -2147483648;
+ shr.u32 %r3583, %r8061, 30;
+ shl.b32 %r3584, %r8060, 2;
or.b32 %r3585, %r3583, %r3584;
shr.u32 %r3586, %r3585, 31;
- shr.u32 %r3587, %r8311, 30;
+ shr.u32 %r3587, %r8060, 30;
add.s32 %r3588, %r3586, %r3587;
neg.s32 %r3589, %r3588;
- setp.eq.s32 %p318, %r3582, 0;
- selp.b32 %r8313, %r3588, %r3589, %p318;
- setp.ne.s32 %p319, %r3586, 0;
+ setp.eq.s32 %p331, %r3582, 0;
+ selp.b32 %r8062, %r3588, %r3589, %p331;
+ setp.ne.s32 %p332, %r3586, 0;
xor.b32 %r3590, %r3582, -2147483648;
- selp.b32 %r3591, %r3590, %r3582, %p319;
- selp.b32 %r3592, -1, 0, %p319;
+ selp.b32 %r3591, %r3590, %r3582, %p332;
+ selp.b32 %r3592, -1, 0, %p332;
xor.b32 %r3593, %r3585, %r3592;
- shl.b32 %r3594, %r8312, 2;
+ shl.b32 %r3594, %r8061, 2;
xor.b32 %r3595, %r3594, %r3592;
- cvt.u64.u32 %rd974, %r3593;
- cvt.u64.u32 %rd975, %r3595;
- bfi.b64 %rd976, %rd974, %rd975, 32, 32;
- cvt.rn.f64.s64 %fd43, %rd976;
- mul.f64 %fd44, %fd43, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2722, %fd44;
- setp.eq.s32 %p320, %r3591, 0;
- neg.f32 %f2723, %f2722;
- selp.f32 %f5292, %f2722, %f2723, %p320;
-
-$L__BB0_351:
- add.s32 %r456, %r8313, 1;
- and.b32 %r457, %r456, 1;
- setp.eq.s32 %p321, %r457, 0;
- selp.f32 %f379, %f5292, 0f3F800000, %p321;
- mul.rn.f32 %f380, %f5292, %f5292;
- mov.f32 %f5293, 0fB94D4153;
- @%p321 bra $L__BB0_353;
-
- mov.f32 %f2726, 0fBAB607ED;
- mov.f32 %f2727, 0f37CBAC00;
- fma.rn.f32 %f5293, %f2727, %f380, %f2726;
-
-$L__BB0_353:
- selp.f32 %f2728, 0f3C0885E4, 0f3D2AAABB, %p321;
- fma.rn.f32 %f2729, %f5293, %f380, %f2728;
- selp.f32 %f2730, 0fBE2AAAA8, 0fBEFFFFFF, %p321;
- fma.rn.f32 %f2731, %f2729, %f380, %f2730;
- mov.f32 %f2732, 0f00000000;
- fma.rn.f32 %f2733, %f380, %f379, %f2732;
- fma.rn.f32 %f5294, %f2731, %f2733, %f379;
- and.b32 %r3597, %r456, 2;
- setp.eq.s32 %p323, %r3597, 0;
- @%p323 bra $L__BB0_355;
-
- mov.f32 %f2735, 0fBF800000;
- fma.rn.f32 %f5294, %f5294, %f2735, %f2732;
-
-$L__BB0_355:
- add.f32 %f5330, %f5291, %f5294;
- mul.f32 %f2736, %f307, 0f3F22F983;
- cvt.rni.s32.f32 %r8317, %f2736;
- cvt.rn.f32.s32 %f2737, %r8317;
- mov.f32 %f2738, 0fBFC90FDA;
- fma.rn.f32 %f2739, %f2737, %f2738, %f307;
- mov.f32 %f2740, 0fB3A22168;
- fma.rn.f32 %f2741, %f2737, %f2740, %f2739;
- mov.f32 %f2742, 0fA7C234C5;
- fma.rn.f32 %f5295, %f2737, %f2742, %f2741;
- abs.f32 %f388, %f307;
- setp.ltu.f32 %p324, %f388, 0f47CE4780;
- @%p324 bra $L__BB0_363;
-
- setp.eq.f32 %p325, %f388, 0f7F800000;
- @%p325 bra $L__BB0_362;
- bra.uni $L__BB0_357;
-
-$L__BB0_362:
- mov.f32 %f2745, 0f00000000;
- mul.rn.f32 %f5295, %f307, %f2745;
- mov.u32 %r8317, 0;
- bra.uni $L__BB0_363;
-
-$L__BB0_357:
- mov.b32 %r459, %f307;
- shr.u32 %r3599, %r459, 23;
+ cvt.u64.u32 %rd978, %r3593;
+ cvt.u64.u32 %rd979, %r3595;
+ bfi.b64 %rd980, %rd978, %rd979, 32, 32;
+ cvt.rn.f64.s64 %fd47, %rd980;
+ mul.f64 %fd48, %fd47, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2820, %fd48;
+ setp.eq.s32 %p333, %r3591, 0;
+ neg.f32 %f2821, %f2820;
+ selp.f32 %f5366, %f2820, %f2821, %p333;
+
+$L__BB0_366:
+ add.s32 %r480, %r8062, 1;
+ and.b32 %r481, %r480, 1;
+ setp.eq.s32 %p334, %r481, 0;
+ selp.f32 %f412, %f5366, 0f3F800000, %p334;
+ mul.rn.f32 %f413, %f5366, %f5366;
+ mov.f32 %f5367, 0fB94D4153;
+ @%p334 bra $L__BB0_368;
+
+ mov.f32 %f2824, 0fBAB607ED;
+ mov.f32 %f2825, 0f37CBAC00;
+ fma.rn.f32 %f5367, %f2825, %f413, %f2824;
+
+$L__BB0_368:
+ selp.f32 %f2826, 0f3C0885E4, 0f3D2AAABB, %p334;
+ fma.rn.f32 %f2827, %f5367, %f413, %f2826;
+ selp.f32 %f2828, 0fBE2AAAA8, 0fBEFFFFFF, %p334;
+ fma.rn.f32 %f2829, %f2827, %f413, %f2828;
+ mov.f32 %f2830, 0f00000000;
+ fma.rn.f32 %f2831, %f413, %f412, %f2830;
+ fma.rn.f32 %f5368, %f2829, %f2831, %f412;
+ and.b32 %r3597, %r480, 2;
+ setp.eq.s32 %p336, %r3597, 0;
+ @%p336 bra $L__BB0_370;
+
+ mov.f32 %f2833, 0fBF800000;
+ fma.rn.f32 %f5368, %f5368, %f2833, %f2830;
+
+$L__BB0_370:
+ add.f32 %f5397, %f5365, %f5368;
+ mul.f32 %f2834, %f316, 0f3F22F983;
+ cvt.rni.s32.f32 %r8066, %f2834;
+ cvt.rn.f32.s32 %f2835, %r8066;
+ mov.f32 %f2836, 0fBFC90FDA;
+ fma.rn.f32 %f2837, %f2835, %f2836, %f316;
+ mov.f32 %f2838, 0fB3A22168;
+ fma.rn.f32 %f2839, %f2835, %f2838, %f2837;
+ mov.f32 %f2840, 0fA7C234C5;
+ fma.rn.f32 %f5369, %f2835, %f2840, %f2839;
+ abs.f32 %f421, %f316;
+ setp.ltu.f32 %p337, %f421, 0f47CE4780;
+ @%p337 bra $L__BB0_378;
+
+ setp.eq.f32 %p338, %f421, 0f7F800000;
+ @%p338 bra $L__BB0_377;
+ bra.uni $L__BB0_372;
+
+$L__BB0_377:
+ mov.f32 %f2843, 0f00000000;
+ mul.rn.f32 %f5369, %f316, %f2843;
+ mov.u32 %r8066, 0;
+ bra.uni $L__BB0_378;
+
+$L__BB0_372:
+ mov.b32 %r483, %f316;
+ shr.u32 %r3599, %r483, 23;
and.b32 %r3600, %r3599, 255;
- add.s32 %r460, %r3600, -128;
- shl.b32 %r3601, %r459, 8;
- or.b32 %r461, %r3601, -2147483648;
- shr.u32 %r462, %r460, 5;
- mov.u64 %rd2542, 0;
- mov.u32 %r8314, 0;
- mov.u64 %rd980, __cudart_i2opi_f;
- mov.u64 %rd2543, %rd2542;
-
-$L__BB0_358:
+ add.s32 %r484, %r3600, -128;
+ shl.b32 %r3601, %r483, 8;
+ or.b32 %r485, %r3601, -2147483648;
+ shr.u32 %r486, %r484, 5;
+ mov.u64 %rd2553, 0;
+ mov.u32 %r8063, 0;
+ mov.u64 %rd984, __cudart_i2opi_f;
+ mov.u64 %rd2554, %rd2553;
+
+$L__BB0_373:
.pragma "nounroll";
- shl.b64 %rd979, %rd2542, 2;
- add.s64 %rd981, %rd980, %rd979;
- ld.global.nc.u32 %r3602, [%rd981];
- mad.wide.u32 %rd982, %r3602, %r461, %rd2543;
- shr.u64 %rd2543, %rd982, 32;
- add.s64 %rd983, %rd1, %rd979;
- st.local.u32 [%rd983], %rd982;
- add.s32 %r8314, %r8314, 1;
- cvt.s64.s32 %rd2542, %r8314;
- setp.ne.s32 %p326, %r8314, 6;
- @%p326 bra $L__BB0_358;
-
- st.local.u32 [%rd5], %rd2543;
+ shl.b64 %rd983, %rd2553, 2;
+ add.s64 %rd985, %rd984, %rd983;
+ ld.global.nc.u32 %r3602, [%rd985];
+ mad.wide.u32 %rd986, %r3602, %r485, %rd2554;
+ shr.u64 %rd2554, %rd986, 32;
+ add.s64 %rd987, %rd1, %rd983;
+ st.local.u32 [%rd987], %rd986;
+ add.s32 %r8063, %r8063, 1;
+ cvt.s64.s32 %rd2553, %r8063;
+ setp.ne.s32 %p339, %r8063, 6;
+ @%p339 bra $L__BB0_373;
+
+ st.local.u32 [%rd4], %rd2554;
mov.u32 %r3603, 4;
- sub.s32 %r465, %r3603, %r462;
+ sub.s32 %r489, %r3603, %r486;
mov.u32 %r3604, 6;
- sub.s32 %r3605, %r3604, %r462;
- mul.wide.s32 %rd984, %r3605, 4;
- add.s64 %rd985, %rd1, %rd984;
- ld.local.u32 %r8315, [%rd985];
- ld.local.u32 %r8316, [%rd985+-4];
- and.b32 %r468, %r460, 31;
- setp.eq.s32 %p327, %r468, 0;
- @%p327 bra $L__BB0_361;
+ sub.s32 %r3605, %r3604, %r486;
+ mul.wide.s32 %rd988, %r3605, 4;
+ add.s64 %rd989, %rd1, %rd988;
+ ld.local.u32 %r8064, [%rd989];
+ ld.local.u32 %r8065, [%rd989+-4];
+ and.b32 %r492, %r484, 31;
+ setp.eq.s32 %p340, %r492, 0;
+ @%p340 bra $L__BB0_376;
mov.u32 %r3606, 32;
- sub.s32 %r3607, %r3606, %r468;
- shr.u32 %r3608, %r8316, %r3607;
- shl.b32 %r3609, %r8315, %r468;
- add.s32 %r8315, %r3608, %r3609;
- mul.wide.s32 %rd986, %r465, 4;
- add.s64 %rd987, %rd1, %rd986;
- ld.local.u32 %r3610, [%rd987];
+ sub.s32 %r3607, %r3606, %r492;
+ shr.u32 %r3608, %r8065, %r3607;
+ shl.b32 %r3609, %r8064, %r492;
+ add.s32 %r8064, %r3608, %r3609;
+ mul.wide.s32 %rd990, %r489, 4;
+ add.s64 %rd991, %rd1, %rd990;
+ ld.local.u32 %r3610, [%rd991];
shr.u32 %r3611, %r3610, %r3607;
- shl.b32 %r3612, %r8316, %r468;
- add.s32 %r8316, %r3611, %r3612;
-
-$L__BB0_361:
- and.b32 %r3613, %r459, -2147483648;
- shr.u32 %r3614, %r8316, 30;
- shl.b32 %r3615, %r8315, 2;
+ shl.b32 %r3612, %r8065, %r492;
+ add.s32 %r8065, %r3611, %r3612;
+
+$L__BB0_376:
+ and.b32 %r3613, %r483, -2147483648;
+ shr.u32 %r3614, %r8065, 30;
+ shl.b32 %r3615, %r8064, 2;
or.b32 %r3616, %r3614, %r3615;
shr.u32 %r3617, %r3616, 31;
- shr.u32 %r3618, %r8315, 30;
+ shr.u32 %r3618, %r8064, 30;
add.s32 %r3619, %r3617, %r3618;
neg.s32 %r3620, %r3619;
- setp.eq.s32 %p328, %r3613, 0;
- selp.b32 %r8317, %r3619, %r3620, %p328;
- setp.ne.s32 %p329, %r3617, 0;
+ setp.eq.s32 %p341, %r3613, 0;
+ selp.b32 %r8066, %r3619, %r3620, %p341;
+ setp.ne.s32 %p342, %r3617, 0;
xor.b32 %r3621, %r3613, -2147483648;
- selp.b32 %r3622, %r3621, %r3613, %p329;
- selp.b32 %r3623, -1, 0, %p329;
+ selp.b32 %r3622, %r3621, %r3613, %p342;
+ selp.b32 %r3623, -1, 0, %p342;
xor.b32 %r3624, %r3616, %r3623;
- shl.b32 %r3625, %r8316, 2;
+ shl.b32 %r3625, %r8065, 2;
xor.b32 %r3626, %r3625, %r3623;
- cvt.u64.u32 %rd988, %r3624;
- cvt.u64.u32 %rd989, %r3626;
- bfi.b64 %rd990, %rd988, %rd989, 32, 32;
- cvt.rn.f64.s64 %fd45, %rd990;
- mul.f64 %fd46, %fd45, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2743, %fd46;
- setp.eq.s32 %p330, %r3622, 0;
- neg.f32 %f2744, %f2743;
- selp.f32 %f5295, %f2743, %f2744, %p330;
-
-$L__BB0_363:
- and.b32 %r475, %r8317, 1;
- setp.eq.s32 %p331, %r475, 0;
- selp.f32 %f392, %f5295, 0f3F800000, %p331;
- mul.rn.f32 %f393, %f5295, %f5295;
- mov.f32 %f5296, 0fB94D4153;
- @%p331 bra $L__BB0_365;
-
- mov.f32 %f2747, 0fBAB607ED;
- mov.f32 %f2748, 0f37CBAC00;
- fma.rn.f32 %f5296, %f2748, %f393, %f2747;
-
-$L__BB0_365:
- selp.f32 %f2749, 0f3C0885E4, 0f3D2AAABB, %p331;
- fma.rn.f32 %f2750, %f5296, %f393, %f2749;
- selp.f32 %f2751, 0fBE2AAAA8, 0fBEFFFFFF, %p331;
- fma.rn.f32 %f2752, %f2750, %f393, %f2751;
- mov.f32 %f2753, 0f00000000;
- fma.rn.f32 %f2754, %f393, %f392, %f2753;
- fma.rn.f32 %f5297, %f2752, %f2754, %f392;
- and.b32 %r3628, %r8317, 2;
- setp.eq.s32 %p333, %r3628, 0;
- @%p333 bra $L__BB0_367;
-
- mov.f32 %f2756, 0fBF800000;
- fma.rn.f32 %f5297, %f5297, %f2756, %f2753;
-
-$L__BB0_367:
- mul.f32 %f2757, %f299, 0f3F22F983;
- cvt.rni.s32.f32 %r8321, %f2757;
- cvt.rn.f32.s32 %f2758, %r8321;
- mov.f32 %f2759, 0fBFC90FDA;
- fma.rn.f32 %f2760, %f2758, %f2759, %f299;
- mov.f32 %f2761, 0fB3A22168;
- fma.rn.f32 %f2762, %f2758, %f2761, %f2760;
- mov.f32 %f2763, 0fA7C234C5;
- fma.rn.f32 %f5298, %f2758, %f2763, %f2762;
- abs.f32 %f400, %f299;
- setp.ltu.f32 %p334, %f400, 0f47CE4780;
- @%p334 bra $L__BB0_375;
-
- setp.eq.f32 %p335, %f400, 0f7F800000;
- @%p335 bra $L__BB0_374;
- bra.uni $L__BB0_369;
-
-$L__BB0_374:
- mov.f32 %f2766, 0f00000000;
- mul.rn.f32 %f5298, %f299, %f2766;
- mov.u32 %r8321, 0;
- bra.uni $L__BB0_375;
-
-$L__BB0_369:
- mov.b32 %r477, %f299;
- shr.u32 %r3630, %r477, 23;
+ cvt.u64.u32 %rd992, %r3624;
+ cvt.u64.u32 %rd993, %r3626;
+ bfi.b64 %rd994, %rd992, %rd993, 32, 32;
+ cvt.rn.f64.s64 %fd49, %rd994;
+ mul.f64 %fd50, %fd49, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2841, %fd50;
+ setp.eq.s32 %p343, %r3622, 0;
+ neg.f32 %f2842, %f2841;
+ selp.f32 %f5369, %f2841, %f2842, %p343;
+
+$L__BB0_378:
+ and.b32 %r499, %r8066, 1;
+ setp.eq.s32 %p344, %r499, 0;
+ selp.f32 %f425, %f5369, 0f3F800000, %p344;
+ mul.rn.f32 %f426, %f5369, %f5369;
+ mov.f32 %f5370, 0fB94D4153;
+ @%p344 bra $L__BB0_380;
+
+ mov.f32 %f2845, 0fBAB607ED;
+ mov.f32 %f2846, 0f37CBAC00;
+ fma.rn.f32 %f5370, %f2846, %f426, %f2845;
+
+$L__BB0_380:
+ selp.f32 %f2847, 0f3C0885E4, 0f3D2AAABB, %p344;
+ fma.rn.f32 %f2848, %f5370, %f426, %f2847;
+ selp.f32 %f2849, 0fBE2AAAA8, 0fBEFFFFFF, %p344;
+ fma.rn.f32 %f2850, %f2848, %f426, %f2849;
+ mov.f32 %f2851, 0f00000000;
+ fma.rn.f32 %f2852, %f426, %f425, %f2851;
+ fma.rn.f32 %f5371, %f2850, %f2852, %f425;
+ and.b32 %r3628, %r8066, 2;
+ setp.eq.s32 %p346, %r3628, 0;
+ @%p346 bra $L__BB0_382;
+
+ mov.f32 %f2854, 0fBF800000;
+ fma.rn.f32 %f5371, %f5371, %f2854, %f2851;
+
+$L__BB0_382:
+ mul.f32 %f2855, %f308, 0f3F22F983;
+ cvt.rni.s32.f32 %r8070, %f2855;
+ cvt.rn.f32.s32 %f2856, %r8070;
+ mov.f32 %f2857, 0fBFC90FDA;
+ fma.rn.f32 %f2858, %f2856, %f2857, %f308;
+ mov.f32 %f2859, 0fB3A22168;
+ fma.rn.f32 %f2860, %f2856, %f2859, %f2858;
+ mov.f32 %f2861, 0fA7C234C5;
+ fma.rn.f32 %f5372, %f2856, %f2861, %f2860;
+ abs.f32 %f433, %f308;
+ setp.ltu.f32 %p347, %f433, 0f47CE4780;
+ @%p347 bra $L__BB0_390;
+
+ setp.eq.f32 %p348, %f433, 0f7F800000;
+ @%p348 bra $L__BB0_389;
+ bra.uni $L__BB0_384;
+
+$L__BB0_389:
+ mov.f32 %f2864, 0f00000000;
+ mul.rn.f32 %f5372, %f308, %f2864;
+ mov.u32 %r8070, 0;
+ bra.uni $L__BB0_390;
+
+$L__BB0_384:
+ mov.b32 %r501, %f308;
+ shr.u32 %r3630, %r501, 23;
and.b32 %r3631, %r3630, 255;
- add.s32 %r478, %r3631, -128;
- shl.b32 %r3632, %r477, 8;
- or.b32 %r479, %r3632, -2147483648;
- shr.u32 %r480, %r478, 5;
- mov.u64 %rd2544, 0;
- mov.u32 %r8318, 0;
- mov.u64 %rd994, __cudart_i2opi_f;
- mov.u64 %rd2545, %rd2544;
-
-$L__BB0_370:
+ add.s32 %r502, %r3631, -128;
+ shl.b32 %r3632, %r501, 8;
+ or.b32 %r503, %r3632, -2147483648;
+ shr.u32 %r504, %r502, 5;
+ mov.u64 %rd2555, 0;
+ mov.u32 %r8067, 0;
+ mov.u64 %rd998, __cudart_i2opi_f;
+ mov.u64 %rd2556, %rd2555;
+
+$L__BB0_385:
.pragma "nounroll";
- shl.b64 %rd993, %rd2544, 2;
- add.s64 %rd995, %rd994, %rd993;
- ld.global.nc.u32 %r3633, [%rd995];
- mad.wide.u32 %rd996, %r3633, %r479, %rd2545;
- shr.u64 %rd2545, %rd996, 32;
- add.s64 %rd997, %rd1, %rd993;
- st.local.u32 [%rd997], %rd996;
- add.s32 %r8318, %r8318, 1;
- cvt.s64.s32 %rd2544, %r8318;
- setp.ne.s32 %p336, %r8318, 6;
- @%p336 bra $L__BB0_370;
-
- st.local.u32 [%rd5], %rd2545;
+ shl.b64 %rd997, %rd2555, 2;
+ add.s64 %rd999, %rd998, %rd997;
+ ld.global.nc.u32 %r3633, [%rd999];
+ mad.wide.u32 %rd1000, %r3633, %r503, %rd2556;
+ shr.u64 %rd2556, %rd1000, 32;
+ add.s64 %rd1001, %rd1, %rd997;
+ st.local.u32 [%rd1001], %rd1000;
+ add.s32 %r8067, %r8067, 1;
+ cvt.s64.s32 %rd2555, %r8067;
+ setp.ne.s32 %p349, %r8067, 6;
+ @%p349 bra $L__BB0_385;
+
+ st.local.u32 [%rd4], %rd2556;
mov.u32 %r3634, 4;
- sub.s32 %r483, %r3634, %r480;
+ sub.s32 %r507, %r3634, %r504;
mov.u32 %r3635, 6;
- sub.s32 %r3636, %r3635, %r480;
- mul.wide.s32 %rd998, %r3636, 4;
- add.s64 %rd999, %rd1, %rd998;
- ld.local.u32 %r8319, [%rd999];
- ld.local.u32 %r8320, [%rd999+-4];
- and.b32 %r486, %r478, 31;
- setp.eq.s32 %p337, %r486, 0;
- @%p337 bra $L__BB0_373;
+ sub.s32 %r3636, %r3635, %r504;
+ mul.wide.s32 %rd1002, %r3636, 4;
+ add.s64 %rd1003, %rd1, %rd1002;
+ ld.local.u32 %r8068, [%rd1003];
+ ld.local.u32 %r8069, [%rd1003+-4];
+ and.b32 %r510, %r502, 31;
+ setp.eq.s32 %p350, %r510, 0;
+ @%p350 bra $L__BB0_388;
mov.u32 %r3637, 32;
- sub.s32 %r3638, %r3637, %r486;
- shr.u32 %r3639, %r8320, %r3638;
- shl.b32 %r3640, %r8319, %r486;
- add.s32 %r8319, %r3639, %r3640;
- mul.wide.s32 %rd1000, %r483, 4;
- add.s64 %rd1001, %rd1, %rd1000;
- ld.local.u32 %r3641, [%rd1001];
+ sub.s32 %r3638, %r3637, %r510;
+ shr.u32 %r3639, %r8069, %r3638;
+ shl.b32 %r3640, %r8068, %r510;
+ add.s32 %r8068, %r3639, %r3640;
+ mul.wide.s32 %rd1004, %r507, 4;
+ add.s64 %rd1005, %rd1, %rd1004;
+ ld.local.u32 %r3641, [%rd1005];
shr.u32 %r3642, %r3641, %r3638;
- shl.b32 %r3643, %r8320, %r486;
- add.s32 %r8320, %r3642, %r3643;
-
-$L__BB0_373:
- and.b32 %r3644, %r477, -2147483648;
- shr.u32 %r3645, %r8320, 30;
- shl.b32 %r3646, %r8319, 2;
+ shl.b32 %r3643, %r8069, %r510;
+ add.s32 %r8069, %r3642, %r3643;
+
+$L__BB0_388:
+ and.b32 %r3644, %r501, -2147483648;
+ shr.u32 %r3645, %r8069, 30;
+ shl.b32 %r3646, %r8068, 2;
or.b32 %r3647, %r3645, %r3646;
shr.u32 %r3648, %r3647, 31;
- shr.u32 %r3649, %r8319, 30;
+ shr.u32 %r3649, %r8068, 30;
add.s32 %r3650, %r3648, %r3649;
neg.s32 %r3651, %r3650;
- setp.eq.s32 %p338, %r3644, 0;
- selp.b32 %r8321, %r3650, %r3651, %p338;
- setp.ne.s32 %p339, %r3648, 0;
+ setp.eq.s32 %p351, %r3644, 0;
+ selp.b32 %r8070, %r3650, %r3651, %p351;
+ setp.ne.s32 %p352, %r3648, 0;
xor.b32 %r3652, %r3644, -2147483648;
- selp.b32 %r3653, %r3652, %r3644, %p339;
- selp.b32 %r3654, -1, 0, %p339;
+ selp.b32 %r3653, %r3652, %r3644, %p352;
+ selp.b32 %r3654, -1, 0, %p352;
xor.b32 %r3655, %r3647, %r3654;
- shl.b32 %r3656, %r8320, 2;
+ shl.b32 %r3656, %r8069, 2;
xor.b32 %r3657, %r3656, %r3654;
- cvt.u64.u32 %rd1002, %r3655;
- cvt.u64.u32 %rd1003, %r3657;
- bfi.b64 %rd1004, %rd1002, %rd1003, 32, 32;
- cvt.rn.f64.s64 %fd47, %rd1004;
- mul.f64 %fd48, %fd47, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2764, %fd48;
- setp.eq.s32 %p340, %r3653, 0;
- neg.f32 %f2765, %f2764;
- selp.f32 %f5298, %f2764, %f2765, %p340;
-
-$L__BB0_375:
- add.s32 %r493, %r8321, 1;
- and.b32 %r494, %r493, 1;
- setp.eq.s32 %p341, %r494, 0;
- selp.f32 %f404, %f5298, 0f3F800000, %p341;
- mul.rn.f32 %f405, %f5298, %f5298;
- mov.f32 %f5299, 0fB94D4153;
- @%p341 bra $L__BB0_377;
-
- mov.f32 %f2768, 0fBAB607ED;
- mov.f32 %f2769, 0f37CBAC00;
- fma.rn.f32 %f5299, %f2769, %f405, %f2768;
-
-$L__BB0_377:
- selp.f32 %f2770, 0f3C0885E4, 0f3D2AAABB, %p341;
- fma.rn.f32 %f2771, %f5299, %f405, %f2770;
- selp.f32 %f2772, 0fBE2AAAA8, 0fBEFFFFFF, %p341;
- fma.rn.f32 %f2773, %f2771, %f405, %f2772;
- mov.f32 %f2774, 0f00000000;
- fma.rn.f32 %f2775, %f405, %f404, %f2774;
- fma.rn.f32 %f5300, %f2773, %f2775, %f404;
- and.b32 %r3659, %r493, 2;
- setp.eq.s32 %p343, %r3659, 0;
- @%p343 bra $L__BB0_379;
-
- mov.f32 %f2777, 0fBF800000;
- fma.rn.f32 %f5300, %f5300, %f2777, %f2774;
-
-$L__BB0_379:
- add.f32 %f5329, %f5297, %f5300;
- mul.f32 %f2778, %f308, 0f3F22F983;
- cvt.rni.s32.f32 %r8325, %f2778;
- cvt.rn.f32.s32 %f2779, %r8325;
- mov.f32 %f2780, 0fBFC90FDA;
- fma.rn.f32 %f2781, %f2779, %f2780, %f308;
- mov.f32 %f2782, 0fB3A22168;
- fma.rn.f32 %f2783, %f2779, %f2782, %f2781;
- mov.f32 %f2784, 0fA7C234C5;
- fma.rn.f32 %f5301, %f2779, %f2784, %f2783;
- abs.f32 %f413, %f308;
- setp.ltu.f32 %p344, %f413, 0f47CE4780;
- @%p344 bra $L__BB0_387;
-
- setp.eq.f32 %p345, %f413, 0f7F800000;
- @%p345 bra $L__BB0_386;
- bra.uni $L__BB0_381;
-
-$L__BB0_386:
- mov.f32 %f2787, 0f00000000;
- mul.rn.f32 %f5301, %f308, %f2787;
- mov.u32 %r8325, 0;
- bra.uni $L__BB0_387;
-
-$L__BB0_381:
- mov.b32 %r496, %f308;
- shr.u32 %r3661, %r496, 23;
+ cvt.u64.u32 %rd1006, %r3655;
+ cvt.u64.u32 %rd1007, %r3657;
+ bfi.b64 %rd1008, %rd1006, %rd1007, 32, 32;
+ cvt.rn.f64.s64 %fd51, %rd1008;
+ mul.f64 %fd52, %fd51, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2862, %fd52;
+ setp.eq.s32 %p353, %r3653, 0;
+ neg.f32 %f2863, %f2862;
+ selp.f32 %f5372, %f2862, %f2863, %p353;
+
+$L__BB0_390:
+ add.s32 %r517, %r8070, 1;
+ and.b32 %r518, %r517, 1;
+ setp.eq.s32 %p354, %r518, 0;
+ selp.f32 %f437, %f5372, 0f3F800000, %p354;
+ mul.rn.f32 %f438, %f5372, %f5372;
+ mov.f32 %f5373, 0fB94D4153;
+ @%p354 bra $L__BB0_392;
+
+ mov.f32 %f2866, 0fBAB607ED;
+ mov.f32 %f2867, 0f37CBAC00;
+ fma.rn.f32 %f5373, %f2867, %f438, %f2866;
+
+$L__BB0_392:
+ selp.f32 %f2868, 0f3C0885E4, 0f3D2AAABB, %p354;
+ fma.rn.f32 %f2869, %f5373, %f438, %f2868;
+ selp.f32 %f2870, 0fBE2AAAA8, 0fBEFFFFFF, %p354;
+ fma.rn.f32 %f2871, %f2869, %f438, %f2870;
+ mov.f32 %f2872, 0f00000000;
+ fma.rn.f32 %f2873, %f438, %f437, %f2872;
+ fma.rn.f32 %f5374, %f2871, %f2873, %f437;
+ and.b32 %r3659, %r517, 2;
+ setp.eq.s32 %p356, %r3659, 0;
+ @%p356 bra $L__BB0_394;
+
+ mov.f32 %f2875, 0fBF800000;
+ fma.rn.f32 %f5374, %f5374, %f2875, %f2872;
+
+$L__BB0_394:
+ add.f32 %f5396, %f5371, %f5374;
+ mul.f32 %f2876, %f317, 0f3F22F983;
+ cvt.rni.s32.f32 %r8074, %f2876;
+ cvt.rn.f32.s32 %f2877, %r8074;
+ mov.f32 %f2878, 0fBFC90FDA;
+ fma.rn.f32 %f2879, %f2877, %f2878, %f317;
+ mov.f32 %f2880, 0fB3A22168;
+ fma.rn.f32 %f2881, %f2877, %f2880, %f2879;
+ mov.f32 %f2882, 0fA7C234C5;
+ fma.rn.f32 %f5375, %f2877, %f2882, %f2881;
+ abs.f32 %f446, %f317;
+ setp.ltu.f32 %p357, %f446, 0f47CE4780;
+ @%p357 bra $L__BB0_402;
+
+ setp.eq.f32 %p358, %f446, 0f7F800000;
+ @%p358 bra $L__BB0_401;
+ bra.uni $L__BB0_396;
+
+$L__BB0_401:
+ mov.f32 %f2885, 0f00000000;
+ mul.rn.f32 %f5375, %f317, %f2885;
+ mov.u32 %r8074, 0;
+ bra.uni $L__BB0_402;
+
+$L__BB0_396:
+ mov.b32 %r520, %f317;
+ shr.u32 %r3661, %r520, 23;
and.b32 %r3662, %r3661, 255;
- add.s32 %r497, %r3662, -128;
- shl.b32 %r3663, %r496, 8;
- or.b32 %r498, %r3663, -2147483648;
- shr.u32 %r499, %r497, 5;
- mov.u64 %rd2546, 0;
- mov.u32 %r8322, 0;
- mov.u64 %rd1008, __cudart_i2opi_f;
- mov.u64 %rd2547, %rd2546;
-
-$L__BB0_382:
+ add.s32 %r521, %r3662, -128;
+ shl.b32 %r3663, %r520, 8;
+ or.b32 %r522, %r3663, -2147483648;
+ shr.u32 %r523, %r521, 5;
+ mov.u64 %rd2557, 0;
+ mov.u32 %r8071, 0;
+ mov.u64 %rd1012, __cudart_i2opi_f;
+ mov.u64 %rd2558, %rd2557;
+
+$L__BB0_397:
.pragma "nounroll";
- shl.b64 %rd1007, %rd2546, 2;
- add.s64 %rd1009, %rd1008, %rd1007;
- ld.global.nc.u32 %r3664, [%rd1009];
- mad.wide.u32 %rd1010, %r3664, %r498, %rd2547;
- shr.u64 %rd2547, %rd1010, 32;
- add.s64 %rd1011, %rd1, %rd1007;
- st.local.u32 [%rd1011], %rd1010;
- add.s32 %r8322, %r8322, 1;
- cvt.s64.s32 %rd2546, %r8322;
- setp.ne.s32 %p346, %r8322, 6;
- @%p346 bra $L__BB0_382;
-
- st.local.u32 [%rd5], %rd2547;
+ shl.b64 %rd1011, %rd2557, 2;
+ add.s64 %rd1013, %rd1012, %rd1011;
+ ld.global.nc.u32 %r3664, [%rd1013];
+ mad.wide.u32 %rd1014, %r3664, %r522, %rd2558;
+ shr.u64 %rd2558, %rd1014, 32;
+ add.s64 %rd1015, %rd1, %rd1011;
+ st.local.u32 [%rd1015], %rd1014;
+ add.s32 %r8071, %r8071, 1;
+ cvt.s64.s32 %rd2557, %r8071;
+ setp.ne.s32 %p359, %r8071, 6;
+ @%p359 bra $L__BB0_397;
+
+ st.local.u32 [%rd4], %rd2558;
mov.u32 %r3665, 4;
- sub.s32 %r502, %r3665, %r499;
+ sub.s32 %r526, %r3665, %r523;
mov.u32 %r3666, 6;
- sub.s32 %r3667, %r3666, %r499;
- mul.wide.s32 %rd1012, %r3667, 4;
- add.s64 %rd1013, %rd1, %rd1012;
- ld.local.u32 %r8323, [%rd1013];
- ld.local.u32 %r8324, [%rd1013+-4];
- and.b32 %r505, %r497, 31;
- setp.eq.s32 %p347, %r505, 0;
- @%p347 bra $L__BB0_385;
+ sub.s32 %r3667, %r3666, %r523;
+ mul.wide.s32 %rd1016, %r3667, 4;
+ add.s64 %rd1017, %rd1, %rd1016;
+ ld.local.u32 %r8072, [%rd1017];
+ ld.local.u32 %r8073, [%rd1017+-4];
+ and.b32 %r529, %r521, 31;
+ setp.eq.s32 %p360, %r529, 0;
+ @%p360 bra $L__BB0_400;
mov.u32 %r3668, 32;
- sub.s32 %r3669, %r3668, %r505;
- shr.u32 %r3670, %r8324, %r3669;
- shl.b32 %r3671, %r8323, %r505;
- add.s32 %r8323, %r3670, %r3671;
- mul.wide.s32 %rd1014, %r502, 4;
- add.s64 %rd1015, %rd1, %rd1014;
- ld.local.u32 %r3672, [%rd1015];
+ sub.s32 %r3669, %r3668, %r529;
+ shr.u32 %r3670, %r8073, %r3669;
+ shl.b32 %r3671, %r8072, %r529;
+ add.s32 %r8072, %r3670, %r3671;
+ mul.wide.s32 %rd1018, %r526, 4;
+ add.s64 %rd1019, %rd1, %rd1018;
+ ld.local.u32 %r3672, [%rd1019];
shr.u32 %r3673, %r3672, %r3669;
- shl.b32 %r3674, %r8324, %r505;
- add.s32 %r8324, %r3673, %r3674;
-
-$L__BB0_385:
- and.b32 %r3675, %r496, -2147483648;
- shr.u32 %r3676, %r8324, 30;
- shl.b32 %r3677, %r8323, 2;
+ shl.b32 %r3674, %r8073, %r529;
+ add.s32 %r8073, %r3673, %r3674;
+
+$L__BB0_400:
+ and.b32 %r3675, %r520, -2147483648;
+ shr.u32 %r3676, %r8073, 30;
+ shl.b32 %r3677, %r8072, 2;
or.b32 %r3678, %r3676, %r3677;
shr.u32 %r3679, %r3678, 31;
- shr.u32 %r3680, %r8323, 30;
+ shr.u32 %r3680, %r8072, 30;
add.s32 %r3681, %r3679, %r3680;
neg.s32 %r3682, %r3681;
- setp.eq.s32 %p348, %r3675, 0;
- selp.b32 %r8325, %r3681, %r3682, %p348;
- setp.ne.s32 %p349, %r3679, 0;
+ setp.eq.s32 %p361, %r3675, 0;
+ selp.b32 %r8074, %r3681, %r3682, %p361;
+ setp.ne.s32 %p362, %r3679, 0;
xor.b32 %r3683, %r3675, -2147483648;
- selp.b32 %r3684, %r3683, %r3675, %p349;
- selp.b32 %r3685, -1, 0, %p349;
+ selp.b32 %r3684, %r3683, %r3675, %p362;
+ selp.b32 %r3685, -1, 0, %p362;
xor.b32 %r3686, %r3678, %r3685;
- shl.b32 %r3687, %r8324, 2;
+ shl.b32 %r3687, %r8073, 2;
xor.b32 %r3688, %r3687, %r3685;
- cvt.u64.u32 %rd1016, %r3686;
- cvt.u64.u32 %rd1017, %r3688;
- bfi.b64 %rd1018, %rd1016, %rd1017, 32, 32;
- cvt.rn.f64.s64 %fd49, %rd1018;
- mul.f64 %fd50, %fd49, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2785, %fd50;
- setp.eq.s32 %p350, %r3684, 0;
- neg.f32 %f2786, %f2785;
- selp.f32 %f5301, %f2785, %f2786, %p350;
-
-$L__BB0_387:
- and.b32 %r512, %r8325, 1;
- setp.eq.s32 %p351, %r512, 0;
- selp.f32 %f417, %f5301, 0f3F800000, %p351;
- mul.rn.f32 %f418, %f5301, %f5301;
- mov.f32 %f5302, 0fB94D4153;
- @%p351 bra $L__BB0_389;
-
- mov.f32 %f2789, 0fBAB607ED;
- mov.f32 %f2790, 0f37CBAC00;
- fma.rn.f32 %f5302, %f2790, %f418, %f2789;
-
-$L__BB0_389:
- selp.f32 %f2791, 0f3C0885E4, 0f3D2AAABB, %p351;
- fma.rn.f32 %f2792, %f5302, %f418, %f2791;
- selp.f32 %f2793, 0fBE2AAAA8, 0fBEFFFFFF, %p351;
- fma.rn.f32 %f2794, %f2792, %f418, %f2793;
- mov.f32 %f2795, 0f00000000;
- fma.rn.f32 %f2796, %f418, %f417, %f2795;
- fma.rn.f32 %f5303, %f2794, %f2796, %f417;
- and.b32 %r3690, %r8325, 2;
- setp.eq.s32 %p353, %r3690, 0;
- @%p353 bra $L__BB0_391;
-
- mov.f32 %f2798, 0fBF800000;
- fma.rn.f32 %f5303, %f5303, %f2798, %f2795;
-
-$L__BB0_391:
- mul.f32 %f2799, %f300, 0f3F22F983;
- cvt.rni.s32.f32 %r8329, %f2799;
- cvt.rn.f32.s32 %f2800, %r8329;
- mov.f32 %f2801, 0fBFC90FDA;
- fma.rn.f32 %f2802, %f2800, %f2801, %f300;
- mov.f32 %f2803, 0fB3A22168;
- fma.rn.f32 %f2804, %f2800, %f2803, %f2802;
- mov.f32 %f2805, 0fA7C234C5;
- fma.rn.f32 %f5304, %f2800, %f2805, %f2804;
- abs.f32 %f425, %f300;
- setp.ltu.f32 %p354, %f425, 0f47CE4780;
- @%p354 bra $L__BB0_399;
-
- setp.eq.f32 %p355, %f425, 0f7F800000;
- @%p355 bra $L__BB0_398;
- bra.uni $L__BB0_393;
-
-$L__BB0_398:
- mov.f32 %f2808, 0f00000000;
- mul.rn.f32 %f5304, %f300, %f2808;
- mov.u32 %r8329, 0;
- bra.uni $L__BB0_399;
-
-$L__BB0_393:
- mov.b32 %r514, %f300;
- shr.u32 %r3692, %r514, 23;
+ cvt.u64.u32 %rd1020, %r3686;
+ cvt.u64.u32 %rd1021, %r3688;
+ bfi.b64 %rd1022, %rd1020, %rd1021, 32, 32;
+ cvt.rn.f64.s64 %fd53, %rd1022;
+ mul.f64 %fd54, %fd53, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2883, %fd54;
+ setp.eq.s32 %p363, %r3684, 0;
+ neg.f32 %f2884, %f2883;
+ selp.f32 %f5375, %f2883, %f2884, %p363;
+
+$L__BB0_402:
+ and.b32 %r536, %r8074, 1;
+ setp.eq.s32 %p364, %r536, 0;
+ selp.f32 %f450, %f5375, 0f3F800000, %p364;
+ mul.rn.f32 %f451, %f5375, %f5375;
+ mov.f32 %f5376, 0fB94D4153;
+ @%p364 bra $L__BB0_404;
+
+ mov.f32 %f2887, 0fBAB607ED;
+ mov.f32 %f2888, 0f37CBAC00;
+ fma.rn.f32 %f5376, %f2888, %f451, %f2887;
+
+$L__BB0_404:
+ selp.f32 %f2889, 0f3C0885E4, 0f3D2AAABB, %p364;
+ fma.rn.f32 %f2890, %f5376, %f451, %f2889;
+ selp.f32 %f2891, 0fBE2AAAA8, 0fBEFFFFFF, %p364;
+ fma.rn.f32 %f2892, %f2890, %f451, %f2891;
+ mov.f32 %f2893, 0f00000000;
+ fma.rn.f32 %f2894, %f451, %f450, %f2893;
+ fma.rn.f32 %f5377, %f2892, %f2894, %f450;
+ and.b32 %r3690, %r8074, 2;
+ setp.eq.s32 %p366, %r3690, 0;
+ @%p366 bra $L__BB0_406;
+
+ mov.f32 %f2896, 0fBF800000;
+ fma.rn.f32 %f5377, %f5377, %f2896, %f2893;
+
+$L__BB0_406:
+ mul.f32 %f2897, %f309, 0f3F22F983;
+ cvt.rni.s32.f32 %r8078, %f2897;
+ cvt.rn.f32.s32 %f2898, %r8078;
+ mov.f32 %f2899, 0fBFC90FDA;
+ fma.rn.f32 %f2900, %f2898, %f2899, %f309;
+ mov.f32 %f2901, 0fB3A22168;
+ fma.rn.f32 %f2902, %f2898, %f2901, %f2900;
+ mov.f32 %f2903, 0fA7C234C5;
+ fma.rn.f32 %f5378, %f2898, %f2903, %f2902;
+ abs.f32 %f458, %f309;
+ setp.ltu.f32 %p367, %f458, 0f47CE4780;
+ @%p367 bra $L__BB0_414;
+
+ setp.eq.f32 %p368, %f458, 0f7F800000;
+ @%p368 bra $L__BB0_413;
+ bra.uni $L__BB0_408;
+
+$L__BB0_413:
+ mov.f32 %f2906, 0f00000000;
+ mul.rn.f32 %f5378, %f309, %f2906;
+ mov.u32 %r8078, 0;
+ bra.uni $L__BB0_414;
+
+$L__BB0_408:
+ mov.b32 %r538, %f309;
+ shr.u32 %r3692, %r538, 23;
and.b32 %r3693, %r3692, 255;
- add.s32 %r515, %r3693, -128;
- shl.b32 %r3694, %r514, 8;
- or.b32 %r516, %r3694, -2147483648;
- shr.u32 %r517, %r515, 5;
- mov.u64 %rd2548, 0;
- mov.u32 %r8326, 0;
- mov.u64 %rd1022, __cudart_i2opi_f;
- mov.u64 %rd2549, %rd2548;
-
-$L__BB0_394:
+ add.s32 %r539, %r3693, -128;
+ shl.b32 %r3694, %r538, 8;
+ or.b32 %r540, %r3694, -2147483648;
+ shr.u32 %r541, %r539, 5;
+ mov.u64 %rd2559, 0;
+ mov.u32 %r8075, 0;
+ mov.u64 %rd1026, __cudart_i2opi_f;
+ mov.u64 %rd2560, %rd2559;
+
+$L__BB0_409:
.pragma "nounroll";
- shl.b64 %rd1021, %rd2548, 2;
- add.s64 %rd1023, %rd1022, %rd1021;
- ld.global.nc.u32 %r3695, [%rd1023];
- mad.wide.u32 %rd1024, %r3695, %r516, %rd2549;
- shr.u64 %rd2549, %rd1024, 32;
- add.s64 %rd1025, %rd1, %rd1021;
- st.local.u32 [%rd1025], %rd1024;
- add.s32 %r8326, %r8326, 1;
- cvt.s64.s32 %rd2548, %r8326;
- setp.ne.s32 %p356, %r8326, 6;
- @%p356 bra $L__BB0_394;
-
- st.local.u32 [%rd5], %rd2549;
+ shl.b64 %rd1025, %rd2559, 2;
+ add.s64 %rd1027, %rd1026, %rd1025;
+ ld.global.nc.u32 %r3695, [%rd1027];
+ mad.wide.u32 %rd1028, %r3695, %r540, %rd2560;
+ shr.u64 %rd2560, %rd1028, 32;
+ add.s64 %rd1029, %rd1, %rd1025;
+ st.local.u32 [%rd1029], %rd1028;
+ add.s32 %r8075, %r8075, 1;
+ cvt.s64.s32 %rd2559, %r8075;
+ setp.ne.s32 %p369, %r8075, 6;
+ @%p369 bra $L__BB0_409;
+
+ st.local.u32 [%rd4], %rd2560;
mov.u32 %r3696, 4;
- sub.s32 %r520, %r3696, %r517;
+ sub.s32 %r544, %r3696, %r541;
mov.u32 %r3697, 6;
- sub.s32 %r3698, %r3697, %r517;
- mul.wide.s32 %rd1026, %r3698, 4;
- add.s64 %rd1027, %rd1, %rd1026;
- ld.local.u32 %r8327, [%rd1027];
- ld.local.u32 %r8328, [%rd1027+-4];
- and.b32 %r523, %r515, 31;
- setp.eq.s32 %p357, %r523, 0;
- @%p357 bra $L__BB0_397;
+ sub.s32 %r3698, %r3697, %r541;
+ mul.wide.s32 %rd1030, %r3698, 4;
+ add.s64 %rd1031, %rd1, %rd1030;
+ ld.local.u32 %r8076, [%rd1031];
+ ld.local.u32 %r8077, [%rd1031+-4];
+ and.b32 %r547, %r539, 31;
+ setp.eq.s32 %p370, %r547, 0;
+ @%p370 bra $L__BB0_412;
mov.u32 %r3699, 32;
- sub.s32 %r3700, %r3699, %r523;
- shr.u32 %r3701, %r8328, %r3700;
- shl.b32 %r3702, %r8327, %r523;
- add.s32 %r8327, %r3701, %r3702;
- mul.wide.s32 %rd1028, %r520, 4;
- add.s64 %rd1029, %rd1, %rd1028;
- ld.local.u32 %r3703, [%rd1029];
+ sub.s32 %r3700, %r3699, %r547;
+ shr.u32 %r3701, %r8077, %r3700;
+ shl.b32 %r3702, %r8076, %r547;
+ add.s32 %r8076, %r3701, %r3702;
+ mul.wide.s32 %rd1032, %r544, 4;
+ add.s64 %rd1033, %rd1, %rd1032;
+ ld.local.u32 %r3703, [%rd1033];
shr.u32 %r3704, %r3703, %r3700;
- shl.b32 %r3705, %r8328, %r523;
- add.s32 %r8328, %r3704, %r3705;
-
-$L__BB0_397:
- and.b32 %r3706, %r514, -2147483648;
- shr.u32 %r3707, %r8328, 30;
- shl.b32 %r3708, %r8327, 2;
+ shl.b32 %r3705, %r8077, %r547;
+ add.s32 %r8077, %r3704, %r3705;
+
+$L__BB0_412:
+ and.b32 %r3706, %r538, -2147483648;
+ shr.u32 %r3707, %r8077, 30;
+ shl.b32 %r3708, %r8076, 2;
or.b32 %r3709, %r3707, %r3708;
shr.u32 %r3710, %r3709, 31;
- shr.u32 %r3711, %r8327, 30;
+ shr.u32 %r3711, %r8076, 30;
add.s32 %r3712, %r3710, %r3711;
neg.s32 %r3713, %r3712;
- setp.eq.s32 %p358, %r3706, 0;
- selp.b32 %r8329, %r3712, %r3713, %p358;
- setp.ne.s32 %p359, %r3710, 0;
+ setp.eq.s32 %p371, %r3706, 0;
+ selp.b32 %r8078, %r3712, %r3713, %p371;
+ setp.ne.s32 %p372, %r3710, 0;
xor.b32 %r3714, %r3706, -2147483648;
- selp.b32 %r3715, %r3714, %r3706, %p359;
- selp.b32 %r3716, -1, 0, %p359;
+ selp.b32 %r3715, %r3714, %r3706, %p372;
+ selp.b32 %r3716, -1, 0, %p372;
xor.b32 %r3717, %r3709, %r3716;
- shl.b32 %r3718, %r8328, 2;
+ shl.b32 %r3718, %r8077, 2;
xor.b32 %r3719, %r3718, %r3716;
- cvt.u64.u32 %rd1030, %r3717;
- cvt.u64.u32 %rd1031, %r3719;
- bfi.b64 %rd1032, %rd1030, %rd1031, 32, 32;
- cvt.rn.f64.s64 %fd51, %rd1032;
- mul.f64 %fd52, %fd51, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2806, %fd52;
- setp.eq.s32 %p360, %r3715, 0;
- neg.f32 %f2807, %f2806;
- selp.f32 %f5304, %f2806, %f2807, %p360;
-
-$L__BB0_399:
- add.s32 %r530, %r8329, 1;
- and.b32 %r531, %r530, 1;
- setp.eq.s32 %p361, %r531, 0;
- selp.f32 %f429, %f5304, 0f3F800000, %p361;
- mul.rn.f32 %f430, %f5304, %f5304;
- mov.f32 %f5305, 0fB94D4153;
- @%p361 bra $L__BB0_401;
-
- mov.f32 %f2810, 0fBAB607ED;
- mov.f32 %f2811, 0f37CBAC00;
- fma.rn.f32 %f5305, %f2811, %f430, %f2810;
-
-$L__BB0_401:
- selp.f32 %f2812, 0f3C0885E4, 0f3D2AAABB, %p361;
- fma.rn.f32 %f2813, %f5305, %f430, %f2812;
- selp.f32 %f2814, 0fBE2AAAA8, 0fBEFFFFFF, %p361;
- fma.rn.f32 %f2815, %f2813, %f430, %f2814;
- mov.f32 %f2816, 0f00000000;
- fma.rn.f32 %f2817, %f430, %f429, %f2816;
- fma.rn.f32 %f5306, %f2815, %f2817, %f429;
- and.b32 %r3721, %r530, 2;
- setp.eq.s32 %p363, %r3721, 0;
- @%p363 bra $L__BB0_403;
-
- mov.f32 %f2819, 0fBF800000;
- fma.rn.f32 %f5306, %f5306, %f2819, %f2816;
-
-$L__BB0_403:
- add.f32 %f5328, %f5303, %f5306;
- mul.f32 %f2820, %f309, 0f3F22F983;
- cvt.rni.s32.f32 %r8333, %f2820;
- cvt.rn.f32.s32 %f2821, %r8333;
- mov.f32 %f2822, 0fBFC90FDA;
- fma.rn.f32 %f2823, %f2821, %f2822, %f309;
- mov.f32 %f2824, 0fB3A22168;
- fma.rn.f32 %f2825, %f2821, %f2824, %f2823;
- mov.f32 %f2826, 0fA7C234C5;
- fma.rn.f32 %f5307, %f2821, %f2826, %f2825;
- abs.f32 %f438, %f309;
- setp.ltu.f32 %p364, %f438, 0f47CE4780;
- @%p364 bra $L__BB0_411;
-
- setp.eq.f32 %p365, %f438, 0f7F800000;
- @%p365 bra $L__BB0_410;
- bra.uni $L__BB0_405;
-
-$L__BB0_410:
- mov.f32 %f2829, 0f00000000;
- mul.rn.f32 %f5307, %f309, %f2829;
- mov.u32 %r8333, 0;
- bra.uni $L__BB0_411;
-
-$L__BB0_405:
- mov.b32 %r533, %f309;
- shr.u32 %r3723, %r533, 23;
+ cvt.u64.u32 %rd1034, %r3717;
+ cvt.u64.u32 %rd1035, %r3719;
+ bfi.b64 %rd1036, %rd1034, %rd1035, 32, 32;
+ cvt.rn.f64.s64 %fd55, %rd1036;
+ mul.f64 %fd56, %fd55, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2904, %fd56;
+ setp.eq.s32 %p373, %r3715, 0;
+ neg.f32 %f2905, %f2904;
+ selp.f32 %f5378, %f2904, %f2905, %p373;
+
+$L__BB0_414:
+ add.s32 %r554, %r8078, 1;
+ and.b32 %r555, %r554, 1;
+ setp.eq.s32 %p374, %r555, 0;
+ selp.f32 %f462, %f5378, 0f3F800000, %p374;
+ mul.rn.f32 %f463, %f5378, %f5378;
+ mov.f32 %f5379, 0fB94D4153;
+ @%p374 bra $L__BB0_416;
+
+ mov.f32 %f2908, 0fBAB607ED;
+ mov.f32 %f2909, 0f37CBAC00;
+ fma.rn.f32 %f5379, %f2909, %f463, %f2908;
+
+$L__BB0_416:
+ selp.f32 %f2910, 0f3C0885E4, 0f3D2AAABB, %p374;
+ fma.rn.f32 %f2911, %f5379, %f463, %f2910;
+ selp.f32 %f2912, 0fBE2AAAA8, 0fBEFFFFFF, %p374;
+ fma.rn.f32 %f2913, %f2911, %f463, %f2912;
+ mov.f32 %f2914, 0f00000000;
+ fma.rn.f32 %f2915, %f463, %f462, %f2914;
+ fma.rn.f32 %f5380, %f2913, %f2915, %f462;
+ and.b32 %r3721, %r554, 2;
+ setp.eq.s32 %p376, %r3721, 0;
+ @%p376 bra $L__BB0_418;
+
+ mov.f32 %f2917, 0fBF800000;
+ fma.rn.f32 %f5380, %f5380, %f2917, %f2914;
+
+$L__BB0_418:
+ add.f32 %f5395, %f5377, %f5380;
+ mul.f32 %f2918, %f318, 0f3F22F983;
+ cvt.rni.s32.f32 %r8082, %f2918;
+ cvt.rn.f32.s32 %f2919, %r8082;
+ mov.f32 %f2920, 0fBFC90FDA;
+ fma.rn.f32 %f2921, %f2919, %f2920, %f318;
+ mov.f32 %f2922, 0fB3A22168;
+ fma.rn.f32 %f2923, %f2919, %f2922, %f2921;
+ mov.f32 %f2924, 0fA7C234C5;
+ fma.rn.f32 %f5381, %f2919, %f2924, %f2923;
+ abs.f32 %f471, %f318;
+ setp.ltu.f32 %p377, %f471, 0f47CE4780;
+ @%p377 bra $L__BB0_426;
+
+ setp.eq.f32 %p378, %f471, 0f7F800000;
+ @%p378 bra $L__BB0_425;
+ bra.uni $L__BB0_420;
+
+$L__BB0_425:
+ mov.f32 %f2927, 0f00000000;
+ mul.rn.f32 %f5381, %f318, %f2927;
+ mov.u32 %r8082, 0;
+ bra.uni $L__BB0_426;
+
+$L__BB0_420:
+ mov.b32 %r557, %f318;
+ shr.u32 %r3723, %r557, 23;
and.b32 %r3724, %r3723, 255;
- add.s32 %r534, %r3724, -128;
- shl.b32 %r3725, %r533, 8;
- or.b32 %r535, %r3725, -2147483648;
- shr.u32 %r536, %r534, 5;
- mov.u64 %rd2550, 0;
- mov.u32 %r8330, 0;
- mov.u64 %rd1036, __cudart_i2opi_f;
- mov.u64 %rd2551, %rd2550;
-
-$L__BB0_406:
+ add.s32 %r558, %r3724, -128;
+ shl.b32 %r3725, %r557, 8;
+ or.b32 %r559, %r3725, -2147483648;
+ shr.u32 %r560, %r558, 5;
+ mov.u64 %rd2561, 0;
+ mov.u32 %r8079, 0;
+ mov.u64 %rd1040, __cudart_i2opi_f;
+ mov.u64 %rd2562, %rd2561;
+
+$L__BB0_421:
.pragma "nounroll";
- shl.b64 %rd1035, %rd2550, 2;
- add.s64 %rd1037, %rd1036, %rd1035;
- ld.global.nc.u32 %r3726, [%rd1037];
- mad.wide.u32 %rd1038, %r3726, %r535, %rd2551;
- shr.u64 %rd2551, %rd1038, 32;
- add.s64 %rd1039, %rd1, %rd1035;
- st.local.u32 [%rd1039], %rd1038;
- add.s32 %r8330, %r8330, 1;
- cvt.s64.s32 %rd2550, %r8330;
- setp.ne.s32 %p366, %r8330, 6;
- @%p366 bra $L__BB0_406;
-
- st.local.u32 [%rd5], %rd2551;
+ shl.b64 %rd1039, %rd2561, 2;
+ add.s64 %rd1041, %rd1040, %rd1039;
+ ld.global.nc.u32 %r3726, [%rd1041];
+ mad.wide.u32 %rd1042, %r3726, %r559, %rd2562;
+ shr.u64 %rd2562, %rd1042, 32;
+ add.s64 %rd1043, %rd1, %rd1039;
+ st.local.u32 [%rd1043], %rd1042;
+ add.s32 %r8079, %r8079, 1;
+ cvt.s64.s32 %rd2561, %r8079;
+ setp.ne.s32 %p379, %r8079, 6;
+ @%p379 bra $L__BB0_421;
+
+ st.local.u32 [%rd4], %rd2562;
mov.u32 %r3727, 4;
- sub.s32 %r539, %r3727, %r536;
+ sub.s32 %r563, %r3727, %r560;
mov.u32 %r3728, 6;
- sub.s32 %r3729, %r3728, %r536;
- mul.wide.s32 %rd1040, %r3729, 4;
- add.s64 %rd1041, %rd1, %rd1040;
- ld.local.u32 %r8331, [%rd1041];
- ld.local.u32 %r8332, [%rd1041+-4];
- and.b32 %r542, %r534, 31;
- setp.eq.s32 %p367, %r542, 0;
- @%p367 bra $L__BB0_409;
+ sub.s32 %r3729, %r3728, %r560;
+ mul.wide.s32 %rd1044, %r3729, 4;
+ add.s64 %rd1045, %rd1, %rd1044;
+ ld.local.u32 %r8080, [%rd1045];
+ ld.local.u32 %r8081, [%rd1045+-4];
+ and.b32 %r566, %r558, 31;
+ setp.eq.s32 %p380, %r566, 0;
+ @%p380 bra $L__BB0_424;
mov.u32 %r3730, 32;
- sub.s32 %r3731, %r3730, %r542;
- shr.u32 %r3732, %r8332, %r3731;
- shl.b32 %r3733, %r8331, %r542;
- add.s32 %r8331, %r3732, %r3733;
- mul.wide.s32 %rd1042, %r539, 4;
- add.s64 %rd1043, %rd1, %rd1042;
- ld.local.u32 %r3734, [%rd1043];
+ sub.s32 %r3731, %r3730, %r566;
+ shr.u32 %r3732, %r8081, %r3731;
+ shl.b32 %r3733, %r8080, %r566;
+ add.s32 %r8080, %r3732, %r3733;
+ mul.wide.s32 %rd1046, %r563, 4;
+ add.s64 %rd1047, %rd1, %rd1046;
+ ld.local.u32 %r3734, [%rd1047];
shr.u32 %r3735, %r3734, %r3731;
- shl.b32 %r3736, %r8332, %r542;
- add.s32 %r8332, %r3735, %r3736;
-
-$L__BB0_409:
- and.b32 %r3737, %r533, -2147483648;
- shr.u32 %r3738, %r8332, 30;
- shl.b32 %r3739, %r8331, 2;
+ shl.b32 %r3736, %r8081, %r566;
+ add.s32 %r8081, %r3735, %r3736;
+
+$L__BB0_424:
+ and.b32 %r3737, %r557, -2147483648;
+ shr.u32 %r3738, %r8081, 30;
+ shl.b32 %r3739, %r8080, 2;
or.b32 %r3740, %r3738, %r3739;
shr.u32 %r3741, %r3740, 31;
- shr.u32 %r3742, %r8331, 30;
+ shr.u32 %r3742, %r8080, 30;
add.s32 %r3743, %r3741, %r3742;
neg.s32 %r3744, %r3743;
- setp.eq.s32 %p368, %r3737, 0;
- selp.b32 %r8333, %r3743, %r3744, %p368;
- setp.ne.s32 %p369, %r3741, 0;
+ setp.eq.s32 %p381, %r3737, 0;
+ selp.b32 %r8082, %r3743, %r3744, %p381;
+ setp.ne.s32 %p382, %r3741, 0;
xor.b32 %r3745, %r3737, -2147483648;
- selp.b32 %r3746, %r3745, %r3737, %p369;
- selp.b32 %r3747, -1, 0, %p369;
+ selp.b32 %r3746, %r3745, %r3737, %p382;
+ selp.b32 %r3747, -1, 0, %p382;
xor.b32 %r3748, %r3740, %r3747;
- shl.b32 %r3749, %r8332, 2;
+ shl.b32 %r3749, %r8081, 2;
xor.b32 %r3750, %r3749, %r3747;
- cvt.u64.u32 %rd1044, %r3748;
- cvt.u64.u32 %rd1045, %r3750;
- bfi.b64 %rd1046, %rd1044, %rd1045, 32, 32;
- cvt.rn.f64.s64 %fd53, %rd1046;
- mul.f64 %fd54, %fd53, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2827, %fd54;
- setp.eq.s32 %p370, %r3746, 0;
- neg.f32 %f2828, %f2827;
- selp.f32 %f5307, %f2827, %f2828, %p370;
-
-$L__BB0_411:
- and.b32 %r549, %r8333, 1;
- setp.eq.s32 %p371, %r549, 0;
- selp.f32 %f442, %f5307, 0f3F800000, %p371;
- mul.rn.f32 %f443, %f5307, %f5307;
- mov.f32 %f5308, 0fB94D4153;
- @%p371 bra $L__BB0_413;
-
- mov.f32 %f2831, 0fBAB607ED;
- mov.f32 %f2832, 0f37CBAC00;
- fma.rn.f32 %f5308, %f2832, %f443, %f2831;
-
-$L__BB0_413:
- selp.f32 %f2833, 0f3C0885E4, 0f3D2AAABB, %p371;
- fma.rn.f32 %f2834, %f5308, %f443, %f2833;
- selp.f32 %f2835, 0fBE2AAAA8, 0fBEFFFFFF, %p371;
- fma.rn.f32 %f2836, %f2834, %f443, %f2835;
- mov.f32 %f2837, 0f00000000;
- fma.rn.f32 %f2838, %f443, %f442, %f2837;
- fma.rn.f32 %f5309, %f2836, %f2838, %f442;
- and.b32 %r3752, %r8333, 2;
- setp.eq.s32 %p373, %r3752, 0;
- @%p373 bra $L__BB0_415;
-
- mov.f32 %f2840, 0fBF800000;
- fma.rn.f32 %f5309, %f5309, %f2840, %f2837;
-
-$L__BB0_415:
- mul.f32 %f2841, %f301, 0f3F22F983;
- cvt.rni.s32.f32 %r8337, %f2841;
- cvt.rn.f32.s32 %f2842, %r8337;
- mov.f32 %f2843, 0fBFC90FDA;
- fma.rn.f32 %f2844, %f2842, %f2843, %f301;
- mov.f32 %f2845, 0fB3A22168;
- fma.rn.f32 %f2846, %f2842, %f2845, %f2844;
- mov.f32 %f2847, 0fA7C234C5;
- fma.rn.f32 %f5310, %f2842, %f2847, %f2846;
- abs.f32 %f450, %f301;
- setp.ltu.f32 %p374, %f450, 0f47CE4780;
- @%p374 bra $L__BB0_423;
-
- setp.eq.f32 %p375, %f450, 0f7F800000;
- @%p375 bra $L__BB0_422;
- bra.uni $L__BB0_417;
-
-$L__BB0_422:
- mov.f32 %f2850, 0f00000000;
- mul.rn.f32 %f5310, %f301, %f2850;
- mov.u32 %r8337, 0;
- bra.uni $L__BB0_423;
-
-$L__BB0_417:
- mov.b32 %r551, %f301;
- shr.u32 %r3754, %r551, 23;
+ cvt.u64.u32 %rd1048, %r3748;
+ cvt.u64.u32 %rd1049, %r3750;
+ bfi.b64 %rd1050, %rd1048, %rd1049, 32, 32;
+ cvt.rn.f64.s64 %fd57, %rd1050;
+ mul.f64 %fd58, %fd57, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2925, %fd58;
+ setp.eq.s32 %p383, %r3746, 0;
+ neg.f32 %f2926, %f2925;
+ selp.f32 %f5381, %f2925, %f2926, %p383;
+
+$L__BB0_426:
+ and.b32 %r573, %r8082, 1;
+ setp.eq.s32 %p384, %r573, 0;
+ selp.f32 %f475, %f5381, 0f3F800000, %p384;
+ mul.rn.f32 %f476, %f5381, %f5381;
+ mov.f32 %f5382, 0fB94D4153;
+ @%p384 bra $L__BB0_428;
+
+ mov.f32 %f2929, 0fBAB607ED;
+ mov.f32 %f2930, 0f37CBAC00;
+ fma.rn.f32 %f5382, %f2930, %f476, %f2929;
+
+$L__BB0_428:
+ selp.f32 %f2931, 0f3C0885E4, 0f3D2AAABB, %p384;
+ fma.rn.f32 %f2932, %f5382, %f476, %f2931;
+ selp.f32 %f2933, 0fBE2AAAA8, 0fBEFFFFFF, %p384;
+ fma.rn.f32 %f2934, %f2932, %f476, %f2933;
+ mov.f32 %f2935, 0f00000000;
+ fma.rn.f32 %f2936, %f476, %f475, %f2935;
+ fma.rn.f32 %f5383, %f2934, %f2936, %f475;
+ and.b32 %r3752, %r8082, 2;
+ setp.eq.s32 %p386, %r3752, 0;
+ @%p386 bra $L__BB0_430;
+
+ mov.f32 %f2938, 0fBF800000;
+ fma.rn.f32 %f5383, %f5383, %f2938, %f2935;
+
+$L__BB0_430:
+ mul.f32 %f2939, %f310, 0f3F22F983;
+ cvt.rni.s32.f32 %r8086, %f2939;
+ cvt.rn.f32.s32 %f2940, %r8086;
+ mov.f32 %f2941, 0fBFC90FDA;
+ fma.rn.f32 %f2942, %f2940, %f2941, %f310;
+ mov.f32 %f2943, 0fB3A22168;
+ fma.rn.f32 %f2944, %f2940, %f2943, %f2942;
+ mov.f32 %f2945, 0fA7C234C5;
+ fma.rn.f32 %f5384, %f2940, %f2945, %f2944;
+ abs.f32 %f483, %f310;
+ setp.ltu.f32 %p387, %f483, 0f47CE4780;
+ @%p387 bra $L__BB0_438;
+
+ setp.eq.f32 %p388, %f483, 0f7F800000;
+ @%p388 bra $L__BB0_437;
+ bra.uni $L__BB0_432;
+
+$L__BB0_437:
+ mov.f32 %f2948, 0f00000000;
+ mul.rn.f32 %f5384, %f310, %f2948;
+ mov.u32 %r8086, 0;
+ bra.uni $L__BB0_438;
+
+$L__BB0_432:
+ mov.b32 %r575, %f310;
+ shr.u32 %r3754, %r575, 23;
and.b32 %r3755, %r3754, 255;
- add.s32 %r552, %r3755, -128;
- shl.b32 %r3756, %r551, 8;
- or.b32 %r553, %r3756, -2147483648;
- shr.u32 %r554, %r552, 5;
- mov.u64 %rd2552, 0;
- mov.u32 %r8334, 0;
- mov.u64 %rd1050, __cudart_i2opi_f;
- mov.u64 %rd2553, %rd2552;
-
-$L__BB0_418:
+ add.s32 %r576, %r3755, -128;
+ shl.b32 %r3756, %r575, 8;
+ or.b32 %r577, %r3756, -2147483648;
+ shr.u32 %r578, %r576, 5;
+ mov.u64 %rd2563, 0;
+ mov.u32 %r8083, 0;
+ mov.u64 %rd1054, __cudart_i2opi_f;
+ mov.u64 %rd2564, %rd2563;
+
+$L__BB0_433:
.pragma "nounroll";
- shl.b64 %rd1049, %rd2552, 2;
- add.s64 %rd1051, %rd1050, %rd1049;
- ld.global.nc.u32 %r3757, [%rd1051];
- mad.wide.u32 %rd1052, %r3757, %r553, %rd2553;
- shr.u64 %rd2553, %rd1052, 32;
- add.s64 %rd1053, %rd1, %rd1049;
- st.local.u32 [%rd1053], %rd1052;
- add.s32 %r8334, %r8334, 1;
- cvt.s64.s32 %rd2552, %r8334;
- setp.ne.s32 %p376, %r8334, 6;
- @%p376 bra $L__BB0_418;
-
- st.local.u32 [%rd5], %rd2553;
+ shl.b64 %rd1053, %rd2563, 2;
+ add.s64 %rd1055, %rd1054, %rd1053;
+ ld.global.nc.u32 %r3757, [%rd1055];
+ mad.wide.u32 %rd1056, %r3757, %r577, %rd2564;
+ shr.u64 %rd2564, %rd1056, 32;
+ add.s64 %rd1057, %rd1, %rd1053;
+ st.local.u32 [%rd1057], %rd1056;
+ add.s32 %r8083, %r8083, 1;
+ cvt.s64.s32 %rd2563, %r8083;
+ setp.ne.s32 %p389, %r8083, 6;
+ @%p389 bra $L__BB0_433;
+
+ st.local.u32 [%rd4], %rd2564;
mov.u32 %r3758, 4;
- sub.s32 %r557, %r3758, %r554;
+ sub.s32 %r581, %r3758, %r578;
mov.u32 %r3759, 6;
- sub.s32 %r3760, %r3759, %r554;
- mul.wide.s32 %rd1054, %r3760, 4;
- add.s64 %rd1055, %rd1, %rd1054;
- ld.local.u32 %r8335, [%rd1055];
- ld.local.u32 %r8336, [%rd1055+-4];
- and.b32 %r560, %r552, 31;
- setp.eq.s32 %p377, %r560, 0;
- @%p377 bra $L__BB0_421;
+ sub.s32 %r3760, %r3759, %r578;
+ mul.wide.s32 %rd1058, %r3760, 4;
+ add.s64 %rd1059, %rd1, %rd1058;
+ ld.local.u32 %r8084, [%rd1059];
+ ld.local.u32 %r8085, [%rd1059+-4];
+ and.b32 %r584, %r576, 31;
+ setp.eq.s32 %p390, %r584, 0;
+ @%p390 bra $L__BB0_436;
mov.u32 %r3761, 32;
- sub.s32 %r3762, %r3761, %r560;
- shr.u32 %r3763, %r8336, %r3762;
- shl.b32 %r3764, %r8335, %r560;
- add.s32 %r8335, %r3763, %r3764;
- mul.wide.s32 %rd1056, %r557, 4;
- add.s64 %rd1057, %rd1, %rd1056;
- ld.local.u32 %r3765, [%rd1057];
+ sub.s32 %r3762, %r3761, %r584;
+ shr.u32 %r3763, %r8085, %r3762;
+ shl.b32 %r3764, %r8084, %r584;
+ add.s32 %r8084, %r3763, %r3764;
+ mul.wide.s32 %rd1060, %r581, 4;
+ add.s64 %rd1061, %rd1, %rd1060;
+ ld.local.u32 %r3765, [%rd1061];
shr.u32 %r3766, %r3765, %r3762;
- shl.b32 %r3767, %r8336, %r560;
- add.s32 %r8336, %r3766, %r3767;
-
-$L__BB0_421:
- and.b32 %r3768, %r551, -2147483648;
- shr.u32 %r3769, %r8336, 30;
- shl.b32 %r3770, %r8335, 2;
+ shl.b32 %r3767, %r8085, %r584;
+ add.s32 %r8085, %r3766, %r3767;
+
+$L__BB0_436:
+ and.b32 %r3768, %r575, -2147483648;
+ shr.u32 %r3769, %r8085, 30;
+ shl.b32 %r3770, %r8084, 2;
or.b32 %r3771, %r3769, %r3770;
shr.u32 %r3772, %r3771, 31;
- shr.u32 %r3773, %r8335, 30;
+ shr.u32 %r3773, %r8084, 30;
add.s32 %r3774, %r3772, %r3773;
neg.s32 %r3775, %r3774;
- setp.eq.s32 %p378, %r3768, 0;
- selp.b32 %r8337, %r3774, %r3775, %p378;
- setp.ne.s32 %p379, %r3772, 0;
+ setp.eq.s32 %p391, %r3768, 0;
+ selp.b32 %r8086, %r3774, %r3775, %p391;
+ setp.ne.s32 %p392, %r3772, 0;
xor.b32 %r3776, %r3768, -2147483648;
- selp.b32 %r3777, %r3776, %r3768, %p379;
- selp.b32 %r3778, -1, 0, %p379;
+ selp.b32 %r3777, %r3776, %r3768, %p392;
+ selp.b32 %r3778, -1, 0, %p392;
xor.b32 %r3779, %r3771, %r3778;
- shl.b32 %r3780, %r8336, 2;
+ shl.b32 %r3780, %r8085, 2;
xor.b32 %r3781, %r3780, %r3778;
- cvt.u64.u32 %rd1058, %r3779;
- cvt.u64.u32 %rd1059, %r3781;
- bfi.b64 %rd1060, %rd1058, %rd1059, 32, 32;
- cvt.rn.f64.s64 %fd55, %rd1060;
- mul.f64 %fd56, %fd55, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2848, %fd56;
- setp.eq.s32 %p380, %r3777, 0;
- neg.f32 %f2849, %f2848;
- selp.f32 %f5310, %f2848, %f2849, %p380;
-
-$L__BB0_423:
- add.s32 %r567, %r8337, 1;
- and.b32 %r568, %r567, 1;
- setp.eq.s32 %p381, %r568, 0;
- selp.f32 %f454, %f5310, 0f3F800000, %p381;
- mul.rn.f32 %f455, %f5310, %f5310;
- mov.f32 %f5311, 0fB94D4153;
- @%p381 bra $L__BB0_425;
-
- mov.f32 %f2852, 0fBAB607ED;
- mov.f32 %f2853, 0f37CBAC00;
- fma.rn.f32 %f5311, %f2853, %f455, %f2852;
-
-$L__BB0_425:
- selp.f32 %f2854, 0f3C0885E4, 0f3D2AAABB, %p381;
- fma.rn.f32 %f2855, %f5311, %f455, %f2854;
- selp.f32 %f2856, 0fBE2AAAA8, 0fBEFFFFFF, %p381;
- fma.rn.f32 %f2857, %f2855, %f455, %f2856;
- mov.f32 %f2858, 0f00000000;
- fma.rn.f32 %f2859, %f455, %f454, %f2858;
- fma.rn.f32 %f5312, %f2857, %f2859, %f454;
- and.b32 %r3783, %r567, 2;
- setp.eq.s32 %p383, %r3783, 0;
- @%p383 bra $L__BB0_427;
-
- mov.f32 %f2861, 0fBF800000;
- fma.rn.f32 %f5312, %f5312, %f2861, %f2858;
-
-$L__BB0_427:
- add.f32 %f5327, %f5309, %f5312;
- mul.f32 %f2862, %f310, 0f3F22F983;
- cvt.rni.s32.f32 %r8341, %f2862;
- cvt.rn.f32.s32 %f2863, %r8341;
- mov.f32 %f2864, 0fBFC90FDA;
- fma.rn.f32 %f2865, %f2863, %f2864, %f310;
- mov.f32 %f2866, 0fB3A22168;
- fma.rn.f32 %f2867, %f2863, %f2866, %f2865;
- mov.f32 %f2868, 0fA7C234C5;
- fma.rn.f32 %f5313, %f2863, %f2868, %f2867;
- abs.f32 %f463, %f310;
- setp.ltu.f32 %p384, %f463, 0f47CE4780;
- @%p384 bra $L__BB0_435;
-
- setp.eq.f32 %p385, %f463, 0f7F800000;
- @%p385 bra $L__BB0_434;
- bra.uni $L__BB0_429;
-
-$L__BB0_434:
- mov.f32 %f2871, 0f00000000;
- mul.rn.f32 %f5313, %f310, %f2871;
- mov.u32 %r8341, 0;
- bra.uni $L__BB0_435;
-
-$L__BB0_429:
- mov.b32 %r570, %f310;
- shr.u32 %r3785, %r570, 23;
+ cvt.u64.u32 %rd1062, %r3779;
+ cvt.u64.u32 %rd1063, %r3781;
+ bfi.b64 %rd1064, %rd1062, %rd1063, 32, 32;
+ cvt.rn.f64.s64 %fd59, %rd1064;
+ mul.f64 %fd60, %fd59, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2946, %fd60;
+ setp.eq.s32 %p393, %r3777, 0;
+ neg.f32 %f2947, %f2946;
+ selp.f32 %f5384, %f2946, %f2947, %p393;
+
+$L__BB0_438:
+ add.s32 %r591, %r8086, 1;
+ and.b32 %r592, %r591, 1;
+ setp.eq.s32 %p394, %r592, 0;
+ selp.f32 %f487, %f5384, 0f3F800000, %p394;
+ mul.rn.f32 %f488, %f5384, %f5384;
+ mov.f32 %f5385, 0fB94D4153;
+ @%p394 bra $L__BB0_440;
+
+ mov.f32 %f2950, 0fBAB607ED;
+ mov.f32 %f2951, 0f37CBAC00;
+ fma.rn.f32 %f5385, %f2951, %f488, %f2950;
+
+$L__BB0_440:
+ selp.f32 %f2952, 0f3C0885E4, 0f3D2AAABB, %p394;
+ fma.rn.f32 %f2953, %f5385, %f488, %f2952;
+ selp.f32 %f2954, 0fBE2AAAA8, 0fBEFFFFFF, %p394;
+ fma.rn.f32 %f2955, %f2953, %f488, %f2954;
+ mov.f32 %f2956, 0f00000000;
+ fma.rn.f32 %f2957, %f488, %f487, %f2956;
+ fma.rn.f32 %f5386, %f2955, %f2957, %f487;
+ and.b32 %r3783, %r591, 2;
+ setp.eq.s32 %p396, %r3783, 0;
+ @%p396 bra $L__BB0_442;
+
+ mov.f32 %f2959, 0fBF800000;
+ fma.rn.f32 %f5386, %f5386, %f2959, %f2956;
+
+$L__BB0_442:
+ add.f32 %f5394, %f5383, %f5386;
+ mul.f32 %f2960, %f319, 0f3F22F983;
+ cvt.rni.s32.f32 %r8090, %f2960;
+ cvt.rn.f32.s32 %f2961, %r8090;
+ mov.f32 %f2962, 0fBFC90FDA;
+ fma.rn.f32 %f2963, %f2961, %f2962, %f319;
+ mov.f32 %f2964, 0fB3A22168;
+ fma.rn.f32 %f2965, %f2961, %f2964, %f2963;
+ mov.f32 %f2966, 0fA7C234C5;
+ fma.rn.f32 %f5387, %f2961, %f2966, %f2965;
+ abs.f32 %f496, %f319;
+ setp.ltu.f32 %p397, %f496, 0f47CE4780;
+ @%p397 bra $L__BB0_450;
+
+ setp.eq.f32 %p398, %f496, 0f7F800000;
+ @%p398 bra $L__BB0_449;
+ bra.uni $L__BB0_444;
+
+$L__BB0_449:
+ mov.f32 %f2969, 0f00000000;
+ mul.rn.f32 %f5387, %f319, %f2969;
+ mov.u32 %r8090, 0;
+ bra.uni $L__BB0_450;
+
+$L__BB0_444:
+ mov.b32 %r594, %f319;
+ shr.u32 %r3785, %r594, 23;
and.b32 %r3786, %r3785, 255;
- add.s32 %r571, %r3786, -128;
- shl.b32 %r3787, %r570, 8;
- or.b32 %r572, %r3787, -2147483648;
- shr.u32 %r573, %r571, 5;
- mov.u64 %rd2554, 0;
- mov.u32 %r8338, 0;
- mov.u64 %rd1064, __cudart_i2opi_f;
- mov.u64 %rd2555, %rd2554;
-
-$L__BB0_430:
+ add.s32 %r595, %r3786, -128;
+ shl.b32 %r3787, %r594, 8;
+ or.b32 %r596, %r3787, -2147483648;
+ shr.u32 %r597, %r595, 5;
+ mov.u64 %rd2565, 0;
+ mov.u32 %r8087, 0;
+ mov.u64 %rd1068, __cudart_i2opi_f;
+ mov.u64 %rd2566, %rd2565;
+
+$L__BB0_445:
.pragma "nounroll";
- shl.b64 %rd1063, %rd2554, 2;
- add.s64 %rd1065, %rd1064, %rd1063;
- ld.global.nc.u32 %r3788, [%rd1065];
- mad.wide.u32 %rd1066, %r3788, %r572, %rd2555;
- shr.u64 %rd2555, %rd1066, 32;
- add.s64 %rd1067, %rd1, %rd1063;
- st.local.u32 [%rd1067], %rd1066;
- add.s32 %r8338, %r8338, 1;
- cvt.s64.s32 %rd2554, %r8338;
- setp.ne.s32 %p386, %r8338, 6;
- @%p386 bra $L__BB0_430;
-
- st.local.u32 [%rd5], %rd2555;
+ shl.b64 %rd1067, %rd2565, 2;
+ add.s64 %rd1069, %rd1068, %rd1067;
+ ld.global.nc.u32 %r3788, [%rd1069];
+ mad.wide.u32 %rd1070, %r3788, %r596, %rd2566;
+ shr.u64 %rd2566, %rd1070, 32;
+ add.s64 %rd1071, %rd1, %rd1067;
+ st.local.u32 [%rd1071], %rd1070;
+ add.s32 %r8087, %r8087, 1;
+ cvt.s64.s32 %rd2565, %r8087;
+ setp.ne.s32 %p399, %r8087, 6;
+ @%p399 bra $L__BB0_445;
+
+ st.local.u32 [%rd4], %rd2566;
mov.u32 %r3789, 4;
- sub.s32 %r576, %r3789, %r573;
+ sub.s32 %r600, %r3789, %r597;
mov.u32 %r3790, 6;
- sub.s32 %r3791, %r3790, %r573;
- mul.wide.s32 %rd1068, %r3791, 4;
- add.s64 %rd1069, %rd1, %rd1068;
- ld.local.u32 %r8339, [%rd1069];
- ld.local.u32 %r8340, [%rd1069+-4];
- and.b32 %r579, %r571, 31;
- setp.eq.s32 %p387, %r579, 0;
- @%p387 bra $L__BB0_433;
+ sub.s32 %r3791, %r3790, %r597;
+ mul.wide.s32 %rd1072, %r3791, 4;
+ add.s64 %rd1073, %rd1, %rd1072;
+ ld.local.u32 %r8088, [%rd1073];
+ ld.local.u32 %r8089, [%rd1073+-4];
+ and.b32 %r603, %r595, 31;
+ setp.eq.s32 %p400, %r603, 0;
+ @%p400 bra $L__BB0_448;
mov.u32 %r3792, 32;
- sub.s32 %r3793, %r3792, %r579;
- shr.u32 %r3794, %r8340, %r3793;
- shl.b32 %r3795, %r8339, %r579;
- add.s32 %r8339, %r3794, %r3795;
- mul.wide.s32 %rd1070, %r576, 4;
- add.s64 %rd1071, %rd1, %rd1070;
- ld.local.u32 %r3796, [%rd1071];
+ sub.s32 %r3793, %r3792, %r603;
+ shr.u32 %r3794, %r8089, %r3793;
+ shl.b32 %r3795, %r8088, %r603;
+ add.s32 %r8088, %r3794, %r3795;
+ mul.wide.s32 %rd1074, %r600, 4;
+ add.s64 %rd1075, %rd1, %rd1074;
+ ld.local.u32 %r3796, [%rd1075];
shr.u32 %r3797, %r3796, %r3793;
- shl.b32 %r3798, %r8340, %r579;
- add.s32 %r8340, %r3797, %r3798;
-
-$L__BB0_433:
- and.b32 %r3799, %r570, -2147483648;
- shr.u32 %r3800, %r8340, 30;
- shl.b32 %r3801, %r8339, 2;
+ shl.b32 %r3798, %r8089, %r603;
+ add.s32 %r8089, %r3797, %r3798;
+
+$L__BB0_448:
+ and.b32 %r3799, %r594, -2147483648;
+ shr.u32 %r3800, %r8089, 30;
+ shl.b32 %r3801, %r8088, 2;
or.b32 %r3802, %r3800, %r3801;
shr.u32 %r3803, %r3802, 31;
- shr.u32 %r3804, %r8339, 30;
+ shr.u32 %r3804, %r8088, 30;
add.s32 %r3805, %r3803, %r3804;
neg.s32 %r3806, %r3805;
- setp.eq.s32 %p388, %r3799, 0;
- selp.b32 %r8341, %r3805, %r3806, %p388;
- setp.ne.s32 %p389, %r3803, 0;
+ setp.eq.s32 %p401, %r3799, 0;
+ selp.b32 %r8090, %r3805, %r3806, %p401;
+ setp.ne.s32 %p402, %r3803, 0;
xor.b32 %r3807, %r3799, -2147483648;
- selp.b32 %r3808, %r3807, %r3799, %p389;
- selp.b32 %r3809, -1, 0, %p389;
+ selp.b32 %r3808, %r3807, %r3799, %p402;
+ selp.b32 %r3809, -1, 0, %p402;
xor.b32 %r3810, %r3802, %r3809;
- shl.b32 %r3811, %r8340, 2;
+ shl.b32 %r3811, %r8089, 2;
xor.b32 %r3812, %r3811, %r3809;
- cvt.u64.u32 %rd1072, %r3810;
- cvt.u64.u32 %rd1073, %r3812;
- bfi.b64 %rd1074, %rd1072, %rd1073, 32, 32;
- cvt.rn.f64.s64 %fd57, %rd1074;
- mul.f64 %fd58, %fd57, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2869, %fd58;
- setp.eq.s32 %p390, %r3808, 0;
- neg.f32 %f2870, %f2869;
- selp.f32 %f5313, %f2869, %f2870, %p390;
-
-$L__BB0_435:
- and.b32 %r586, %r8341, 1;
- setp.eq.s32 %p391, %r586, 0;
- selp.f32 %f467, %f5313, 0f3F800000, %p391;
- mul.rn.f32 %f468, %f5313, %f5313;
- mov.f32 %f5314, 0fB94D4153;
- @%p391 bra $L__BB0_437;
-
- mov.f32 %f2873, 0fBAB607ED;
- mov.f32 %f2874, 0f37CBAC00;
- fma.rn.f32 %f5314, %f2874, %f468, %f2873;
-
-$L__BB0_437:
- selp.f32 %f2875, 0f3C0885E4, 0f3D2AAABB, %p391;
- fma.rn.f32 %f2876, %f5314, %f468, %f2875;
- selp.f32 %f2877, 0fBE2AAAA8, 0fBEFFFFFF, %p391;
- fma.rn.f32 %f2878, %f2876, %f468, %f2877;
- mov.f32 %f2879, 0f00000000;
- fma.rn.f32 %f2880, %f468, %f467, %f2879;
- fma.rn.f32 %f5315, %f2878, %f2880, %f467;
- and.b32 %r3814, %r8341, 2;
- setp.eq.s32 %p393, %r3814, 0;
- @%p393 bra $L__BB0_439;
-
- mov.f32 %f2882, 0fBF800000;
- fma.rn.f32 %f5315, %f5315, %f2882, %f2879;
-
-$L__BB0_439:
- mul.f32 %f2883, %f302, 0f3F22F983;
- cvt.rni.s32.f32 %r8345, %f2883;
- cvt.rn.f32.s32 %f2884, %r8345;
- mov.f32 %f2885, 0fBFC90FDA;
- fma.rn.f32 %f2886, %f2884, %f2885, %f302;
- mov.f32 %f2887, 0fB3A22168;
- fma.rn.f32 %f2888, %f2884, %f2887, %f2886;
- mov.f32 %f2889, 0fA7C234C5;
- fma.rn.f32 %f5316, %f2884, %f2889, %f2888;
- abs.f32 %f475, %f302;
- setp.ltu.f32 %p394, %f475, 0f47CE4780;
- @%p394 bra $L__BB0_447;
-
- setp.eq.f32 %p395, %f475, 0f7F800000;
- @%p395 bra $L__BB0_446;
- bra.uni $L__BB0_441;
-
-$L__BB0_446:
- mov.f32 %f2892, 0f00000000;
- mul.rn.f32 %f5316, %f302, %f2892;
- mov.u32 %r8345, 0;
- bra.uni $L__BB0_447;
-
-$L__BB0_441:
- mov.b32 %r588, %f302;
- shr.u32 %r3816, %r588, 23;
+ cvt.u64.u32 %rd1076, %r3810;
+ cvt.u64.u32 %rd1077, %r3812;
+ bfi.b64 %rd1078, %rd1076, %rd1077, 32, 32;
+ cvt.rn.f64.s64 %fd61, %rd1078;
+ mul.f64 %fd62, %fd61, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2967, %fd62;
+ setp.eq.s32 %p403, %r3808, 0;
+ neg.f32 %f2968, %f2967;
+ selp.f32 %f5387, %f2967, %f2968, %p403;
+
+$L__BB0_450:
+ and.b32 %r610, %r8090, 1;
+ setp.eq.s32 %p404, %r610, 0;
+ selp.f32 %f500, %f5387, 0f3F800000, %p404;
+ mul.rn.f32 %f501, %f5387, %f5387;
+ mov.f32 %f5388, 0fB94D4153;
+ @%p404 bra $L__BB0_452;
+
+ mov.f32 %f2971, 0fBAB607ED;
+ mov.f32 %f2972, 0f37CBAC00;
+ fma.rn.f32 %f5388, %f2972, %f501, %f2971;
+
+$L__BB0_452:
+ selp.f32 %f2973, 0f3C0885E4, 0f3D2AAABB, %p404;
+ fma.rn.f32 %f2974, %f5388, %f501, %f2973;
+ selp.f32 %f2975, 0fBE2AAAA8, 0fBEFFFFFF, %p404;
+ fma.rn.f32 %f2976, %f2974, %f501, %f2975;
+ mov.f32 %f2977, 0f00000000;
+ fma.rn.f32 %f2978, %f501, %f500, %f2977;
+ fma.rn.f32 %f5389, %f2976, %f2978, %f500;
+ and.b32 %r3814, %r8090, 2;
+ setp.eq.s32 %p406, %r3814, 0;
+ @%p406 bra $L__BB0_454;
+
+ mov.f32 %f2980, 0fBF800000;
+ fma.rn.f32 %f5389, %f5389, %f2980, %f2977;
+
+$L__BB0_454:
+ mul.f32 %f2981, %f311, 0f3F22F983;
+ cvt.rni.s32.f32 %r8094, %f2981;
+ cvt.rn.f32.s32 %f2982, %r8094;
+ mov.f32 %f2983, 0fBFC90FDA;
+ fma.rn.f32 %f2984, %f2982, %f2983, %f311;
+ mov.f32 %f2985, 0fB3A22168;
+ fma.rn.f32 %f2986, %f2982, %f2985, %f2984;
+ mov.f32 %f2987, 0fA7C234C5;
+ fma.rn.f32 %f5390, %f2982, %f2987, %f2986;
+ abs.f32 %f508, %f311;
+ setp.ltu.f32 %p407, %f508, 0f47CE4780;
+ @%p407 bra $L__BB0_462;
+
+ setp.eq.f32 %p408, %f508, 0f7F800000;
+ @%p408 bra $L__BB0_461;
+ bra.uni $L__BB0_456;
+
+$L__BB0_461:
+ mov.f32 %f2990, 0f00000000;
+ mul.rn.f32 %f5390, %f311, %f2990;
+ mov.u32 %r8094, 0;
+ bra.uni $L__BB0_462;
+
+$L__BB0_456:
+ mov.b32 %r612, %f311;
+ shr.u32 %r3816, %r612, 23;
and.b32 %r3817, %r3816, 255;
- add.s32 %r589, %r3817, -128;
- shl.b32 %r3818, %r588, 8;
- or.b32 %r590, %r3818, -2147483648;
- shr.u32 %r591, %r589, 5;
- mov.u64 %rd2556, 0;
- mov.u32 %r8342, 0;
- mov.u64 %rd1078, __cudart_i2opi_f;
- mov.u64 %rd2557, %rd2556;
-
-$L__BB0_442:
+ add.s32 %r613, %r3817, -128;
+ shl.b32 %r3818, %r612, 8;
+ or.b32 %r614, %r3818, -2147483648;
+ shr.u32 %r615, %r613, 5;
+ mov.u64 %rd2567, 0;
+ mov.u32 %r8091, 0;
+ mov.u64 %rd1082, __cudart_i2opi_f;
+ mov.u64 %rd2568, %rd2567;
+
+$L__BB0_457:
.pragma "nounroll";
- shl.b64 %rd1077, %rd2556, 2;
- add.s64 %rd1079, %rd1078, %rd1077;
- ld.global.nc.u32 %r3819, [%rd1079];
- mad.wide.u32 %rd1080, %r3819, %r590, %rd2557;
- shr.u64 %rd2557, %rd1080, 32;
- add.s64 %rd1081, %rd1, %rd1077;
- st.local.u32 [%rd1081], %rd1080;
- add.s32 %r8342, %r8342, 1;
- cvt.s64.s32 %rd2556, %r8342;
- setp.ne.s32 %p396, %r8342, 6;
- @%p396 bra $L__BB0_442;
-
- st.local.u32 [%rd5], %rd2557;
+ shl.b64 %rd1081, %rd2567, 2;
+ add.s64 %rd1083, %rd1082, %rd1081;
+ ld.global.nc.u32 %r3819, [%rd1083];
+ mad.wide.u32 %rd1084, %r3819, %r614, %rd2568;
+ shr.u64 %rd2568, %rd1084, 32;
+ add.s64 %rd1085, %rd1, %rd1081;
+ st.local.u32 [%rd1085], %rd1084;
+ add.s32 %r8091, %r8091, 1;
+ cvt.s64.s32 %rd2567, %r8091;
+ setp.ne.s32 %p409, %r8091, 6;
+ @%p409 bra $L__BB0_457;
+
+ st.local.u32 [%rd4], %rd2568;
mov.u32 %r3820, 4;
- sub.s32 %r594, %r3820, %r591;
+ sub.s32 %r618, %r3820, %r615;
mov.u32 %r3821, 6;
- sub.s32 %r3822, %r3821, %r591;
- mul.wide.s32 %rd1082, %r3822, 4;
- add.s64 %rd1083, %rd1, %rd1082;
- ld.local.u32 %r8343, [%rd1083];
- ld.local.u32 %r8344, [%rd1083+-4];
- and.b32 %r597, %r589, 31;
- setp.eq.s32 %p397, %r597, 0;
- @%p397 bra $L__BB0_445;
+ sub.s32 %r3822, %r3821, %r615;
+ mul.wide.s32 %rd1086, %r3822, 4;
+ add.s64 %rd1087, %rd1, %rd1086;
+ ld.local.u32 %r8092, [%rd1087];
+ ld.local.u32 %r8093, [%rd1087+-4];
+ and.b32 %r621, %r613, 31;
+ setp.eq.s32 %p410, %r621, 0;
+ @%p410 bra $L__BB0_460;
mov.u32 %r3823, 32;
- sub.s32 %r3824, %r3823, %r597;
- shr.u32 %r3825, %r8344, %r3824;
- shl.b32 %r3826, %r8343, %r597;
- add.s32 %r8343, %r3825, %r3826;
- mul.wide.s32 %rd1084, %r594, 4;
- add.s64 %rd1085, %rd1, %rd1084;
- ld.local.u32 %r3827, [%rd1085];
+ sub.s32 %r3824, %r3823, %r621;
+ shr.u32 %r3825, %r8093, %r3824;
+ shl.b32 %r3826, %r8092, %r621;
+ add.s32 %r8092, %r3825, %r3826;
+ mul.wide.s32 %rd1088, %r618, 4;
+ add.s64 %rd1089, %rd1, %rd1088;
+ ld.local.u32 %r3827, [%rd1089];
shr.u32 %r3828, %r3827, %r3824;
- shl.b32 %r3829, %r8344, %r597;
- add.s32 %r8344, %r3828, %r3829;
-
-$L__BB0_445:
- and.b32 %r3830, %r588, -2147483648;
- shr.u32 %r3831, %r8344, 30;
- shl.b32 %r3832, %r8343, 2;
+ shl.b32 %r3829, %r8093, %r621;
+ add.s32 %r8093, %r3828, %r3829;
+
+$L__BB0_460:
+ and.b32 %r3830, %r612, -2147483648;
+ shr.u32 %r3831, %r8093, 30;
+ shl.b32 %r3832, %r8092, 2;
or.b32 %r3833, %r3831, %r3832;
shr.u32 %r3834, %r3833, 31;
- shr.u32 %r3835, %r8343, 30;
+ shr.u32 %r3835, %r8092, 30;
add.s32 %r3836, %r3834, %r3835;
neg.s32 %r3837, %r3836;
- setp.eq.s32 %p398, %r3830, 0;
- selp.b32 %r8345, %r3836, %r3837, %p398;
- setp.ne.s32 %p399, %r3834, 0;
+ setp.eq.s32 %p411, %r3830, 0;
+ selp.b32 %r8094, %r3836, %r3837, %p411;
+ setp.ne.s32 %p412, %r3834, 0;
xor.b32 %r3838, %r3830, -2147483648;
- selp.b32 %r3839, %r3838, %r3830, %p399;
- selp.b32 %r3840, -1, 0, %p399;
+ selp.b32 %r3839, %r3838, %r3830, %p412;
+ selp.b32 %r3840, -1, 0, %p412;
xor.b32 %r3841, %r3833, %r3840;
- shl.b32 %r3842, %r8344, 2;
+ shl.b32 %r3842, %r8093, 2;
xor.b32 %r3843, %r3842, %r3840;
- cvt.u64.u32 %rd1086, %r3841;
- cvt.u64.u32 %rd1087, %r3843;
- bfi.b64 %rd1088, %rd1086, %rd1087, 32, 32;
- cvt.rn.f64.s64 %fd59, %rd1088;
- mul.f64 %fd60, %fd59, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2890, %fd60;
- setp.eq.s32 %p400, %r3839, 0;
- neg.f32 %f2891, %f2890;
- selp.f32 %f5316, %f2890, %f2891, %p400;
-
-$L__BB0_447:
- add.s32 %r604, %r8345, 1;
- and.b32 %r605, %r604, 1;
- setp.eq.s32 %p401, %r605, 0;
- selp.f32 %f479, %f5316, 0f3F800000, %p401;
- mul.rn.f32 %f480, %f5316, %f5316;
- mov.f32 %f5317, 0fB94D4153;
- @%p401 bra $L__BB0_449;
-
- mov.f32 %f2894, 0fBAB607ED;
- mov.f32 %f2895, 0f37CBAC00;
- fma.rn.f32 %f5317, %f2895, %f480, %f2894;
-
-$L__BB0_449:
- selp.f32 %f2896, 0f3C0885E4, 0f3D2AAABB, %p401;
- fma.rn.f32 %f2897, %f5317, %f480, %f2896;
- selp.f32 %f2898, 0fBE2AAAA8, 0fBEFFFFFF, %p401;
- fma.rn.f32 %f2899, %f2897, %f480, %f2898;
- mov.f32 %f2900, 0f00000000;
- fma.rn.f32 %f2901, %f480, %f479, %f2900;
- fma.rn.f32 %f5318, %f2899, %f2901, %f479;
- and.b32 %r3845, %r604, 2;
- setp.eq.s32 %p403, %r3845, 0;
- @%p403 bra $L__BB0_451;
-
- mov.f32 %f2903, 0fBF800000;
- fma.rn.f32 %f5318, %f5318, %f2903, %f2900;
-
-$L__BB0_451:
- add.f32 %f5326, %f5315, %f5318;
- mul.f32 %f2904, %f311, 0f3F22F983;
- cvt.rni.s32.f32 %r8349, %f2904;
- cvt.rn.f32.s32 %f2905, %r8349;
- mov.f32 %f2906, 0fBFC90FDA;
- fma.rn.f32 %f2907, %f2905, %f2906, %f311;
- mov.f32 %f2908, 0fB3A22168;
- fma.rn.f32 %f2909, %f2905, %f2908, %f2907;
- mov.f32 %f2910, 0fA7C234C5;
- fma.rn.f32 %f5319, %f2905, %f2910, %f2909;
- abs.f32 %f488, %f311;
- setp.ltu.f32 %p404, %f488, 0f47CE4780;
- @%p404 bra $L__BB0_459;
-
- setp.eq.f32 %p405, %f488, 0f7F800000;
- @%p405 bra $L__BB0_458;
- bra.uni $L__BB0_453;
-
-$L__BB0_458:
- mov.f32 %f2913, 0f00000000;
- mul.rn.f32 %f5319, %f311, %f2913;
- mov.u32 %r8349, 0;
- bra.uni $L__BB0_459;
-
-$L__BB0_453:
- mov.b32 %r607, %f311;
- shr.u32 %r3847, %r607, 23;
- and.b32 %r3848, %r3847, 255;
- add.s32 %r608, %r3848, -128;
- shl.b32 %r3849, %r607, 8;
- or.b32 %r609, %r3849, -2147483648;
- shr.u32 %r610, %r608, 5;
- mov.u64 %rd2558, 0;
- mov.u32 %r8346, 0;
- mov.u64 %rd1092, __cudart_i2opi_f;
- mov.u64 %rd2559, %rd2558;
-
-$L__BB0_454:
+ cvt.u64.u32 %rd1090, %r3841;
+ cvt.u64.u32 %rd1091, %r3843;
+ bfi.b64 %rd1092, %rd1090, %rd1091, 32, 32;
+ cvt.rn.f64.s64 %fd63, %rd1092;
+ mul.f64 %fd64, %fd63, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2988, %fd64;
+ setp.eq.s32 %p413, %r3839, 0;
+ neg.f32 %f2989, %f2988;
+ selp.f32 %f5390, %f2988, %f2989, %p413;
+
+$L__BB0_462:
+ add.s32 %r628, %r8094, 1;
+ and.b32 %r629, %r628, 1;
+ setp.eq.s32 %p414, %r629, 0;
+ selp.f32 %f512, %f5390, 0f3F800000, %p414;
+ mul.rn.f32 %f513, %f5390, %f5390;
+ mov.f32 %f5391, 0fB94D4153;
+ @%p414 bra $L__BB0_464;
+
+ mov.f32 %f2992, 0fBAB607ED;
+ mov.f32 %f2993, 0f37CBAC00;
+ fma.rn.f32 %f5391, %f2993, %f513, %f2992;
+
+$L__BB0_464:
+ selp.f32 %f2994, 0f3C0885E4, 0f3D2AAABB, %p414;
+ fma.rn.f32 %f2995, %f5391, %f513, %f2994;
+ selp.f32 %f2996, 0fBE2AAAA8, 0fBEFFFFFF, %p414;
+ fma.rn.f32 %f2997, %f2995, %f513, %f2996;
+ mov.f32 %f2998, 0f00000000;
+ fma.rn.f32 %f2999, %f513, %f512, %f2998;
+ fma.rn.f32 %f5392, %f2997, %f2999, %f512;
+ and.b32 %r3845, %r628, 2;
+ setp.eq.s32 %p416, %r3845, 0;
+ @%p416 bra $L__BB0_466;
+
+ mov.f32 %f3001, 0fBF800000;
+ fma.rn.f32 %f5392, %f5392, %f3001, %f2998;
+
+$L__BB0_466:
+ add.f32 %f5393, %f5389, %f5392;
+ bra.uni $L__BB0_467;
+
+$L__BB0_47:
+ mov.b32 %r2786, %f5416;
+ shl.b32 %r2787, %r2786, 8;
+ or.b32 %r34, %r2787, -2147483648;
+ mov.u64 %rd2505, 0;
+ mov.u32 %r7967, 0;
+ mov.u64 %rd624, __cudart_i2opi_f;
+ mov.u64 %rd2506, %rd2505;
+
+$L__BB0_48:
.pragma "nounroll";
- shl.b64 %rd1091, %rd2558, 2;
- add.s64 %rd1093, %rd1092, %rd1091;
- ld.global.nc.u32 %r3850, [%rd1093];
- mad.wide.u32 %rd1094, %r3850, %r609, %rd2559;
- shr.u64 %rd2559, %rd1094, 32;
- add.s64 %rd1095, %rd1, %rd1091;
- st.local.u32 [%rd1095], %rd1094;
- add.s32 %r8346, %r8346, 1;
- cvt.s64.s32 %rd2558, %r8346;
- setp.ne.s32 %p406, %r8346, 6;
- @%p406 bra $L__BB0_454;
-
- st.local.u32 [%rd5], %rd2559;
- mov.u32 %r3851, 4;
- sub.s32 %r613, %r3851, %r610;
- mov.u32 %r3852, 6;
- sub.s32 %r3853, %r3852, %r610;
- mul.wide.s32 %rd1096, %r3853, 4;
- add.s64 %rd1097, %rd1, %rd1096;
- ld.local.u32 %r8347, [%rd1097];
- ld.local.u32 %r8348, [%rd1097+-4];
- and.b32 %r616, %r608, 31;
- setp.eq.s32 %p407, %r616, 0;
- @%p407 bra $L__BB0_457;
-
- mov.u32 %r3854, 32;
- sub.s32 %r3855, %r3854, %r616;
- shr.u32 %r3856, %r8348, %r3855;
- shl.b32 %r3857, %r8347, %r616;
- add.s32 %r8347, %r3856, %r3857;
- mul.wide.s32 %rd1098, %r613, 4;
- add.s64 %rd1099, %rd1, %rd1098;
- ld.local.u32 %r3858, [%rd1099];
- shr.u32 %r3859, %r3858, %r3855;
- shl.b32 %r3860, %r8348, %r616;
- add.s32 %r8348, %r3859, %r3860;
-
-$L__BB0_457:
- and.b32 %r3861, %r607, -2147483648;
- shr.u32 %r3862, %r8348, 30;
- shl.b32 %r3863, %r8347, 2;
- or.b32 %r3864, %r3862, %r3863;
- shr.u32 %r3865, %r3864, 31;
- shr.u32 %r3866, %r8347, 30;
- add.s32 %r3867, %r3865, %r3866;
- neg.s32 %r3868, %r3867;
- setp.eq.s32 %p408, %r3861, 0;
- selp.b32 %r8349, %r3867, %r3868, %p408;
- setp.ne.s32 %p409, %r3865, 0;
- xor.b32 %r3869, %r3861, -2147483648;
- selp.b32 %r3870, %r3869, %r3861, %p409;
- selp.b32 %r3871, -1, 0, %p409;
- xor.b32 %r3872, %r3864, %r3871;
- shl.b32 %r3873, %r8348, 2;
- xor.b32 %r3874, %r3873, %r3871;
- cvt.u64.u32 %rd1100, %r3872;
- cvt.u64.u32 %rd1101, %r3874;
- bfi.b64 %rd1102, %rd1100, %rd1101, 32, 32;
- cvt.rn.f64.s64 %fd61, %rd1102;
- mul.f64 %fd62, %fd61, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2911, %fd62;
- setp.eq.s32 %p410, %r3870, 0;
- neg.f32 %f2912, %f2911;
- selp.f32 %f5319, %f2911, %f2912, %p410;
-
-$L__BB0_459:
- and.b32 %r623, %r8349, 1;
- setp.eq.s32 %p411, %r623, 0;
- selp.f32 %f492, %f5319, 0f3F800000, %p411;
- mul.rn.f32 %f493, %f5319, %f5319;
- mov.f32 %f5320, 0fB94D4153;
- @%p411 bra $L__BB0_461;
-
- mov.f32 %f2915, 0fBAB607ED;
- mov.f32 %f2916, 0f37CBAC00;
- fma.rn.f32 %f5320, %f2916, %f493, %f2915;
-
-$L__BB0_461:
- selp.f32 %f2917, 0f3C0885E4, 0f3D2AAABB, %p411;
- fma.rn.f32 %f2918, %f5320, %f493, %f2917;
- selp.f32 %f2919, 0fBE2AAAA8, 0fBEFFFFFF, %p411;
- fma.rn.f32 %f2920, %f2918, %f493, %f2919;
- mov.f32 %f2921, 0f00000000;
- fma.rn.f32 %f2922, %f493, %f492, %f2921;
- fma.rn.f32 %f5321, %f2920, %f2922, %f492;
- and.b32 %r3876, %r8349, 2;
- setp.eq.s32 %p413, %r3876, 0;
- @%p413 bra $L__BB0_463;
-
- mov.f32 %f2924, 0fBF800000;
- fma.rn.f32 %f5321, %f5321, %f2924, %f2921;
-
-$L__BB0_463:
- mul.f32 %f2925, %f303, 0f3F22F983;
- cvt.rni.s32.f32 %r8353, %f2925;
- cvt.rn.f32.s32 %f2926, %r8353;
- mov.f32 %f2927, 0fBFC90FDA;
- fma.rn.f32 %f2928, %f2926, %f2927, %f303;
- mov.f32 %f2929, 0fB3A22168;
- fma.rn.f32 %f2930, %f2926, %f2929, %f2928;
- mov.f32 %f2931, 0fA7C234C5;
- fma.rn.f32 %f5322, %f2926, %f2931, %f2930;
- abs.f32 %f500, %f303;
- setp.ltu.f32 %p414, %f500, 0f47CE4780;
- @%p414 bra $L__BB0_471;
-
- setp.eq.f32 %p415, %f500, 0f7F800000;
- @%p415 bra $L__BB0_470;
- bra.uni $L__BB0_465;
-
-$L__BB0_470:
- mov.f32 %f2934, 0f00000000;
- mul.rn.f32 %f5322, %f303, %f2934;
- mov.u32 %r8353, 0;
- bra.uni $L__BB0_471;
-
-$L__BB0_465:
- mov.b32 %r625, %f303;
- shr.u32 %r3878, %r625, 23;
- and.b32 %r3879, %r3878, 255;
- add.s32 %r626, %r3879, -128;
- shl.b32 %r3880, %r625, 8;
- or.b32 %r627, %r3880, -2147483648;
- shr.u32 %r628, %r626, 5;
- mov.u64 %rd2560, 0;
- mov.u32 %r8350, 0;
- mov.u64 %rd1106, __cudart_i2opi_f;
- mov.u64 %rd2561, %rd2560;
-
-$L__BB0_466:
+ shl.b64 %rd623, %rd2505, 2;
+ add.s64 %rd625, %rd624, %rd623;
+ ld.global.nc.u32 %r2788, [%rd625];
+ mad.wide.u32 %rd626, %r2788, %r34, %rd2506;
+ shr.u64 %rd2506, %rd626, 32;
+ add.s64 %rd627, %rd1, %rd623;
+ st.local.u32 [%rd627], %rd626;
+ add.s32 %r7967, %r7967, 1;
+ cvt.s64.s32 %rd2505, %r7967;
+ setp.ne.s32 %p63, %r7967, 6;
+ @%p63 bra $L__BB0_48;
+
+ mov.b32 %r7823, %f5416;
+ shr.u32 %r2789, %r7823, 23;
+ and.b32 %r2790, %r2789, 255;
+ add.s32 %r2791, %r2790, -128;
+ shr.u32 %r2792, %r2791, 5;
+ st.local.u32 [%rd4], %rd2506;
+ and.b32 %r39, %r2791, 31;
+ mov.u32 %r2794, 6;
+ sub.s32 %r2795, %r2794, %r2792;
+ mul.wide.s32 %rd628, %r2795, 4;
+ add.s64 %rd629, %rd1, %rd628;
+ ld.local.u32 %r7968, [%rd629];
+ ld.local.u32 %r7969, [%rd629+-4];
+ setp.eq.s32 %p64, %r39, 0;
+ @%p64 bra $L__BB0_51;
+
+ mov.b32 %r7831, %f5416;
+ shr.u32 %r7830, %r7831, 23;
+ and.b32 %r7829, %r7830, 255;
+ add.s32 %r7828, %r7829, -128;
+ shr.u32 %r7827, %r7828, 5;
+ mov.u32 %r7826, 4;
+ sub.s32 %r7825, %r7826, %r7827;
+ mov.u32 %r2796, 32;
+ sub.s32 %r2797, %r2796, %r39;
+ shr.u32 %r2798, %r7969, %r2797;
+ shl.b32 %r2799, %r7968, %r39;
+ add.s32 %r7968, %r2798, %r2799;
+ mul.wide.s32 %rd630, %r7825, 4;
+ add.s64 %rd631, %rd1, %rd630;
+ ld.local.u32 %r2800, [%rd631];
+ shr.u32 %r2801, %r2800, %r2797;
+ shl.b32 %r2802, %r7969, %r39;
+ add.s32 %r7969, %r2801, %r2802;
+
+$L__BB0_51:
+ mov.b32 %r7824, %f5416;
+ and.b32 %r2803, %r7824, -2147483648;
+ shr.u32 %r2804, %r7969, 30;
+ shl.b32 %r2805, %r7968, 2;
+ or.b32 %r2806, %r2804, %r2805;
+ shr.u32 %r2807, %r2806, 31;
+ shr.u32 %r2808, %r7968, 30;
+ add.s32 %r2809, %r2807, %r2808;
+ neg.s32 %r2810, %r2809;
+ setp.eq.s32 %p65, %r2803, 0;
+ selp.b32 %r7970, %r2809, %r2810, %p65;
+ setp.ne.s32 %p66, %r2807, 0;
+ xor.b32 %r2811, %r2803, -2147483648;
+ selp.b32 %r2812, %r2811, %r2803, %p66;
+ selp.b32 %r2813, -1, 0, %p66;
+ xor.b32 %r2814, %r2806, %r2813;
+ shl.b32 %r2815, %r7969, 2;
+ xor.b32 %r2816, %r2815, %r2813;
+ cvt.u64.u32 %rd632, %r2814;
+ cvt.u64.u32 %rd633, %r2816;
+ bfi.b64 %rd634, %rd632, %rd633, 32, 32;
+ cvt.rn.f64.s64 %fd1, %rd634;
+ mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2322, %fd2;
+ setp.eq.s32 %p67, %r2812, 0;
+ neg.f32 %f2323, %f2322;
+ selp.f32 %f5248, %f2322, %f2323, %p67;
+
+$L__BB0_53:
+ and.b32 %r48, %r7970, 1;
+ setp.eq.s32 %p68, %r48, 0;
+ mul.rn.f32 %f47, %f5248, %f5248;
+ mov.f32 %f5249, 0fB94D4153;
+ @%p68 bra $L__BB0_55;
+
+ mov.f32 %f2326, 0fBAB607ED;
+ mov.f32 %f2327, 0f37CBAC00;
+ fma.rn.f32 %f5249, %f2327, %f47, %f2326;
+
+$L__BB0_55:
+ and.b32 %r7832, %r7970, 1;
+ setp.eq.s32 %p1774, %r7832, 0;
+ selp.f32 %f5221, %f5248, 0f3F800000, %p1774;
+ selp.f32 %f2328, 0f3C0885E4, 0f3D2AAABB, %p1774;
+ fma.rn.f32 %f2329, %f5249, %f47, %f2328;
+ selp.f32 %f2330, 0fBE2AAAA8, 0fBEFFFFFF, %p1774;
+ fma.rn.f32 %f2331, %f2329, %f47, %f2330;
+ mov.f32 %f2332, 0f00000000;
+ fma.rn.f32 %f2333, %f47, %f5221, %f2332;
+ fma.rn.f32 %f5282, %f2331, %f2333, %f5221;
+ and.b32 %r2818, %r7970, 2;
+ setp.eq.s32 %p70, %r2818, 0;
+ @%p70 bra $L__BB0_57;
+
+ mov.f32 %f2335, 0fBF800000;
+ fma.rn.f32 %f5282, %f5282, %f2335, %f2332;
+
+$L__BB0_57:
+ shl.b32 %r7822, %r12, 5;
+ neg.s32 %r7821, %r7822;
+ setp.ge.s32 %p1773, %r11, %r7821;
+ @%p1773 bra $L__BB0_70;
+
+ mul.f32 %f2337, %f5607, 0f3F22F983;
+ cvt.rni.s32.f32 %r7974, %f2337;
+ cvt.rn.f32.s32 %f2338, %r7974;
+ mov.f32 %f2339, 0fBFC90FDA;
+ fma.rn.f32 %f2340, %f2338, %f2339, %f5607;
+ mov.f32 %f2341, 0fB3A22168;
+ fma.rn.f32 %f2342, %f2338, %f2341, %f2340;
+ mov.f32 %f2343, 0fA7C234C5;
+ fma.rn.f32 %f5252, %f2338, %f2343, %f2342;
+ abs.f32 %f55, %f5607;
+ setp.ltu.f32 %p72, %f55, 0f47CE4780;
+ @%p72 bra $L__BB0_66;
+
+ setp.eq.f32 %p73, %f55, 0f7F800000;
+ @%p73 bra $L__BB0_65;
+ bra.uni $L__BB0_60;
+
+$L__BB0_65:
+ mov.f32 %f2346, 0f00000000;
+ mul.rn.f32 %f5252, %f5607, %f2346;
+ mov.u32 %r7974, 0;
+ bra.uni $L__BB0_66;
+
+$L__BB0_60:
+ mov.b32 %r50, %f5607;
+ shr.u32 %r2822, %r50, 23;
+ and.b32 %r2823, %r2822, 255;
+ shl.b32 %r2824, %r50, 8;
+ or.b32 %r52, %r2824, -2147483648;
+ mov.u64 %rd2507, 0;
+ mov.u32 %r7971, 0;
+ mov.u64 %rd638, __cudart_i2opi_f;
+ mov.u64 %rd2508, %rd2507;
+
+$L__BB0_61:
.pragma "nounroll";
- shl.b64 %rd1105, %rd2560, 2;
- add.s64 %rd1107, %rd1106, %rd1105;
- ld.global.nc.u32 %r3881, [%rd1107];
- mad.wide.u32 %rd1108, %r3881, %r627, %rd2561;
- shr.u64 %rd2561, %rd1108, 32;
- add.s64 %rd1109, %rd1, %rd1105;
- st.local.u32 [%rd1109], %rd1108;
- add.s32 %r8350, %r8350, 1;
- cvt.s64.s32 %rd2560, %r8350;
- setp.ne.s32 %p416, %r8350, 6;
- @%p416 bra $L__BB0_466;
-
- st.local.u32 [%rd5], %rd2561;
- mov.u32 %r3882, 4;
- sub.s32 %r631, %r3882, %r628;
- mov.u32 %r3883, 6;
- sub.s32 %r3884, %r3883, %r628;
- mul.wide.s32 %rd1110, %r3884, 4;
- add.s64 %rd1111, %rd1, %rd1110;
- ld.local.u32 %r8351, [%rd1111];
- ld.local.u32 %r8352, [%rd1111+-4];
- and.b32 %r634, %r626, 31;
- setp.eq.s32 %p417, %r634, 0;
- @%p417 bra $L__BB0_469;
-
- mov.u32 %r3885, 32;
- sub.s32 %r3886, %r3885, %r634;
- shr.u32 %r3887, %r8352, %r3886;
- shl.b32 %r3888, %r8351, %r634;
- add.s32 %r8351, %r3887, %r3888;
- mul.wide.s32 %rd1112, %r631, 4;
- add.s64 %rd1113, %rd1, %rd1112;
- ld.local.u32 %r3889, [%rd1113];
- shr.u32 %r3890, %r3889, %r3886;
- shl.b32 %r3891, %r8352, %r634;
- add.s32 %r8352, %r3890, %r3891;
-
-$L__BB0_469:
- and.b32 %r3892, %r625, -2147483648;
- shr.u32 %r3893, %r8352, 30;
- shl.b32 %r3894, %r8351, 2;
- or.b32 %r3895, %r3893, %r3894;
- shr.u32 %r3896, %r3895, 31;
- shr.u32 %r3897, %r8351, 30;
- add.s32 %r3898, %r3896, %r3897;
- neg.s32 %r3899, %r3898;
- setp.eq.s32 %p418, %r3892, 0;
- selp.b32 %r8353, %r3898, %r3899, %p418;
- setp.ne.s32 %p419, %r3896, 0;
- xor.b32 %r3900, %r3892, -2147483648;
- selp.b32 %r3901, %r3900, %r3892, %p419;
- selp.b32 %r3902, -1, 0, %p419;
- xor.b32 %r3903, %r3895, %r3902;
- shl.b32 %r3904, %r8352, 2;
- xor.b32 %r3905, %r3904, %r3902;
- cvt.u64.u32 %rd1114, %r3903;
- cvt.u64.u32 %rd1115, %r3905;
- bfi.b64 %rd1116, %rd1114, %rd1115, 32, 32;
- cvt.rn.f64.s64 %fd63, %rd1116;
- mul.f64 %fd64, %fd63, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2932, %fd64;
- setp.eq.s32 %p420, %r3901, 0;
- neg.f32 %f2933, %f2932;
- selp.f32 %f5322, %f2932, %f2933, %p420;
+ shl.b64 %rd637, %rd2507, 2;
+ add.s64 %rd639, %rd638, %rd637;
+ ld.global.nc.u32 %r2825, [%rd639];
+ mad.wide.u32 %rd640, %r2825, %r52, %rd2508;
+ shr.u64 %rd2508, %rd640, 32;
+ add.s64 %rd641, %rd1, %rd637;
+ st.local.u32 [%rd641], %rd640;
+ add.s32 %r7971, %r7971, 1;
+ cvt.s64.s32 %rd2507, %r7971;
+ setp.ne.s32 %p74, %r7971, 6;
+ @%p74 bra $L__BB0_61;
+
+ add.s32 %r7775, %r2823, -128;
+ mov.b32 %r7774, %f5607;
+ shr.u32 %r7773, %r7774, 23;
+ and.b32 %r7772, %r7773, 255;
+ add.s32 %r7771, %r7772, -128;
+ shr.u32 %r7770, %r7771, 5;
+ st.local.u32 [%rd4], %rd2508;
+ mov.u32 %r2827, 6;
+ sub.s32 %r2828, %r2827, %r7770;
+ mul.wide.s32 %rd642, %r2828, 4;
+ add.s64 %rd643, %rd1, %rd642;
+ ld.local.u32 %r7972, [%rd643];
+ ld.local.u32 %r7973, [%rd643+-4];
+ and.b32 %r59, %r7771, 31;
+ setp.eq.s32 %p75, %r59, 0;
+ @%p75 bra $L__BB0_64;
+
+ mov.b32 %r7839, %f5607;
+ shr.u32 %r7838, %r7839, 23;
+ and.b32 %r7837, %r7838, 255;
+ add.s32 %r7836, %r7837, -128;
+ shr.u32 %r7835, %r7836, 5;
+ mov.u32 %r7834, 4;
+ sub.s32 %r7833, %r7834, %r7835;
+ mov.u32 %r2829, 32;
+ sub.s32 %r2830, %r2829, %r59;
+ shr.u32 %r2831, %r7973, %r2830;
+ shl.b32 %r2832, %r7972, %r59;
+ add.s32 %r7972, %r2831, %r2832;
+ mul.wide.s32 %rd644, %r7833, 4;
+ add.s64 %rd645, %rd1, %rd644;
+ ld.local.u32 %r2833, [%rd645];
+ shr.u32 %r2834, %r2833, %r2830;
+ shl.b32 %r2835, %r7973, %r59;
+ add.s32 %r7973, %r2834, %r2835;
+
+$L__BB0_64:
+ mov.b32 %r7776, %f5607;
+ and.b32 %r2836, %r7776, -2147483648;
+ shr.u32 %r2837, %r7973, 30;
+ shl.b32 %r2838, %r7972, 2;
+ or.b32 %r2839, %r2837, %r2838;
+ shr.u32 %r2840, %r2839, 31;
+ shr.u32 %r2841, %r7972, 30;
+ add.s32 %r2842, %r2840, %r2841;
+ neg.s32 %r2843, %r2842;
+ setp.eq.s32 %p76, %r2836, 0;
+ selp.b32 %r7974, %r2842, %r2843, %p76;
+ setp.ne.s32 %p77, %r2840, 0;
+ xor.b32 %r2844, %r2836, -2147483648;
+ selp.b32 %r2845, %r2844, %r2836, %p77;
+ selp.b32 %r2846, -1, 0, %p77;
+ xor.b32 %r2847, %r2839, %r2846;
+ shl.b32 %r2848, %r7973, 2;
+ xor.b32 %r2849, %r2848, %r2846;
+ cvt.u64.u32 %rd646, %r2847;
+ cvt.u64.u32 %rd647, %r2849;
+ bfi.b64 %rd648, %rd646, %rd647, 32, 32;
+ cvt.rn.f64.s64 %fd3, %rd648;
+ mul.f64 %fd4, %fd3, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2344, %fd4;
+ setp.eq.s32 %p78, %r2845, 0;
+ neg.f32 %f2345, %f2344;
+ selp.f32 %f5252, %f2344, %f2345, %p78;
+
+$L__BB0_66:
+ add.s32 %r66, %r7974, 1;
+ and.b32 %r67, %r66, 1;
+ setp.eq.s32 %p79, %r67, 0;
+ selp.f32 %f59, %f5252, 0f3F800000, %p79;
+ mul.rn.f32 %f60, %f5252, %f5252;
+ mov.f32 %f5253, 0fB94D4153;
+ @%p79 bra $L__BB0_68;
+
+ mov.f32 %f2348, 0fBAB607ED;
+ mov.f32 %f2349, 0f37CBAC00;
+ fma.rn.f32 %f5253, %f2349, %f60, %f2348;
+
+$L__BB0_68:
+ add.s32 %r7842, %r7974, 1;
+ add.s32 %r7841, %r7974, 1;
+ and.b32 %r7840, %r7841, 1;
+ setp.eq.s32 %p1775, %r7840, 0;
+ selp.f32 %f2350, 0f3C0885E4, 0f3D2AAABB, %p1775;
+ fma.rn.f32 %f2351, %f5253, %f60, %f2350;
+ selp.f32 %f2352, 0fBE2AAAA8, 0fBEFFFFFF, %p1775;
+ fma.rn.f32 %f2353, %f2351, %f60, %f2352;
+ mov.f32 %f2354, 0f00000000;
+ fma.rn.f32 %f2355, %f60, %f59, %f2354;
+ fma.rn.f32 %f5284, %f2353, %f2355, %f59;
+ and.b32 %r2851, %r7841, 2;
+ setp.eq.s32 %p81, %r2851, 0;
+ @%p81 bra $L__BB0_70;
+
+ mov.f32 %f2357, 0fBF800000;
+ fma.rn.f32 %f5284, %f5284, %f2357, %f2354;
+
+$L__BB0_70:
+ add.f32 %f5400, %f5282, %f5284;
+
+$L__BB0_71:
+ setp.gt.s32 %p1762, %r12, 14;
+ mov.f32 %f5281, %f5282;
+ mov.f32 %f5283, %f5284;
+ @%p1762 bra $L__BB0_100;
+
+ shl.b32 %r2852, %r12, 5;
+ mov.u32 %r2853, -32;
+ sub.s32 %r68, %r2853, %r2852;
+ setp.ge.s32 %p83, %r11, %r68;
+ mov.f32 %f5281, %f5282;
+ @%p83 bra $L__BB0_85;
+
+ mul.f32 %f2359, %f5415, 0f3F22F983;
+ cvt.rni.s32.f32 %r7978, %f2359;
+ cvt.rn.f32.s32 %f2360, %r7978;
+ mov.f32 %f2361, 0fBFC90FDA;
+ fma.rn.f32 %f2362, %f2360, %f2361, %f5415;
+ mov.f32 %f2363, 0fB3A22168;
+ fma.rn.f32 %f2364, %f2360, %f2363, %f2362;
+ mov.f32 %f2365, 0fA7C234C5;
+ fma.rn.f32 %f5259, %f2360, %f2365, %f2364;
+ abs.f32 %f72, %f5415;
+ setp.ltu.f32 %p84, %f72, 0f47CE4780;
+ @%p84 bra $L__BB0_81;
+
+ setp.eq.f32 %p85, %f72, 0f7F800000;
+ @%p85 bra $L__BB0_80;
+ bra.uni $L__BB0_75;
+
+$L__BB0_80:
+ mov.f32 %f2368, 0f00000000;
+ mul.rn.f32 %f5259, %f5415, %f2368;
+ mov.u32 %r7978, 0;
+ bra.uni $L__BB0_81;
+
+$L__BB0_75:
+ mov.b32 %r70, %f5415;
+ shr.u32 %r2855, %r70, 23;
+ and.b32 %r2856, %r2855, 255;
+ shl.b32 %r2857, %r70, 8;
+ or.b32 %r72, %r2857, -2147483648;
+ mov.u64 %rd2509, 0;
+ mov.u32 %r7975, 0;
+ mov.u64 %rd652, __cudart_i2opi_f;
+ mov.u64 %rd2510, %rd2509;
+
+$L__BB0_76:
+ .pragma "nounroll";
+ shl.b64 %rd651, %rd2509, 2;
+ add.s64 %rd653, %rd652, %rd651;
+ ld.global.nc.u32 %r2858, [%rd653];
+ mad.wide.u32 %rd654, %r2858, %r72, %rd2510;
+ shr.u64 %rd2510, %rd654, 32;
+ add.s64 %rd655, %rd1, %rd651;
+ st.local.u32 [%rd655], %rd654;
+ add.s32 %r7975, %r7975, 1;
+ cvt.s64.s32 %rd2509, %r7975;
+ setp.ne.s32 %p86, %r7975, 6;
+ @%p86 bra $L__BB0_76;
+
+ add.s32 %r7851, %r2856, -128;
+ mov.b32 %r7850, %f5415;
+ shr.u32 %r7849, %r7850, 23;
+ and.b32 %r7848, %r7849, 255;
+ add.s32 %r7847, %r7848, -128;
+ shr.u32 %r7846, %r7847, 5;
+ st.local.u32 [%rd4], %rd2510;
+ mov.u32 %r2860, 6;
+ sub.s32 %r2861, %r2860, %r7846;
+ mul.wide.s32 %rd656, %r2861, 4;
+ add.s64 %rd657, %rd1, %rd656;
+ ld.local.u32 %r7976, [%rd657];
+ ld.local.u32 %r7977, [%rd657+-4];
+ and.b32 %r79, %r7847, 31;
+ setp.eq.s32 %p87, %r79, 0;
+ @%p87 bra $L__BB0_79;
+
+ mov.b32 %r7859, %f5415;
+ shr.u32 %r7858, %r7859, 23;
+ and.b32 %r7857, %r7858, 255;
+ add.s32 %r7856, %r7857, -128;
+ shr.u32 %r7855, %r7856, 5;
+ mov.u32 %r7854, 4;
+ sub.s32 %r7853, %r7854, %r7855;
+ mov.u32 %r2862, 32;
+ sub.s32 %r2863, %r2862, %r79;
+ shr.u32 %r2864, %r7977, %r2863;
+ shl.b32 %r2865, %r7976, %r79;
+ add.s32 %r7976, %r2864, %r2865;
+ mul.wide.s32 %rd658, %r7853, 4;
+ add.s64 %rd659, %rd1, %rd658;
+ ld.local.u32 %r2866, [%rd659];
+ shr.u32 %r2867, %r2866, %r2863;
+ shl.b32 %r2868, %r7977, %r79;
+ add.s32 %r7977, %r2867, %r2868;
+
+$L__BB0_79:
+ mov.b32 %r7852, %f5415;
+ and.b32 %r2869, %r7852, -2147483648;
+ shr.u32 %r2870, %r7977, 30;
+ shl.b32 %r2871, %r7976, 2;
+ or.b32 %r2872, %r2870, %r2871;
+ shr.u32 %r2873, %r2872, 31;
+ shr.u32 %r2874, %r7976, 30;
+ add.s32 %r2875, %r2873, %r2874;
+ neg.s32 %r2876, %r2875;
+ setp.eq.s32 %p88, %r2869, 0;
+ selp.b32 %r7978, %r2875, %r2876, %p88;
+ setp.ne.s32 %p89, %r2873, 0;
+ xor.b32 %r2877, %r2869, -2147483648;
+ selp.b32 %r2878, %r2877, %r2869, %p89;
+ selp.b32 %r2879, -1, 0, %p89;
+ xor.b32 %r2880, %r2872, %r2879;
+ shl.b32 %r2881, %r7977, 2;
+ xor.b32 %r2882, %r2881, %r2879;
+ cvt.u64.u32 %rd660, %r2880;
+ cvt.u64.u32 %rd661, %r2882;
+ bfi.b64 %rd662, %rd660, %rd661, 32, 32;
+ cvt.rn.f64.s64 %fd5, %rd662;
+ mul.f64 %fd6, %fd5, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2366, %fd6;
+ setp.eq.s32 %p90, %r2878, 0;
+ neg.f32 %f2367, %f2366;
+ selp.f32 %f5259, %f2366, %f2367, %p90;
+
+$L__BB0_81:
+ and.b32 %r86, %r7978, 1;
+ setp.eq.s32 %p91, %r86, 0;
+ mul.rn.f32 %f77, %f5259, %f5259;
+ mov.f32 %f5260, 0fB94D4153;
+ @%p91 bra $L__BB0_83;
+
+ mov.f32 %f2370, 0fBAB607ED;
+ mov.f32 %f2371, 0f37CBAC00;
+ fma.rn.f32 %f5260, %f2371, %f77, %f2370;
+
+$L__BB0_83:
+ and.b32 %r7860, %r7978, 1;
+ setp.eq.s32 %p1778, %r7860, 0;
+ selp.f32 %f5222, %f5259, 0f3F800000, %p1778;
+ selp.f32 %f2372, 0f3C0885E4, 0f3D2AAABB, %p1778;
+ fma.rn.f32 %f2373, %f5260, %f77, %f2372;
+ selp.f32 %f2374, 0fBE2AAAA8, 0fBEFFFFFF, %p1778;
+ fma.rn.f32 %f2375, %f2373, %f77, %f2374;
+ mov.f32 %f2376, 0f00000000;
+ fma.rn.f32 %f2377, %f77, %f5222, %f2376;
+ fma.rn.f32 %f5281, %f2375, %f2377, %f5222;
+ and.b32 %r2884, %r7978, 2;
+ setp.eq.s32 %p93, %r2884, 0;
+ @%p93 bra $L__BB0_85;
+
+ mov.f32 %f2379, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f2379, %f2376;
+
+$L__BB0_85:
+ shl.b32 %r7845, %r12, 5;
+ mov.u32 %r7844, -32;
+ sub.s32 %r7843, %r7844, %r7845;
+ setp.ge.s32 %p1776, %r11, %r7843;
+ mov.f32 %f5283, %f5284;
+ @%p1776 bra $L__BB0_98;
+
+ mul.f32 %f2380, %f5606, 0f3F22F983;
+ cvt.rni.s32.f32 %r7982, %f2380;
+ cvt.rn.f32.s32 %f2381, %r7982;
+ mov.f32 %f2382, 0fBFC90FDA;
+ fma.rn.f32 %f2383, %f2381, %f2382, %f5606;
+ mov.f32 %f2384, 0fB3A22168;
+ fma.rn.f32 %f2385, %f2381, %f2384, %f2383;
+ mov.f32 %f2386, 0fA7C234C5;
+ fma.rn.f32 %f5263, %f2381, %f2386, %f2385;
+ abs.f32 %f85, %f5606;
+ setp.ltu.f32 %p95, %f85, 0f47CE4780;
+ @%p95 bra $L__BB0_94;
+
+ setp.eq.f32 %p96, %f85, 0f7F800000;
+ @%p96 bra $L__BB0_93;
+ bra.uni $L__BB0_88;
+
+$L__BB0_93:
+ mov.f32 %f2389, 0f00000000;
+ mul.rn.f32 %f5263, %f5606, %f2389;
+ mov.u32 %r7982, 0;
+ bra.uni $L__BB0_94;
+
+$L__BB0_88:
+ mov.b32 %r88, %f5606;
+ shr.u32 %r2886, %r88, 23;
+ and.b32 %r2887, %r2886, 255;
+ shl.b32 %r2888, %r88, 8;
+ or.b32 %r90, %r2888, -2147483648;
+ mov.u64 %rd2511, 0;
+ mov.u32 %r7979, 0;
+ mov.u64 %rd666, __cudart_i2opi_f;
+ mov.u64 %rd2512, %rd2511;
+
+$L__BB0_89:
+ .pragma "nounroll";
+ shl.b64 %rd665, %rd2511, 2;
+ add.s64 %rd667, %rd666, %rd665;
+ ld.global.nc.u32 %r2889, [%rd667];
+ mad.wide.u32 %rd668, %r2889, %r90, %rd2512;
+ shr.u64 %rd2512, %rd668, 32;
+ add.s64 %rd669, %rd1, %rd665;
+ st.local.u32 [%rd669], %rd668;
+ add.s32 %r7979, %r7979, 1;
+ cvt.s64.s32 %rd2511, %r7979;
+ setp.ne.s32 %p97, %r7979, 6;
+ @%p97 bra $L__BB0_89;
+
+ add.s32 %r7869, %r2887, -128;
+ mov.b32 %r7868, %f5606;
+ shr.u32 %r7867, %r7868, 23;
+ and.b32 %r7866, %r7867, 255;
+ add.s32 %r7865, %r7866, -128;
+ shr.u32 %r7864, %r7865, 5;
+ st.local.u32 [%rd4], %rd2512;
+ mov.u32 %r2891, 6;
+ sub.s32 %r2892, %r2891, %r7864;
+ mul.wide.s32 %rd670, %r2892, 4;
+ add.s64 %rd671, %rd1, %rd670;
+ ld.local.u32 %r7980, [%rd671];
+ ld.local.u32 %r7981, [%rd671+-4];
+ and.b32 %r97, %r7865, 31;
+ setp.eq.s32 %p98, %r97, 0;
+ @%p98 bra $L__BB0_92;
+
+ mov.b32 %r7877, %f5606;
+ shr.u32 %r7876, %r7877, 23;
+ and.b32 %r7875, %r7876, 255;
+ add.s32 %r7874, %r7875, -128;
+ shr.u32 %r7873, %r7874, 5;
+ mov.u32 %r7872, 4;
+ sub.s32 %r7871, %r7872, %r7873;
+ mov.u32 %r2893, 32;
+ sub.s32 %r2894, %r2893, %r97;
+ shr.u32 %r2895, %r7981, %r2894;
+ shl.b32 %r2896, %r7980, %r97;
+ add.s32 %r7980, %r2895, %r2896;
+ mul.wide.s32 %rd672, %r7871, 4;
+ add.s64 %rd673, %rd1, %rd672;
+ ld.local.u32 %r2897, [%rd673];
+ shr.u32 %r2898, %r2897, %r2894;
+ shl.b32 %r2899, %r7981, %r97;
+ add.s32 %r7981, %r2898, %r2899;
+
+$L__BB0_92:
+ mov.b32 %r7870, %f5606;
+ and.b32 %r2900, %r7870, -2147483648;
+ shr.u32 %r2901, %r7981, 30;
+ shl.b32 %r2902, %r7980, 2;
+ or.b32 %r2903, %r2901, %r2902;
+ shr.u32 %r2904, %r2903, 31;
+ shr.u32 %r2905, %r7980, 30;
+ add.s32 %r2906, %r2904, %r2905;
+ neg.s32 %r2907, %r2906;
+ setp.eq.s32 %p99, %r2900, 0;
+ selp.b32 %r7982, %r2906, %r2907, %p99;
+ setp.ne.s32 %p100, %r2904, 0;
+ xor.b32 %r2908, %r2900, -2147483648;
+ selp.b32 %r2909, %r2908, %r2900, %p100;
+ selp.b32 %r2910, -1, 0, %p100;
+ xor.b32 %r2911, %r2903, %r2910;
+ shl.b32 %r2912, %r7981, 2;
+ xor.b32 %r2913, %r2912, %r2910;
+ cvt.u64.u32 %rd674, %r2911;
+ cvt.u64.u32 %rd675, %r2913;
+ bfi.b64 %rd676, %rd674, %rd675, 32, 32;
+ cvt.rn.f64.s64 %fd7, %rd676;
+ mul.f64 %fd8, %fd7, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2387, %fd8;
+ setp.eq.s32 %p101, %r2909, 0;
+ neg.f32 %f2388, %f2387;
+ selp.f32 %f5263, %f2387, %f2388, %p101;
+
+$L__BB0_94:
+ add.s32 %r104, %r7982, 1;
+ and.b32 %r105, %r104, 1;
+ setp.eq.s32 %p102, %r105, 0;
+ selp.f32 %f89, %f5263, 0f3F800000, %p102;
+ mul.rn.f32 %f90, %f5263, %f5263;
+ mov.f32 %f5264, 0fB94D4153;
+ @%p102 bra $L__BB0_96;
+
+ mov.f32 %f2391, 0fBAB607ED;
+ mov.f32 %f2392, 0f37CBAC00;
+ fma.rn.f32 %f5264, %f2392, %f90, %f2391;
+
+$L__BB0_96:
+ add.s32 %r7880, %r7982, 1;
+ add.s32 %r7879, %r7982, 1;
+ and.b32 %r7878, %r7879, 1;
+ setp.eq.s32 %p1780, %r7878, 0;
+ selp.f32 %f2393, 0f3C0885E4, 0f3D2AAABB, %p1780;
+ fma.rn.f32 %f2394, %f5264, %f90, %f2393;
+ selp.f32 %f2395, 0fBE2AAAA8, 0fBEFFFFFF, %p1780;
+ fma.rn.f32 %f2396, %f2394, %f90, %f2395;
+ mov.f32 %f2397, 0f00000000;
+ fma.rn.f32 %f2398, %f90, %f89, %f2397;
+ fma.rn.f32 %f5283, %f2396, %f2398, %f89;
+ and.b32 %r2915, %r7879, 2;
+ setp.eq.s32 %p104, %r2915, 0;
+ @%p104 bra $L__BB0_98;
+
+ mov.f32 %f2400, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f2400, %f2397;
+
+$L__BB0_98:
+ shl.b32 %r7863, %r12, 5;
+ mov.u32 %r7862, -32;
+ sub.s32 %r7861, %r7862, %r7863;
+ setp.lt.s32 %p1779, %r11, %r7861;
+ setp.ge.s32 %p1777, %r11, %r68;
+ selp.f32 %f97, %f5283, %f5284, %p1779;
+ selp.f32 %f98, %f5281, %f5282, %p1779;
+ @%p1777 bra $L__BB0_100;
+
+ add.f32 %f5399, %f98, %f97;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_100:
+ add.s32 %r7804, %r12, 1;
+ setp.gt.s32 %p1771, %r7804, 14;
+ @%p1771 bra $L__BB0_129;
+
+ shl.b32 %r2917, %r12, 5;
+ neg.s32 %r106, %r2917;
+ setp.ge.s32 %p108, %r11, %r106;
+ @%p108 bra $L__BB0_114;
+
+ mul.f32 %f2403, %f5414, 0f3F22F983;
+ cvt.rni.s32.f32 %r7986, %f2403;
+ cvt.rn.f32.s32 %f2404, %r7986;
+ mov.f32 %f2405, 0fBFC90FDA;
+ fma.rn.f32 %f2406, %f2404, %f2405, %f5414;
+ mov.f32 %f2407, 0fB3A22168;
+ fma.rn.f32 %f2408, %f2404, %f2407, %f2406;
+ mov.f32 %f2409, 0fA7C234C5;
+ fma.rn.f32 %f5272, %f2404, %f2409, %f2408;
+ abs.f32 %f106, %f5414;
+ setp.ltu.f32 %p109, %f106, 0f47CE4780;
+ @%p109 bra $L__BB0_110;
+
+ setp.eq.f32 %p110, %f106, 0f7F800000;
+ @%p110 bra $L__BB0_109;
+ bra.uni $L__BB0_104;
+
+$L__BB0_109:
+ mov.f32 %f2412, 0f00000000;
+ mul.rn.f32 %f5272, %f5414, %f2412;
+ mov.u32 %r7986, 0;
+ bra.uni $L__BB0_110;
+
+$L__BB0_104:
+ mov.b32 %r108, %f5414;
+ shr.u32 %r2919, %r108, 23;
+ and.b32 %r2920, %r2919, 255;
+ shl.b32 %r2921, %r108, 8;
+ or.b32 %r110, %r2921, -2147483648;
+ mov.u64 %rd2513, 0;
+ mov.u32 %r7983, 0;
+ mov.u64 %rd680, __cudart_i2opi_f;
+ mov.u64 %rd2514, %rd2513;
+
+$L__BB0_105:
+ .pragma "nounroll";
+ shl.b64 %rd679, %rd2513, 2;
+ add.s64 %rd681, %rd680, %rd679;
+ ld.global.nc.u32 %r2922, [%rd681];
+ mad.wide.u32 %rd682, %r2922, %r110, %rd2514;
+ shr.u64 %rd2514, %rd682, 32;
+ add.s64 %rd683, %rd1, %rd679;
+ st.local.u32 [%rd683], %rd682;
+ add.s32 %r7983, %r7983, 1;
+ cvt.s64.s32 %rd2513, %r7983;
+ setp.ne.s32 %p111, %r7983, 6;
+ @%p111 bra $L__BB0_105;
+
+ add.s32 %r7886, %r2920, -128;
+ mov.b32 %r7885, %f5414;
+ shr.u32 %r7884, %r7885, 23;
+ and.b32 %r7883, %r7884, 255;
+ add.s32 %r7882, %r7883, -128;
+ shr.u32 %r7881, %r7882, 5;
+ st.local.u32 [%rd4], %rd2514;
+ mov.u32 %r2924, 6;
+ sub.s32 %r2925, %r2924, %r7881;
+ mul.wide.s32 %rd684, %r2925, 4;
+ add.s64 %rd685, %rd1, %rd684;
+ ld.local.u32 %r7984, [%rd685];
+ ld.local.u32 %r7985, [%rd685+-4];
+ and.b32 %r117, %r7882, 31;
+ setp.eq.s32 %p112, %r117, 0;
+ @%p112 bra $L__BB0_108;
+
+ mov.b32 %r7896, %f5414;
+ shr.u32 %r7895, %r7896, 23;
+ and.b32 %r7894, %r7895, 255;
+ add.s32 %r7893, %r7894, -128;
+ shr.u32 %r7892, %r7893, 5;
+ mov.u32 %r7891, 4;
+ sub.s32 %r7890, %r7891, %r7892;
+ mov.u32 %r2926, 32;
+ sub.s32 %r2927, %r2926, %r117;
+ shr.u32 %r2928, %r7985, %r2927;
+ shl.b32 %r2929, %r7984, %r117;
+ add.s32 %r7984, %r2928, %r2929;
+ mul.wide.s32 %rd686, %r7890, 4;
+ add.s64 %rd687, %rd1, %rd686;
+ ld.local.u32 %r2930, [%rd687];
+ shr.u32 %r2931, %r2930, %r2927;
+ shl.b32 %r2932, %r7985, %r117;
+ add.s32 %r7985, %r2931, %r2932;
+
+$L__BB0_108:
+ mov.b32 %r7887, %f5414;
+ and.b32 %r2933, %r7887, -2147483648;
+ shr.u32 %r2934, %r7985, 30;
+ shl.b32 %r2935, %r7984, 2;
+ or.b32 %r2936, %r2934, %r2935;
+ shr.u32 %r2937, %r2936, 31;
+ shr.u32 %r2938, %r7984, 30;
+ add.s32 %r2939, %r2937, %r2938;
+ neg.s32 %r2940, %r2939;
+ setp.eq.s32 %p113, %r2933, 0;
+ selp.b32 %r7986, %r2939, %r2940, %p113;
+ setp.ne.s32 %p114, %r2937, 0;
+ xor.b32 %r2941, %r2933, -2147483648;
+ selp.b32 %r2942, %r2941, %r2933, %p114;
+ selp.b32 %r2943, -1, 0, %p114;
+ xor.b32 %r2944, %r2936, %r2943;
+ shl.b32 %r2945, %r7985, 2;
+ xor.b32 %r2946, %r2945, %r2943;
+ cvt.u64.u32 %rd688, %r2944;
+ cvt.u64.u32 %rd689, %r2946;
+ bfi.b64 %rd690, %rd688, %rd689, 32, 32;
+ cvt.rn.f64.s64 %fd9, %rd690;
+ mul.f64 %fd10, %fd9, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2410, %fd10;
+ setp.eq.s32 %p115, %r2942, 0;
+ neg.f32 %f2411, %f2410;
+ selp.f32 %f5272, %f2410, %f2411, %p115;
+
+$L__BB0_110:
+ and.b32 %r124, %r7986, 1;
+ setp.eq.s32 %p116, %r124, 0;
+ mul.rn.f32 %f111, %f5272, %f5272;
+ mov.f32 %f5273, 0fB94D4153;
+ @%p116 bra $L__BB0_112;
+
+ mov.f32 %f2414, 0fBAB607ED;
+ mov.f32 %f2415, 0f37CBAC00;
+ fma.rn.f32 %f5273, %f2415, %f111, %f2414;
+
+$L__BB0_112:
+ and.b32 %r7952, %r7986, 1;
+ setp.eq.s32 %p1787, %r7952, 0;
+ selp.f32 %f5223, %f5272, 0f3F800000, %p1787;
+ selp.f32 %f2416, 0f3C0885E4, 0f3D2AAABB, %p1787;
+ fma.rn.f32 %f2417, %f5273, %f111, %f2416;
+ selp.f32 %f2418, 0fBE2AAAA8, 0fBEFFFFFF, %p1787;
+ fma.rn.f32 %f2419, %f2417, %f111, %f2418;
+ mov.f32 %f2420, 0f00000000;
+ fma.rn.f32 %f2421, %f111, %f5223, %f2420;
+ fma.rn.f32 %f5281, %f2419, %f2421, %f5223;
+ and.b32 %r2948, %r7986, 2;
+ setp.eq.s32 %p118, %r2948, 0;
+ @%p118 bra $L__BB0_114;
+
+ mov.f32 %f2423, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f2423, %f2420;
+
+$L__BB0_114:
+ shl.b32 %r7951, %r12, 5;
+ neg.s32 %r7950, %r7951;
+ setp.ge.s32 %p1786, %r11, %r7950;
+ @%p1786 bra $L__BB0_127;
+
+ mul.f32 %f2424, %f5406, 0f3F22F983;
+ cvt.rni.s32.f32 %r7990, %f2424;
+ cvt.rn.f32.s32 %f2425, %r7990;
+ mov.f32 %f2426, 0fBFC90FDA;
+ fma.rn.f32 %f2427, %f2425, %f2426, %f5406;
+ mov.f32 %f2428, 0fB3A22168;
+ fma.rn.f32 %f2429, %f2425, %f2428, %f2427;
+ mov.f32 %f2430, 0fA7C234C5;
+ fma.rn.f32 %f5276, %f2425, %f2430, %f2429;
+ abs.f32 %f119, %f5406;
+ setp.ltu.f32 %p120, %f119, 0f47CE4780;
+ @%p120 bra $L__BB0_123;
+
+ setp.eq.f32 %p121, %f119, 0f7F800000;
+ @%p121 bra $L__BB0_122;
+ bra.uni $L__BB0_117;
+
+$L__BB0_122:
+ mov.f32 %f2433, 0f00000000;
+ mul.rn.f32 %f5276, %f5406, %f2433;
+ mov.u32 %r7990, 0;
+ bra.uni $L__BB0_123;
+
+$L__BB0_117:
+ mov.b32 %r126, %f5406;
+ shr.u32 %r2950, %r126, 23;
+ and.b32 %r2951, %r2950, 255;
+ shl.b32 %r2952, %r126, 8;
+ or.b32 %r128, %r2952, -2147483648;
+ mov.u64 %rd2515, 0;
+ mov.u32 %r7987, 0;
+ mov.u64 %rd694, __cudart_i2opi_f;
+ mov.u64 %rd2516, %rd2515;
+
+$L__BB0_118:
+ .pragma "nounroll";
+ shl.b64 %rd693, %rd2515, 2;
+ add.s64 %rd695, %rd694, %rd693;
+ ld.global.nc.u32 %r2953, [%rd695];
+ mad.wide.u32 %rd696, %r2953, %r128, %rd2516;
+ shr.u64 %rd2516, %rd696, 32;
+ add.s64 %rd697, %rd1, %rd693;
+ st.local.u32 [%rd697], %rd696;
+ add.s32 %r7987, %r7987, 1;
+ cvt.s64.s32 %rd2515, %r7987;
+ setp.ne.s32 %p122, %r7987, 6;
+ @%p122 bra $L__BB0_118;
+
+ add.s32 %r7902, %r2951, -128;
+ mov.b32 %r7901, %f5406;
+ shr.u32 %r7900, %r7901, 23;
+ and.b32 %r7899, %r7900, 255;
+ add.s32 %r7898, %r7899, -128;
+ shr.u32 %r7897, %r7898, 5;
+ st.local.u32 [%rd4], %rd2516;
+ mov.u32 %r2955, 6;
+ sub.s32 %r2956, %r2955, %r7897;
+ mul.wide.s32 %rd698, %r2956, 4;
+ add.s64 %rd699, %rd1, %rd698;
+ ld.local.u32 %r7988, [%rd699];
+ ld.local.u32 %r7989, [%rd699+-4];
+ and.b32 %r135, %r7898, 31;
+ setp.eq.s32 %p123, %r135, 0;
+ @%p123 bra $L__BB0_121;
+
+ mov.b32 %r7914, %f5406;
+ shr.u32 %r7913, %r7914, 23;
+ and.b32 %r7912, %r7913, 255;
+ add.s32 %r7911, %r7912, -128;
+ shr.u32 %r7910, %r7911, 5;
+ mov.u32 %r7909, 4;
+ sub.s32 %r7908, %r7909, %r7910;
+ mov.u32 %r2957, 32;
+ sub.s32 %r2958, %r2957, %r135;
+ shr.u32 %r2959, %r7989, %r2958;
+ shl.b32 %r2960, %r7988, %r135;
+ add.s32 %r7988, %r2959, %r2960;
+ mul.wide.s32 %rd700, %r7908, 4;
+ add.s64 %rd701, %rd1, %rd700;
+ ld.local.u32 %r2961, [%rd701];
+ shr.u32 %r2962, %r2961, %r2958;
+ shl.b32 %r2963, %r7989, %r135;
+ add.s32 %r7989, %r2962, %r2963;
+
+$L__BB0_121:
+ mov.b32 %r7903, %f5406;
+ and.b32 %r2964, %r7903, -2147483648;
+ shr.u32 %r2965, %r7989, 30;
+ shl.b32 %r2966, %r7988, 2;
+ or.b32 %r2967, %r2965, %r2966;
+ shr.u32 %r2968, %r2967, 31;
+ shr.u32 %r2969, %r7988, 30;
+ add.s32 %r2970, %r2968, %r2969;
+ neg.s32 %r2971, %r2970;
+ setp.eq.s32 %p124, %r2964, 0;
+ selp.b32 %r7990, %r2970, %r2971, %p124;
+ setp.ne.s32 %p125, %r2968, 0;
+ xor.b32 %r2972, %r2964, -2147483648;
+ selp.b32 %r2973, %r2972, %r2964, %p125;
+ selp.b32 %r2974, -1, 0, %p125;
+ xor.b32 %r2975, %r2967, %r2974;
+ shl.b32 %r2976, %r7989, 2;
+ xor.b32 %r2977, %r2976, %r2974;
+ cvt.u64.u32 %rd702, %r2975;
+ cvt.u64.u32 %rd703, %r2977;
+ bfi.b64 %rd704, %rd702, %rd703, 32, 32;
+ cvt.rn.f64.s64 %fd11, %rd704;
+ mul.f64 %fd12, %fd11, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2431, %fd12;
+ setp.eq.s32 %p126, %r2973, 0;
+ neg.f32 %f2432, %f2431;
+ selp.f32 %f5276, %f2431, %f2432, %p126;
+
+$L__BB0_123:
+ add.s32 %r142, %r7990, 1;
+ and.b32 %r143, %r142, 1;
+ setp.eq.s32 %p127, %r143, 0;
+ selp.f32 %f123, %f5276, 0f3F800000, %p127;
+ mul.rn.f32 %f124, %f5276, %f5276;
+ mov.f32 %f5277, 0fB94D4153;
+ @%p127 bra $L__BB0_125;
+
+ mov.f32 %f2435, 0fBAB607ED;
+ mov.f32 %f2436, 0f37CBAC00;
+ fma.rn.f32 %f5277, %f2436, %f124, %f2435;
+
+$L__BB0_125:
+ selp.f32 %f2437, 0f3C0885E4, 0f3D2AAABB, %p127;
+ fma.rn.f32 %f2438, %f5277, %f124, %f2437;
+ selp.f32 %f2439, 0fBE2AAAA8, 0fBEFFFFFF, %p127;
+ fma.rn.f32 %f2440, %f2438, %f124, %f2439;
+ mov.f32 %f2441, 0f00000000;
+ fma.rn.f32 %f2442, %f124, %f123, %f2441;
+ fma.rn.f32 %f5283, %f2440, %f2442, %f123;
+ and.b32 %r2979, %r142, 2;
+ setp.eq.s32 %p129, %r2979, 0;
+ @%p129 bra $L__BB0_127;
+
+ mov.f32 %f2444, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f2444, %f2441;
+
+$L__BB0_127:
+ shl.b32 %r7907, %r12, 5;
+ neg.s32 %r7906, %r7907;
+ setp.lt.s32 %p1782, %r11, %r7906;
+ shl.b32 %r7905, %r12, 5;
+ neg.s32 %r7904, %r7905;
+ setp.ge.s32 %p1781, %r11, %r7904;
+ selp.f32 %f131, %f5283, %f5284, %p1782;
+ selp.f32 %f132, %f5281, %f5282, %p1782;
+ @%p1781 bra $L__BB0_129;
+
+ add.f32 %f5398, %f132, %f131;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_129:
+ add.s32 %r7805, %r12, 1;
+ setp.gt.s32 %p1772, %r7805, 14;
+ @%p1772 bra $L__BB0_158;
+
+ shl.b32 %r2981, %r12, 5;
+ mov.u32 %r2982, -32;
+ sub.s32 %r144, %r2982, %r2981;
+ setp.ge.s32 %p133, %r11, %r144;
+ @%p133 bra $L__BB0_143;
+
+ mul.f32 %f2447, %f5413, 0f3F22F983;
+ cvt.rni.s32.f32 %r7994, %f2447;
+ cvt.rn.f32.s32 %f2448, %r7994;
+ mov.f32 %f2449, 0fBFC90FDA;
+ fma.rn.f32 %f2450, %f2448, %f2449, %f5413;
+ mov.f32 %f2451, 0fB3A22168;
+ fma.rn.f32 %f2452, %f2448, %f2451, %f2450;
+ mov.f32 %f2453, 0fA7C234C5;
+ fma.rn.f32 %f5285, %f2448, %f2453, %f2452;
+ abs.f32 %f140, %f5413;
+ setp.ltu.f32 %p134, %f140, 0f47CE4780;
+ @%p134 bra $L__BB0_139;
+
+ setp.eq.f32 %p135, %f140, 0f7F800000;
+ @%p135 bra $L__BB0_138;
+ bra.uni $L__BB0_133;
+
+$L__BB0_138:
+ mov.f32 %f2456, 0f00000000;
+ mul.rn.f32 %f5285, %f5413, %f2456;
+ mov.u32 %r7994, 0;
+ bra.uni $L__BB0_139;
+
+$L__BB0_133:
+ mov.b32 %r146, %f5413;
+ shr.u32 %r2984, %r146, 23;
+ and.b32 %r2985, %r2984, 255;
+ shl.b32 %r2986, %r146, 8;
+ or.b32 %r148, %r2986, -2147483648;
+ mov.u64 %rd2517, 0;
+ mov.u32 %r7991, 0;
+ mov.u64 %rd708, __cudart_i2opi_f;
+ mov.u64 %rd2518, %rd2517;
+
+$L__BB0_134:
+ .pragma "nounroll";
+ shl.b64 %rd707, %rd2517, 2;
+ add.s64 %rd709, %rd708, %rd707;
+ ld.global.nc.u32 %r2987, [%rd709];
+ mad.wide.u32 %rd710, %r2987, %r148, %rd2518;
+ shr.u64 %rd2518, %rd710, 32;
+ add.s64 %rd711, %rd1, %rd707;
+ st.local.u32 [%rd711], %rd710;
+ add.s32 %r7991, %r7991, 1;
+ cvt.s64.s32 %rd2517, %r7991;
+ setp.ne.s32 %p136, %r7991, 6;
+ @%p136 bra $L__BB0_134;
+
+ add.s32 %r7920, %r2985, -128;
+ mov.b32 %r7919, %f5413;
+ shr.u32 %r7918, %r7919, 23;
+ and.b32 %r7917, %r7918, 255;
+ add.s32 %r7916, %r7917, -128;
+ shr.u32 %r7915, %r7916, 5;
+ st.local.u32 [%rd4], %rd2518;
+ mov.u32 %r2988, 4;
+ sub.s32 %r152, %r2988, %r7915;
+ mov.u32 %r2989, 6;
+ sub.s32 %r2990, %r2989, %r7915;
+ mul.wide.s32 %rd712, %r2990, 4;
+ add.s64 %rd713, %rd1, %rd712;
+ ld.local.u32 %r7992, [%rd713];
+ ld.local.u32 %r7993, [%rd713+-4];
+ and.b32 %r155, %r7916, 31;
+ setp.eq.s32 %p137, %r155, 0;
+ @%p137 bra $L__BB0_137;
+
+ mov.u32 %r2991, 32;
+ sub.s32 %r2992, %r2991, %r155;
+ shr.u32 %r2993, %r7993, %r2992;
+ shl.b32 %r2994, %r7992, %r155;
+ add.s32 %r7992, %r2993, %r2994;
+ mul.wide.s32 %rd714, %r152, 4;
+ add.s64 %rd715, %rd1, %rd714;
+ ld.local.u32 %r2995, [%rd715];
+ shr.u32 %r2996, %r2995, %r2992;
+ shl.b32 %r2997, %r7993, %r155;
+ add.s32 %r7993, %r2996, %r2997;
+
+$L__BB0_137:
+ mov.b32 %r7921, %f5413;
+ and.b32 %r2998, %r7921, -2147483648;
+ shr.u32 %r2999, %r7993, 30;
+ shl.b32 %r3000, %r7992, 2;
+ or.b32 %r3001, %r2999, %r3000;
+ shr.u32 %r3002, %r3001, 31;
+ shr.u32 %r3003, %r7992, 30;
+ add.s32 %r3004, %r3002, %r3003;
+ neg.s32 %r3005, %r3004;
+ setp.eq.s32 %p138, %r2998, 0;
+ selp.b32 %r7994, %r3004, %r3005, %p138;
+ setp.ne.s32 %p139, %r3002, 0;
+ xor.b32 %r3006, %r2998, -2147483648;
+ selp.b32 %r3007, %r3006, %r2998, %p139;
+ selp.b32 %r3008, -1, 0, %p139;
+ xor.b32 %r3009, %r3001, %r3008;
+ shl.b32 %r3010, %r7993, 2;
+ xor.b32 %r3011, %r3010, %r3008;
+ cvt.u64.u32 %rd716, %r3009;
+ cvt.u64.u32 %rd717, %r3011;
+ bfi.b64 %rd718, %rd716, %rd717, 32, 32;
+ cvt.rn.f64.s64 %fd13, %rd718;
+ mul.f64 %fd14, %fd13, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2454, %fd14;
+ setp.eq.s32 %p140, %r3007, 0;
+ neg.f32 %f2455, %f2454;
+ selp.f32 %f5285, %f2454, %f2455, %p140;
+
+$L__BB0_139:
+ and.b32 %r162, %r7994, 1;
+ setp.eq.s32 %p141, %r162, 0;
+ selp.f32 %f144, %f5285, 0f3F800000, %p141;
+ mul.rn.f32 %f145, %f5285, %f5285;
+ mov.f32 %f5286, 0fB94D4153;
+ @%p141 bra $L__BB0_141;
+
+ mov.f32 %f2458, 0fBAB607ED;
+ mov.f32 %f2459, 0f37CBAC00;
+ fma.rn.f32 %f5286, %f2459, %f145, %f2458;
+
+$L__BB0_141:
+ selp.f32 %f2460, 0f3C0885E4, 0f3D2AAABB, %p141;
+ fma.rn.f32 %f2461, %f5286, %f145, %f2460;
+ selp.f32 %f2462, 0fBE2AAAA8, 0fBEFFFFFF, %p141;
+ fma.rn.f32 %f2463, %f2461, %f145, %f2462;
+ mov.f32 %f2464, 0f00000000;
+ fma.rn.f32 %f2465, %f145, %f144, %f2464;
+ fma.rn.f32 %f5281, %f2463, %f2465, %f144;
+ and.b32 %r3013, %r7994, 2;
+ setp.eq.s32 %p143, %r3013, 0;
+ @%p143 bra $L__BB0_143;
+
+ mov.f32 %f2467, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f2467, %f2464;
+
+$L__BB0_143:
+ shl.b32 %r7955, %r12, 5;
+ mov.u32 %r7954, -32;
+ sub.s32 %r7953, %r7954, %r7955;
+ setp.ge.s32 %p1788, %r11, %r7953;
+ @%p1788 bra $L__BB0_156;
+
+ mul.f32 %f2468, %f5405, 0f3F22F983;
+ cvt.rni.s32.f32 %r7998, %f2468;
+ cvt.rn.f32.s32 %f2469, %r7998;
+ mov.f32 %f2470, 0fBFC90FDA;
+ fma.rn.f32 %f2471, %f2469, %f2470, %f5405;
+ mov.f32 %f2472, 0fB3A22168;
+ fma.rn.f32 %f2473, %f2469, %f2472, %f2471;
+ mov.f32 %f2474, 0fA7C234C5;
+ fma.rn.f32 %f5289, %f2469, %f2474, %f2473;
+ abs.f32 %f153, %f5405;
+ setp.ltu.f32 %p145, %f153, 0f47CE4780;
+ @%p145 bra $L__BB0_152;
+
+ setp.eq.f32 %p146, %f153, 0f7F800000;
+ @%p146 bra $L__BB0_151;
+ bra.uni $L__BB0_146;
+
+$L__BB0_151:
+ mov.f32 %f2477, 0f00000000;
+ mul.rn.f32 %f5289, %f5405, %f2477;
+ mov.u32 %r7998, 0;
+ bra.uni $L__BB0_152;
+
+$L__BB0_146:
+ mov.b32 %r164, %f5405;
+ shr.u32 %r3015, %r164, 23;
+ and.b32 %r3016, %r3015, 255;
+ shl.b32 %r3017, %r164, 8;
+ or.b32 %r166, %r3017, -2147483648;
+ mov.u64 %rd2519, 0;
+ mov.u32 %r7995, 0;
+ mov.u64 %rd722, __cudart_i2opi_f;
+ mov.u64 %rd2520, %rd2519;
+
+$L__BB0_147:
+ .pragma "nounroll";
+ shl.b64 %rd721, %rd2519, 2;
+ add.s64 %rd723, %rd722, %rd721;
+ ld.global.nc.u32 %r3018, [%rd723];
+ mad.wide.u32 %rd724, %r3018, %r166, %rd2520;
+ shr.u64 %rd2520, %rd724, 32;
+ add.s64 %rd725, %rd1, %rd721;
+ st.local.u32 [%rd725], %rd724;
+ add.s32 %r7995, %r7995, 1;
+ cvt.s64.s32 %rd2519, %r7995;
+ setp.ne.s32 %p147, %r7995, 6;
+ @%p147 bra $L__BB0_147;
+
+ add.s32 %r7930, %r3016, -128;
+ mov.b32 %r7929, %f5405;
+ shr.u32 %r7928, %r7929, 23;
+ and.b32 %r7927, %r7928, 255;
+ add.s32 %r7926, %r7927, -128;
+ shr.u32 %r7925, %r7926, 5;
+ st.local.u32 [%rd4], %rd2520;
+ mov.u32 %r3019, 4;
+ sub.s32 %r170, %r3019, %r7925;
+ mov.u32 %r3020, 6;
+ sub.s32 %r3021, %r3020, %r7925;
+ mul.wide.s32 %rd726, %r3021, 4;
+ add.s64 %rd727, %rd1, %rd726;
+ ld.local.u32 %r7996, [%rd727];
+ ld.local.u32 %r7997, [%rd727+-4];
+ and.b32 %r173, %r7926, 31;
+ setp.eq.s32 %p148, %r173, 0;
+ @%p148 bra $L__BB0_150;
+
+ mov.u32 %r3022, 32;
+ sub.s32 %r3023, %r3022, %r173;
+ shr.u32 %r3024, %r7997, %r3023;
+ shl.b32 %r3025, %r7996, %r173;
+ add.s32 %r7996, %r3024, %r3025;
+ mul.wide.s32 %rd728, %r170, 4;
+ add.s64 %rd729, %rd1, %rd728;
+ ld.local.u32 %r3026, [%rd729];
+ shr.u32 %r3027, %r3026, %r3023;
+ shl.b32 %r3028, %r7997, %r173;
+ add.s32 %r7997, %r3027, %r3028;
+
+$L__BB0_150:
+ mov.b32 %r7931, %f5405;
+ and.b32 %r3029, %r7931, -2147483648;
+ shr.u32 %r3030, %r7997, 30;
+ shl.b32 %r3031, %r7996, 2;
+ or.b32 %r3032, %r3030, %r3031;
+ shr.u32 %r3033, %r3032, 31;
+ shr.u32 %r3034, %r7996, 30;
+ add.s32 %r3035, %r3033, %r3034;
+ neg.s32 %r3036, %r3035;
+ setp.eq.s32 %p149, %r3029, 0;
+ selp.b32 %r7998, %r3035, %r3036, %p149;
+ setp.ne.s32 %p150, %r3033, 0;
+ xor.b32 %r3037, %r3029, -2147483648;
+ selp.b32 %r3038, %r3037, %r3029, %p150;
+ selp.b32 %r3039, -1, 0, %p150;
+ xor.b32 %r3040, %r3032, %r3039;
+ shl.b32 %r3041, %r7997, 2;
+ xor.b32 %r3042, %r3041, %r3039;
+ cvt.u64.u32 %rd730, %r3040;
+ cvt.u64.u32 %rd731, %r3042;
+ bfi.b64 %rd732, %rd730, %rd731, 32, 32;
+ cvt.rn.f64.s64 %fd15, %rd732;
+ mul.f64 %fd16, %fd15, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2475, %fd16;
+ setp.eq.s32 %p151, %r3038, 0;
+ neg.f32 %f2476, %f2475;
+ selp.f32 %f5289, %f2475, %f2476, %p151;
+
+$L__BB0_152:
+ add.s32 %r180, %r7998, 1;
+ and.b32 %r181, %r180, 1;
+ setp.eq.s32 %p152, %r181, 0;
+ selp.f32 %f157, %f5289, 0f3F800000, %p152;
+ mul.rn.f32 %f158, %f5289, %f5289;
+ mov.f32 %f5290, 0fB94D4153;
+ @%p152 bra $L__BB0_154;
+
+ mov.f32 %f2479, 0fBAB607ED;
+ mov.f32 %f2480, 0f37CBAC00;
+ fma.rn.f32 %f5290, %f2480, %f158, %f2479;
+
+$L__BB0_154:
+ selp.f32 %f2481, 0f3C0885E4, 0f3D2AAABB, %p152;
+ fma.rn.f32 %f2482, %f5290, %f158, %f2481;
+ selp.f32 %f2483, 0fBE2AAAA8, 0fBEFFFFFF, %p152;
+ fma.rn.f32 %f2484, %f2482, %f158, %f2483;
+ mov.f32 %f2485, 0f00000000;
+ fma.rn.f32 %f2486, %f158, %f157, %f2485;
+ fma.rn.f32 %f5283, %f2484, %f2486, %f157;
+ and.b32 %r3044, %r180, 2;
+ setp.eq.s32 %p154, %r3044, 0;
+ @%p154 bra $L__BB0_156;
+
+ mov.f32 %f2488, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f2488, %f2485;
+
+$L__BB0_156:
+ shl.b32 %r7937, %r12, 5;
+ mov.u32 %r7936, -32;
+ sub.s32 %r7935, %r7936, %r7937;
+ setp.lt.s32 %p1784, %r11, %r7935;
+ shl.b32 %r7934, %r12, 5;
+ mov.u32 %r7933, -32;
+ sub.s32 %r7932, %r7933, %r7934;
+ setp.ge.s32 %p1783, %r11, %r7932;
+ selp.f32 %f165, %f5283, %f5284, %p1784;
+ selp.f32 %f166, %f5281, %f5282, %p1784;
+ @%p1783 bra $L__BB0_158;
+
+ add.f32 %f5397, %f166, %f165;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_158:
+ add.s32 %r7777, %r12, 2;
+ setp.gt.s32 %p1763, %r7777, 14;
+ @%p1763 bra $L__BB0_187;
+
+ shl.b32 %r3046, %r12, 5;
+ neg.s32 %r182, %r3046;
+ setp.ge.s32 %p158, %r11, %r182;
+ @%p158 bra $L__BB0_172;
+
+ mul.f32 %f2491, %f5412, 0f3F22F983;
+ cvt.rni.s32.f32 %r8002, %f2491;
+ cvt.rn.f32.s32 %f2492, %r8002;
+ mov.f32 %f2493, 0fBFC90FDA;
+ fma.rn.f32 %f2494, %f2492, %f2493, %f5412;
+ mov.f32 %f2495, 0fB3A22168;
+ fma.rn.f32 %f2496, %f2492, %f2495, %f2494;
+ mov.f32 %f2497, 0fA7C234C5;
+ fma.rn.f32 %f5298, %f2492, %f2497, %f2496;
+ abs.f32 %f174, %f5412;
+ setp.ltu.f32 %p159, %f174, 0f47CE4780;
+ @%p159 bra $L__BB0_168;
+
+ setp.eq.f32 %p160, %f174, 0f7F800000;
+ @%p160 bra $L__BB0_167;
+ bra.uni $L__BB0_162;
+
+$L__BB0_167:
+ mov.f32 %f2500, 0f00000000;
+ mul.rn.f32 %f5298, %f5412, %f2500;
+ mov.u32 %r8002, 0;
+ bra.uni $L__BB0_168;
+
+$L__BB0_162:
+ mov.b32 %r184, %f5412;
+ shr.u32 %r3048, %r184, 23;
+ and.b32 %r3049, %r3048, 255;
+ shl.b32 %r3050, %r184, 8;
+ or.b32 %r186, %r3050, -2147483648;
+ mov.u64 %rd2521, 0;
+ mov.u32 %r7999, 0;
+ mov.u64 %rd736, __cudart_i2opi_f;
+ mov.u64 %rd2522, %rd2521;
+
+$L__BB0_163:
+ .pragma "nounroll";
+ shl.b64 %rd735, %rd2521, 2;
+ add.s64 %rd737, %rd736, %rd735;
+ ld.global.nc.u32 %r3051, [%rd737];
+ mad.wide.u32 %rd738, %r3051, %r186, %rd2522;
+ shr.u64 %rd2522, %rd738, 32;
+ add.s64 %rd739, %rd1, %rd735;
+ st.local.u32 [%rd739], %rd738;
+ add.s32 %r7999, %r7999, 1;
+ cvt.s64.s32 %rd2521, %r7999;
+ setp.ne.s32 %p161, %r7999, 6;
+ @%p161 bra $L__BB0_163;
+
+ add.s32 %r7961, %r3049, -128;
+ mov.b32 %r7960, %f5412;
+ shr.u32 %r7959, %r7960, 23;
+ and.b32 %r7958, %r7959, 255;
+ add.s32 %r7957, %r7958, -128;
+ shr.u32 %r7956, %r7957, 5;
+ st.local.u32 [%rd4], %rd2522;
+ mov.u32 %r3052, 4;
+ sub.s32 %r190, %r3052, %r7956;
+ mov.u32 %r3053, 6;
+ sub.s32 %r3054, %r3053, %r7956;
+ mul.wide.s32 %rd740, %r3054, 4;
+ add.s64 %rd741, %rd1, %rd740;
+ ld.local.u32 %r8000, [%rd741];
+ ld.local.u32 %r8001, [%rd741+-4];
+ and.b32 %r193, %r7957, 31;
+ setp.eq.s32 %p162, %r193, 0;
+ @%p162 bra $L__BB0_166;
+
+ mov.u32 %r3055, 32;
+ sub.s32 %r3056, %r3055, %r193;
+ shr.u32 %r3057, %r8001, %r3056;
+ shl.b32 %r3058, %r8000, %r193;
+ add.s32 %r8000, %r3057, %r3058;
+ mul.wide.s32 %rd742, %r190, 4;
+ add.s64 %rd743, %rd1, %rd742;
+ ld.local.u32 %r3059, [%rd743];
+ shr.u32 %r3060, %r3059, %r3056;
+ shl.b32 %r3061, %r8001, %r193;
+ add.s32 %r8001, %r3060, %r3061;
+
+$L__BB0_166:
+ mov.b32 %r7962, %f5412;
+ and.b32 %r3062, %r7962, -2147483648;
+ shr.u32 %r3063, %r8001, 30;
+ shl.b32 %r3064, %r8000, 2;
+ or.b32 %r3065, %r3063, %r3064;
+ shr.u32 %r3066, %r3065, 31;
+ shr.u32 %r3067, %r8000, 30;
+ add.s32 %r3068, %r3066, %r3067;
+ neg.s32 %r3069, %r3068;
+ setp.eq.s32 %p163, %r3062, 0;
+ selp.b32 %r8002, %r3068, %r3069, %p163;
+ setp.ne.s32 %p164, %r3066, 0;
+ xor.b32 %r3070, %r3062, -2147483648;
+ selp.b32 %r3071, %r3070, %r3062, %p164;
+ selp.b32 %r3072, -1, 0, %p164;
+ xor.b32 %r3073, %r3065, %r3072;
+ shl.b32 %r3074, %r8001, 2;
+ xor.b32 %r3075, %r3074, %r3072;
+ cvt.u64.u32 %rd744, %r3073;
+ cvt.u64.u32 %rd745, %r3075;
+ bfi.b64 %rd746, %rd744, %rd745, 32, 32;
+ cvt.rn.f64.s64 %fd17, %rd746;
+ mul.f64 %fd18, %fd17, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2498, %fd18;
+ setp.eq.s32 %p165, %r3071, 0;
+ neg.f32 %f2499, %f2498;
+ selp.f32 %f5298, %f2498, %f2499, %p165;
+
+$L__BB0_168:
+ and.b32 %r200, %r8002, 1;
+ setp.eq.s32 %p166, %r200, 0;
+ selp.f32 %f178, %f5298, 0f3F800000, %p166;
+ mul.rn.f32 %f179, %f5298, %f5298;
+ mov.f32 %f5299, 0fB94D4153;
+ @%p166 bra $L__BB0_170;
+
+ mov.f32 %f2502, 0fBAB607ED;
+ mov.f32 %f2503, 0f37CBAC00;
+ fma.rn.f32 %f5299, %f2503, %f179, %f2502;
+
+$L__BB0_170:
+ selp.f32 %f2504, 0f3C0885E4, 0f3D2AAABB, %p166;
+ fma.rn.f32 %f2505, %f5299, %f179, %f2504;
+ selp.f32 %f2506, 0fBE2AAAA8, 0fBEFFFFFF, %p166;
+ fma.rn.f32 %f2507, %f2505, %f179, %f2506;
+ mov.f32 %f2508, 0f00000000;
+ fma.rn.f32 %f2509, %f179, %f178, %f2508;
+ fma.rn.f32 %f5281, %f2507, %f2509, %f178;
+ and.b32 %r3077, %r8002, 2;
+ setp.eq.s32 %p168, %r3077, 0;
+ @%p168 bra $L__BB0_172;
+
+ mov.f32 %f2511, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f2511, %f2508;
+
+$L__BB0_172:
+ shl.b32 %r7964, %r12, 5;
+ neg.s32 %r7963, %r7964;
+ setp.lt.s32 %p4, %r11, %r7963;
+ @%p158 bra $L__BB0_185;
+
+ mul.f32 %f2512, %f5404, 0f3F22F983;
+ cvt.rni.s32.f32 %r8006, %f2512;
+ cvt.rn.f32.s32 %f2513, %r8006;
+ mov.f32 %f2514, 0fBFC90FDA;
+ fma.rn.f32 %f2515, %f2513, %f2514, %f5404;
+ mov.f32 %f2516, 0fB3A22168;
+ fma.rn.f32 %f2517, %f2513, %f2516, %f2515;
+ mov.f32 %f2518, 0fA7C234C5;
+ fma.rn.f32 %f5302, %f2513, %f2518, %f2517;
+ abs.f32 %f187, %f5404;
+ setp.ltu.f32 %p170, %f187, 0f47CE4780;
+ @%p170 bra $L__BB0_181;
+
+ setp.eq.f32 %p171, %f187, 0f7F800000;
+ @%p171 bra $L__BB0_180;
+ bra.uni $L__BB0_175;
+
+$L__BB0_180:
+ mov.f32 %f2521, 0f00000000;
+ mul.rn.f32 %f5302, %f5404, %f2521;
+ mov.u32 %r8006, 0;
+ bra.uni $L__BB0_181;
+
+$L__BB0_175:
+ mov.b32 %r202, %f5404;
+ shr.u32 %r3079, %r202, 23;
+ and.b32 %r3080, %r3079, 255;
+ add.s32 %r203, %r3080, -128;
+ shl.b32 %r3081, %r202, 8;
+ or.b32 %r204, %r3081, -2147483648;
+ shr.u32 %r205, %r203, 5;
+ mov.u64 %rd2523, 0;
+ mov.u32 %r8003, 0;
+ mov.u64 %rd750, __cudart_i2opi_f;
+ mov.u64 %rd2524, %rd2523;
+
+$L__BB0_176:
+ .pragma "nounroll";
+ shl.b64 %rd749, %rd2523, 2;
+ add.s64 %rd751, %rd750, %rd749;
+ ld.global.nc.u32 %r3082, [%rd751];
+ mad.wide.u32 %rd752, %r3082, %r204, %rd2524;
+ shr.u64 %rd2524, %rd752, 32;
+ add.s64 %rd753, %rd1, %rd749;
+ st.local.u32 [%rd753], %rd752;
+ add.s32 %r8003, %r8003, 1;
+ cvt.s64.s32 %rd2523, %r8003;
+ setp.ne.s32 %p172, %r8003, 6;
+ @%p172 bra $L__BB0_176;
+
+ st.local.u32 [%rd4], %rd2524;
+ mov.u32 %r3083, 4;
+ sub.s32 %r208, %r3083, %r205;
+ mov.u32 %r3084, 6;
+ sub.s32 %r3085, %r3084, %r205;
+ mul.wide.s32 %rd754, %r3085, 4;
+ add.s64 %rd755, %rd1, %rd754;
+ ld.local.u32 %r8004, [%rd755];
+ ld.local.u32 %r8005, [%rd755+-4];
+ and.b32 %r211, %r203, 31;
+ setp.eq.s32 %p173, %r211, 0;
+ @%p173 bra $L__BB0_179;
+
+ mov.u32 %r3086, 32;
+ sub.s32 %r3087, %r3086, %r211;
+ shr.u32 %r3088, %r8005, %r3087;
+ shl.b32 %r3089, %r8004, %r211;
+ add.s32 %r8004, %r3088, %r3089;
+ mul.wide.s32 %rd756, %r208, 4;
+ add.s64 %rd757, %rd1, %rd756;
+ ld.local.u32 %r3090, [%rd757];
+ shr.u32 %r3091, %r3090, %r3087;
+ shl.b32 %r3092, %r8005, %r211;
+ add.s32 %r8005, %r3091, %r3092;
+
+$L__BB0_179:
+ and.b32 %r3093, %r202, -2147483648;
+ shr.u32 %r3094, %r8005, 30;
+ shl.b32 %r3095, %r8004, 2;
+ or.b32 %r3096, %r3094, %r3095;
+ shr.u32 %r3097, %r3096, 31;
+ shr.u32 %r3098, %r8004, 30;
+ add.s32 %r3099, %r3097, %r3098;
+ neg.s32 %r3100, %r3099;
+ setp.eq.s32 %p174, %r3093, 0;
+ selp.b32 %r8006, %r3099, %r3100, %p174;
+ setp.ne.s32 %p175, %r3097, 0;
+ xor.b32 %r3101, %r3093, -2147483648;
+ selp.b32 %r3102, %r3101, %r3093, %p175;
+ selp.b32 %r3103, -1, 0, %p175;
+ xor.b32 %r3104, %r3096, %r3103;
+ shl.b32 %r3105, %r8005, 2;
+ xor.b32 %r3106, %r3105, %r3103;
+ cvt.u64.u32 %rd758, %r3104;
+ cvt.u64.u32 %rd759, %r3106;
+ bfi.b64 %rd760, %rd758, %rd759, 32, 32;
+ cvt.rn.f64.s64 %fd19, %rd760;
+ mul.f64 %fd20, %fd19, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2519, %fd20;
+ setp.eq.s32 %p176, %r3102, 0;
+ neg.f32 %f2520, %f2519;
+ selp.f32 %f5302, %f2519, %f2520, %p176;
+
+$L__BB0_181:
+ add.s32 %r218, %r8006, 1;
+ and.b32 %r219, %r218, 1;
+ setp.eq.s32 %p177, %r219, 0;
+ selp.f32 %f191, %f5302, 0f3F800000, %p177;
+ mul.rn.f32 %f192, %f5302, %f5302;
+ mov.f32 %f5303, 0fB94D4153;
+ @%p177 bra $L__BB0_183;
+
+ mov.f32 %f2523, 0fBAB607ED;
+ mov.f32 %f2524, 0f37CBAC00;
+ fma.rn.f32 %f5303, %f2524, %f192, %f2523;
+
+$L__BB0_183:
+ selp.f32 %f2525, 0f3C0885E4, 0f3D2AAABB, %p177;
+ fma.rn.f32 %f2526, %f5303, %f192, %f2525;
+ selp.f32 %f2527, 0fBE2AAAA8, 0fBEFFFFFF, %p177;
+ fma.rn.f32 %f2528, %f2526, %f192, %f2527;
+ mov.f32 %f2529, 0f00000000;
+ fma.rn.f32 %f2530, %f192, %f191, %f2529;
+ fma.rn.f32 %f5283, %f2528, %f2530, %f191;
+ and.b32 %r3108, %r218, 2;
+ setp.eq.s32 %p179, %r3108, 0;
+ @%p179 bra $L__BB0_185;
+
+ mov.f32 %f2532, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f2532, %f2529;
+
+$L__BB0_185:
+ selp.f32 %f199, %f5283, %f5284, %p4;
+ selp.f32 %f200, %f5281, %f5282, %p4;
+ @%p158 bra $L__BB0_187;
+
+ add.f32 %f5396, %f200, %f199;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_187:
+ add.s32 %r7778, %r12, 2;
+ setp.gt.s32 %p1764, %r7778, 14;
+ @%p1764 bra $L__BB0_216;
+
+ shl.b32 %r3110, %r12, 5;
+ mov.u32 %r3111, -32;
+ sub.s32 %r220, %r3111, %r3110;
+ setp.ge.s32 %p183, %r11, %r220;
+ @%p183 bra $L__BB0_201;
+
+ mul.f32 %f2535, %f5411, 0f3F22F983;
+ cvt.rni.s32.f32 %r8010, %f2535;
+ cvt.rn.f32.s32 %f2536, %r8010;
+ mov.f32 %f2537, 0fBFC90FDA;
+ fma.rn.f32 %f2538, %f2536, %f2537, %f5411;
+ mov.f32 %f2539, 0fB3A22168;
+ fma.rn.f32 %f2540, %f2536, %f2539, %f2538;
+ mov.f32 %f2541, 0fA7C234C5;
+ fma.rn.f32 %f5311, %f2536, %f2541, %f2540;
+ abs.f32 %f208, %f5411;
+ setp.ltu.f32 %p184, %f208, 0f47CE4780;
+ @%p184 bra $L__BB0_197;
+
+ setp.eq.f32 %p185, %f208, 0f7F800000;
+ @%p185 bra $L__BB0_196;
+ bra.uni $L__BB0_191;
+
+$L__BB0_196:
+ mov.f32 %f2544, 0f00000000;
+ mul.rn.f32 %f5311, %f5411, %f2544;
+ mov.u32 %r8010, 0;
+ bra.uni $L__BB0_197;
+
+$L__BB0_191:
+ mov.b32 %r222, %f5411;
+ shr.u32 %r3113, %r222, 23;
+ and.b32 %r3114, %r3113, 255;
+ add.s32 %r223, %r3114, -128;
+ shl.b32 %r3115, %r222, 8;
+ or.b32 %r224, %r3115, -2147483648;
+ shr.u32 %r225, %r223, 5;
+ mov.u64 %rd2525, 0;
+ mov.u32 %r8007, 0;
+ mov.u64 %rd764, __cudart_i2opi_f;
+ mov.u64 %rd2526, %rd2525;
+
+$L__BB0_192:
+ .pragma "nounroll";
+ shl.b64 %rd763, %rd2525, 2;
+ add.s64 %rd765, %rd764, %rd763;
+ ld.global.nc.u32 %r3116, [%rd765];
+ mad.wide.u32 %rd766, %r3116, %r224, %rd2526;
+ shr.u64 %rd2526, %rd766, 32;
+ add.s64 %rd767, %rd1, %rd763;
+ st.local.u32 [%rd767], %rd766;
+ add.s32 %r8007, %r8007, 1;
+ cvt.s64.s32 %rd2525, %r8007;
+ setp.ne.s32 %p186, %r8007, 6;
+ @%p186 bra $L__BB0_192;
+
+ st.local.u32 [%rd4], %rd2526;
+ mov.u32 %r3117, 4;
+ sub.s32 %r228, %r3117, %r225;
+ mov.u32 %r3118, 6;
+ sub.s32 %r3119, %r3118, %r225;
+ mul.wide.s32 %rd768, %r3119, 4;
+ add.s64 %rd769, %rd1, %rd768;
+ ld.local.u32 %r8008, [%rd769];
+ ld.local.u32 %r8009, [%rd769+-4];
+ and.b32 %r231, %r223, 31;
+ setp.eq.s32 %p187, %r231, 0;
+ @%p187 bra $L__BB0_195;
+
+ mov.u32 %r3120, 32;
+ sub.s32 %r3121, %r3120, %r231;
+ shr.u32 %r3122, %r8009, %r3121;
+ shl.b32 %r3123, %r8008, %r231;
+ add.s32 %r8008, %r3122, %r3123;
+ mul.wide.s32 %rd770, %r228, 4;
+ add.s64 %rd771, %rd1, %rd770;
+ ld.local.u32 %r3124, [%rd771];
+ shr.u32 %r3125, %r3124, %r3121;
+ shl.b32 %r3126, %r8009, %r231;
+ add.s32 %r8009, %r3125, %r3126;
+
+$L__BB0_195:
+ and.b32 %r3127, %r222, -2147483648;
+ shr.u32 %r3128, %r8009, 30;
+ shl.b32 %r3129, %r8008, 2;
+ or.b32 %r3130, %r3128, %r3129;
+ shr.u32 %r3131, %r3130, 31;
+ shr.u32 %r3132, %r8008, 30;
+ add.s32 %r3133, %r3131, %r3132;
+ neg.s32 %r3134, %r3133;
+ setp.eq.s32 %p188, %r3127, 0;
+ selp.b32 %r8010, %r3133, %r3134, %p188;
+ setp.ne.s32 %p189, %r3131, 0;
+ xor.b32 %r3135, %r3127, -2147483648;
+ selp.b32 %r3136, %r3135, %r3127, %p189;
+ selp.b32 %r3137, -1, 0, %p189;
+ xor.b32 %r3138, %r3130, %r3137;
+ shl.b32 %r3139, %r8009, 2;
+ xor.b32 %r3140, %r3139, %r3137;
+ cvt.u64.u32 %rd772, %r3138;
+ cvt.u64.u32 %rd773, %r3140;
+ bfi.b64 %rd774, %rd772, %rd773, 32, 32;
+ cvt.rn.f64.s64 %fd21, %rd774;
+ mul.f64 %fd22, %fd21, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2542, %fd22;
+ setp.eq.s32 %p190, %r3136, 0;
+ neg.f32 %f2543, %f2542;
+ selp.f32 %f5311, %f2542, %f2543, %p190;
+
+$L__BB0_197:
+ and.b32 %r238, %r8010, 1;
+ setp.eq.s32 %p191, %r238, 0;
+ selp.f32 %f212, %f5311, 0f3F800000, %p191;
+ mul.rn.f32 %f213, %f5311, %f5311;
+ mov.f32 %f5312, 0fB94D4153;
+ @%p191 bra $L__BB0_199;
+
+ mov.f32 %f2546, 0fBAB607ED;
+ mov.f32 %f2547, 0f37CBAC00;
+ fma.rn.f32 %f5312, %f2547, %f213, %f2546;
+
+$L__BB0_199:
+ selp.f32 %f2548, 0f3C0885E4, 0f3D2AAABB, %p191;
+ fma.rn.f32 %f2549, %f5312, %f213, %f2548;
+ selp.f32 %f2550, 0fBE2AAAA8, 0fBEFFFFFF, %p191;
+ fma.rn.f32 %f2551, %f2549, %f213, %f2550;
+ mov.f32 %f2552, 0f00000000;
+ fma.rn.f32 %f2553, %f213, %f212, %f2552;
+ fma.rn.f32 %f5281, %f2551, %f2553, %f212;
+ and.b32 %r3142, %r8010, 2;
+ setp.eq.s32 %p193, %r3142, 0;
+ @%p193 bra $L__BB0_201;
+
+ mov.f32 %f2555, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f2555, %f2552;
+
+$L__BB0_201:
+ setp.lt.s32 %p5, %r11, %r220;
+ @%p183 bra $L__BB0_214;
+
+ mul.f32 %f2556, %f5403, 0f3F22F983;
+ cvt.rni.s32.f32 %r8014, %f2556;
+ cvt.rn.f32.s32 %f2557, %r8014;
+ mov.f32 %f2558, 0fBFC90FDA;
+ fma.rn.f32 %f2559, %f2557, %f2558, %f5403;
+ mov.f32 %f2560, 0fB3A22168;
+ fma.rn.f32 %f2561, %f2557, %f2560, %f2559;
+ mov.f32 %f2562, 0fA7C234C5;
+ fma.rn.f32 %f5315, %f2557, %f2562, %f2561;
+ abs.f32 %f221, %f5403;
+ setp.ltu.f32 %p195, %f221, 0f47CE4780;
+ @%p195 bra $L__BB0_210;
+
+ setp.eq.f32 %p196, %f221, 0f7F800000;
+ @%p196 bra $L__BB0_209;
+ bra.uni $L__BB0_204;
+
+$L__BB0_209:
+ mov.f32 %f2565, 0f00000000;
+ mul.rn.f32 %f5315, %f5403, %f2565;
+ mov.u32 %r8014, 0;
+ bra.uni $L__BB0_210;
+
+$L__BB0_204:
+ mov.b32 %r240, %f5403;
+ shr.u32 %r3144, %r240, 23;
+ and.b32 %r3145, %r3144, 255;
+ add.s32 %r241, %r3145, -128;
+ shl.b32 %r3146, %r240, 8;
+ or.b32 %r242, %r3146, -2147483648;
+ shr.u32 %r243, %r241, 5;
+ mov.u64 %rd2527, 0;
+ mov.u32 %r8011, 0;
+ mov.u64 %rd778, __cudart_i2opi_f;
+ mov.u64 %rd2528, %rd2527;
+
+$L__BB0_205:
+ .pragma "nounroll";
+ shl.b64 %rd777, %rd2527, 2;
+ add.s64 %rd779, %rd778, %rd777;
+ ld.global.nc.u32 %r3147, [%rd779];
+ mad.wide.u32 %rd780, %r3147, %r242, %rd2528;
+ shr.u64 %rd2528, %rd780, 32;
+ add.s64 %rd781, %rd1, %rd777;
+ st.local.u32 [%rd781], %rd780;
+ add.s32 %r8011, %r8011, 1;
+ cvt.s64.s32 %rd2527, %r8011;
+ setp.ne.s32 %p197, %r8011, 6;
+ @%p197 bra $L__BB0_205;
+
+ st.local.u32 [%rd4], %rd2528;
+ mov.u32 %r3148, 4;
+ sub.s32 %r246, %r3148, %r243;
+ mov.u32 %r3149, 6;
+ sub.s32 %r3150, %r3149, %r243;
+ mul.wide.s32 %rd782, %r3150, 4;
+ add.s64 %rd783, %rd1, %rd782;
+ ld.local.u32 %r8012, [%rd783];
+ ld.local.u32 %r8013, [%rd783+-4];
+ and.b32 %r249, %r241, 31;
+ setp.eq.s32 %p198, %r249, 0;
+ @%p198 bra $L__BB0_208;
+
+ mov.u32 %r3151, 32;
+ sub.s32 %r3152, %r3151, %r249;
+ shr.u32 %r3153, %r8013, %r3152;
+ shl.b32 %r3154, %r8012, %r249;
+ add.s32 %r8012, %r3153, %r3154;
+ mul.wide.s32 %rd784, %r246, 4;
+ add.s64 %rd785, %rd1, %rd784;
+ ld.local.u32 %r3155, [%rd785];
+ shr.u32 %r3156, %r3155, %r3152;
+ shl.b32 %r3157, %r8013, %r249;
+ add.s32 %r8013, %r3156, %r3157;
+
+$L__BB0_208:
+ and.b32 %r3158, %r240, -2147483648;
+ shr.u32 %r3159, %r8013, 30;
+ shl.b32 %r3160, %r8012, 2;
+ or.b32 %r3161, %r3159, %r3160;
+ shr.u32 %r3162, %r3161, 31;
+ shr.u32 %r3163, %r8012, 30;
+ add.s32 %r3164, %r3162, %r3163;
+ neg.s32 %r3165, %r3164;
+ setp.eq.s32 %p199, %r3158, 0;
+ selp.b32 %r8014, %r3164, %r3165, %p199;
+ setp.ne.s32 %p200, %r3162, 0;
+ xor.b32 %r3166, %r3158, -2147483648;
+ selp.b32 %r3167, %r3166, %r3158, %p200;
+ selp.b32 %r3168, -1, 0, %p200;
+ xor.b32 %r3169, %r3161, %r3168;
+ shl.b32 %r3170, %r8013, 2;
+ xor.b32 %r3171, %r3170, %r3168;
+ cvt.u64.u32 %rd786, %r3169;
+ cvt.u64.u32 %rd787, %r3171;
+ bfi.b64 %rd788, %rd786, %rd787, 32, 32;
+ cvt.rn.f64.s64 %fd23, %rd788;
+ mul.f64 %fd24, %fd23, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2563, %fd24;
+ setp.eq.s32 %p201, %r3167, 0;
+ neg.f32 %f2564, %f2563;
+ selp.f32 %f5315, %f2563, %f2564, %p201;
+
+$L__BB0_210:
+ add.s32 %r256, %r8014, 1;
+ and.b32 %r257, %r256, 1;
+ setp.eq.s32 %p202, %r257, 0;
+ selp.f32 %f225, %f5315, 0f3F800000, %p202;
+ mul.rn.f32 %f226, %f5315, %f5315;
+ mov.f32 %f5316, 0fB94D4153;
+ @%p202 bra $L__BB0_212;
+
+ mov.f32 %f2567, 0fBAB607ED;
+ mov.f32 %f2568, 0f37CBAC00;
+ fma.rn.f32 %f5316, %f2568, %f226, %f2567;
+
+$L__BB0_212:
+ selp.f32 %f2569, 0f3C0885E4, 0f3D2AAABB, %p202;
+ fma.rn.f32 %f2570, %f5316, %f226, %f2569;
+ selp.f32 %f2571, 0fBE2AAAA8, 0fBEFFFFFF, %p202;
+ fma.rn.f32 %f2572, %f2570, %f226, %f2571;
+ mov.f32 %f2573, 0f00000000;
+ fma.rn.f32 %f2574, %f226, %f225, %f2573;
+ fma.rn.f32 %f5283, %f2572, %f2574, %f225;
+ and.b32 %r3173, %r256, 2;
+ setp.eq.s32 %p204, %r3173, 0;
+ @%p204 bra $L__BB0_214;
+
+ mov.f32 %f2576, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f2576, %f2573;
+
+$L__BB0_214:
+ selp.f32 %f233, %f5283, %f5284, %p5;
+ selp.f32 %f234, %f5281, %f5282, %p5;
+ @%p183 bra $L__BB0_216;
+
+ add.f32 %f5395, %f234, %f233;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_216:
+ add.s32 %r7779, %r12, 3;
+ setp.gt.s32 %p1765, %r7779, 14;
+ @%p1765 bra $L__BB0_245;
+
+ shl.b32 %r3175, %r12, 5;
+ neg.s32 %r258, %r3175;
+ setp.ge.s32 %p208, %r11, %r258;
+ @%p208 bra $L__BB0_230;
+
+ mul.f32 %f2579, %f5410, 0f3F22F983;
+ cvt.rni.s32.f32 %r8018, %f2579;
+ cvt.rn.f32.s32 %f2580, %r8018;
+ mov.f32 %f2581, 0fBFC90FDA;
+ fma.rn.f32 %f2582, %f2580, %f2581, %f5410;
+ mov.f32 %f2583, 0fB3A22168;
+ fma.rn.f32 %f2584, %f2580, %f2583, %f2582;
+ mov.f32 %f2585, 0fA7C234C5;
+ fma.rn.f32 %f5324, %f2580, %f2585, %f2584;
+ abs.f32 %f242, %f5410;
+ setp.ltu.f32 %p209, %f242, 0f47CE4780;
+ @%p209 bra $L__BB0_226;
+
+ setp.eq.f32 %p210, %f242, 0f7F800000;
+ @%p210 bra $L__BB0_225;
+ bra.uni $L__BB0_220;
+
+$L__BB0_225:
+ mov.f32 %f2588, 0f00000000;
+ mul.rn.f32 %f5324, %f5410, %f2588;
+ mov.u32 %r8018, 0;
+ bra.uni $L__BB0_226;
+
+$L__BB0_220:
+ mov.b32 %r260, %f5410;
+ shr.u32 %r3177, %r260, 23;
+ and.b32 %r3178, %r3177, 255;
+ add.s32 %r261, %r3178, -128;
+ shl.b32 %r3179, %r260, 8;
+ or.b32 %r262, %r3179, -2147483648;
+ shr.u32 %r263, %r261, 5;
+ mov.u64 %rd2529, 0;
+ mov.u32 %r8015, 0;
+ mov.u64 %rd792, __cudart_i2opi_f;
+ mov.u64 %rd2530, %rd2529;
+
+$L__BB0_221:
+ .pragma "nounroll";
+ shl.b64 %rd791, %rd2529, 2;
+ add.s64 %rd793, %rd792, %rd791;
+ ld.global.nc.u32 %r3180, [%rd793];
+ mad.wide.u32 %rd794, %r3180, %r262, %rd2530;
+ shr.u64 %rd2530, %rd794, 32;
+ add.s64 %rd795, %rd1, %rd791;
+ st.local.u32 [%rd795], %rd794;
+ add.s32 %r8015, %r8015, 1;
+ cvt.s64.s32 %rd2529, %r8015;
+ setp.ne.s32 %p211, %r8015, 6;
+ @%p211 bra $L__BB0_221;
+
+ st.local.u32 [%rd4], %rd2530;
+ mov.u32 %r3181, 4;
+ sub.s32 %r266, %r3181, %r263;
+ mov.u32 %r3182, 6;
+ sub.s32 %r3183, %r3182, %r263;
+ mul.wide.s32 %rd796, %r3183, 4;
+ add.s64 %rd797, %rd1, %rd796;
+ ld.local.u32 %r8016, [%rd797];
+ ld.local.u32 %r8017, [%rd797+-4];
+ and.b32 %r269, %r261, 31;
+ setp.eq.s32 %p212, %r269, 0;
+ @%p212 bra $L__BB0_224;
+
+ mov.u32 %r3184, 32;
+ sub.s32 %r3185, %r3184, %r269;
+ shr.u32 %r3186, %r8017, %r3185;
+ shl.b32 %r3187, %r8016, %r269;
+ add.s32 %r8016, %r3186, %r3187;
+ mul.wide.s32 %rd798, %r266, 4;
+ add.s64 %rd799, %rd1, %rd798;
+ ld.local.u32 %r3188, [%rd799];
+ shr.u32 %r3189, %r3188, %r3185;
+ shl.b32 %r3190, %r8017, %r269;
+ add.s32 %r8017, %r3189, %r3190;
+
+$L__BB0_224:
+ and.b32 %r3191, %r260, -2147483648;
+ shr.u32 %r3192, %r8017, 30;
+ shl.b32 %r3193, %r8016, 2;
+ or.b32 %r3194, %r3192, %r3193;
+ shr.u32 %r3195, %r3194, 31;
+ shr.u32 %r3196, %r8016, 30;
+ add.s32 %r3197, %r3195, %r3196;
+ neg.s32 %r3198, %r3197;
+ setp.eq.s32 %p213, %r3191, 0;
+ selp.b32 %r8018, %r3197, %r3198, %p213;
+ setp.ne.s32 %p214, %r3195, 0;
+ xor.b32 %r3199, %r3191, -2147483648;
+ selp.b32 %r3200, %r3199, %r3191, %p214;
+ selp.b32 %r3201, -1, 0, %p214;
+ xor.b32 %r3202, %r3194, %r3201;
+ shl.b32 %r3203, %r8017, 2;
+ xor.b32 %r3204, %r3203, %r3201;
+ cvt.u64.u32 %rd800, %r3202;
+ cvt.u64.u32 %rd801, %r3204;
+ bfi.b64 %rd802, %rd800, %rd801, 32, 32;
+ cvt.rn.f64.s64 %fd25, %rd802;
+ mul.f64 %fd26, %fd25, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2586, %fd26;
+ setp.eq.s32 %p215, %r3200, 0;
+ neg.f32 %f2587, %f2586;
+ selp.f32 %f5324, %f2586, %f2587, %p215;
+
+$L__BB0_226:
+ and.b32 %r276, %r8018, 1;
+ setp.eq.s32 %p216, %r276, 0;
+ selp.f32 %f246, %f5324, 0f3F800000, %p216;
+ mul.rn.f32 %f247, %f5324, %f5324;
+ mov.f32 %f5325, 0fB94D4153;
+ @%p216 bra $L__BB0_228;
+
+ mov.f32 %f2590, 0fBAB607ED;
+ mov.f32 %f2591, 0f37CBAC00;
+ fma.rn.f32 %f5325, %f2591, %f247, %f2590;
+
+$L__BB0_228:
+ selp.f32 %f2592, 0f3C0885E4, 0f3D2AAABB, %p216;
+ fma.rn.f32 %f2593, %f5325, %f247, %f2592;
+ selp.f32 %f2594, 0fBE2AAAA8, 0fBEFFFFFF, %p216;
+ fma.rn.f32 %f2595, %f2593, %f247, %f2594;
+ mov.f32 %f2596, 0f00000000;
+ fma.rn.f32 %f2597, %f247, %f246, %f2596;
+ fma.rn.f32 %f5281, %f2595, %f2597, %f246;
+ and.b32 %r3206, %r8018, 2;
+ setp.eq.s32 %p218, %r3206, 0;
+ @%p218 bra $L__BB0_230;
+
+ mov.f32 %f2599, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f2599, %f2596;
+
+$L__BB0_230:
+ setp.lt.s32 %p6, %r11, %r258;
+ @%p208 bra $L__BB0_243;
+
+ mul.f32 %f2600, %f5402, 0f3F22F983;
+ cvt.rni.s32.f32 %r8022, %f2600;
+ cvt.rn.f32.s32 %f2601, %r8022;
+ mov.f32 %f2602, 0fBFC90FDA;
+ fma.rn.f32 %f2603, %f2601, %f2602, %f5402;
+ mov.f32 %f2604, 0fB3A22168;
+ fma.rn.f32 %f2605, %f2601, %f2604, %f2603;
+ mov.f32 %f2606, 0fA7C234C5;
+ fma.rn.f32 %f5328, %f2601, %f2606, %f2605;
+ abs.f32 %f255, %f5402;
+ setp.ltu.f32 %p220, %f255, 0f47CE4780;
+ @%p220 bra $L__BB0_239;
+
+ setp.eq.f32 %p221, %f255, 0f7F800000;
+ @%p221 bra $L__BB0_238;
+ bra.uni $L__BB0_233;
+
+$L__BB0_238:
+ mov.f32 %f2609, 0f00000000;
+ mul.rn.f32 %f5328, %f5402, %f2609;
+ mov.u32 %r8022, 0;
+ bra.uni $L__BB0_239;
+
+$L__BB0_233:
+ mov.b32 %r278, %f5402;
+ shr.u32 %r3208, %r278, 23;
+ and.b32 %r3209, %r3208, 255;
+ add.s32 %r279, %r3209, -128;
+ shl.b32 %r3210, %r278, 8;
+ or.b32 %r280, %r3210, -2147483648;
+ shr.u32 %r281, %r279, 5;
+ mov.u64 %rd2531, 0;
+ mov.u32 %r8019, 0;
+ mov.u64 %rd806, __cudart_i2opi_f;
+ mov.u64 %rd2532, %rd2531;
+
+$L__BB0_234:
+ .pragma "nounroll";
+ shl.b64 %rd805, %rd2531, 2;
+ add.s64 %rd807, %rd806, %rd805;
+ ld.global.nc.u32 %r3211, [%rd807];
+ mad.wide.u32 %rd808, %r3211, %r280, %rd2532;
+ shr.u64 %rd2532, %rd808, 32;
+ add.s64 %rd809, %rd1, %rd805;
+ st.local.u32 [%rd809], %rd808;
+ add.s32 %r8019, %r8019, 1;
+ cvt.s64.s32 %rd2531, %r8019;
+ setp.ne.s32 %p222, %r8019, 6;
+ @%p222 bra $L__BB0_234;
+
+ st.local.u32 [%rd4], %rd2532;
+ mov.u32 %r3212, 4;
+ sub.s32 %r284, %r3212, %r281;
+ mov.u32 %r3213, 6;
+ sub.s32 %r3214, %r3213, %r281;
+ mul.wide.s32 %rd810, %r3214, 4;
+ add.s64 %rd811, %rd1, %rd810;
+ ld.local.u32 %r8020, [%rd811];
+ ld.local.u32 %r8021, [%rd811+-4];
+ and.b32 %r287, %r279, 31;
+ setp.eq.s32 %p223, %r287, 0;
+ @%p223 bra $L__BB0_237;
+
+ mov.u32 %r3215, 32;
+ sub.s32 %r3216, %r3215, %r287;
+ shr.u32 %r3217, %r8021, %r3216;
+ shl.b32 %r3218, %r8020, %r287;
+ add.s32 %r8020, %r3217, %r3218;
+ mul.wide.s32 %rd812, %r284, 4;
+ add.s64 %rd813, %rd1, %rd812;
+ ld.local.u32 %r3219, [%rd813];
+ shr.u32 %r3220, %r3219, %r3216;
+ shl.b32 %r3221, %r8021, %r287;
+ add.s32 %r8021, %r3220, %r3221;
+
+$L__BB0_237:
+ and.b32 %r3222, %r278, -2147483648;
+ shr.u32 %r3223, %r8021, 30;
+ shl.b32 %r3224, %r8020, 2;
+ or.b32 %r3225, %r3223, %r3224;
+ shr.u32 %r3226, %r3225, 31;
+ shr.u32 %r3227, %r8020, 30;
+ add.s32 %r3228, %r3226, %r3227;
+ neg.s32 %r3229, %r3228;
+ setp.eq.s32 %p224, %r3222, 0;
+ selp.b32 %r8022, %r3228, %r3229, %p224;
+ setp.ne.s32 %p225, %r3226, 0;
+ xor.b32 %r3230, %r3222, -2147483648;
+ selp.b32 %r3231, %r3230, %r3222, %p225;
+ selp.b32 %r3232, -1, 0, %p225;
+ xor.b32 %r3233, %r3225, %r3232;
+ shl.b32 %r3234, %r8021, 2;
+ xor.b32 %r3235, %r3234, %r3232;
+ cvt.u64.u32 %rd814, %r3233;
+ cvt.u64.u32 %rd815, %r3235;
+ bfi.b64 %rd816, %rd814, %rd815, 32, 32;
+ cvt.rn.f64.s64 %fd27, %rd816;
+ mul.f64 %fd28, %fd27, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2607, %fd28;
+ setp.eq.s32 %p226, %r3231, 0;
+ neg.f32 %f2608, %f2607;
+ selp.f32 %f5328, %f2607, %f2608, %p226;
+
+$L__BB0_239:
+ add.s32 %r294, %r8022, 1;
+ and.b32 %r295, %r294, 1;
+ setp.eq.s32 %p227, %r295, 0;
+ selp.f32 %f259, %f5328, 0f3F800000, %p227;
+ mul.rn.f32 %f260, %f5328, %f5328;
+ mov.f32 %f5329, 0fB94D4153;
+ @%p227 bra $L__BB0_241;
+
+ mov.f32 %f2611, 0fBAB607ED;
+ mov.f32 %f2612, 0f37CBAC00;
+ fma.rn.f32 %f5329, %f2612, %f260, %f2611;
+
+$L__BB0_241:
+ selp.f32 %f2613, 0f3C0885E4, 0f3D2AAABB, %p227;
+ fma.rn.f32 %f2614, %f5329, %f260, %f2613;
+ selp.f32 %f2615, 0fBE2AAAA8, 0fBEFFFFFF, %p227;
+ fma.rn.f32 %f2616, %f2614, %f260, %f2615;
+ mov.f32 %f2617, 0f00000000;
+ fma.rn.f32 %f2618, %f260, %f259, %f2617;
+ fma.rn.f32 %f5283, %f2616, %f2618, %f259;
+ and.b32 %r3237, %r294, 2;
+ setp.eq.s32 %p229, %r3237, 0;
+ @%p229 bra $L__BB0_243;
+
+ mov.f32 %f2620, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f2620, %f2617;
+
+$L__BB0_243:
+ selp.f32 %f267, %f5283, %f5284, %p6;
+ selp.f32 %f268, %f5281, %f5282, %p6;
+ @%p208 bra $L__BB0_245;
+
+ add.f32 %f5394, %f268, %f267;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_245:
+ add.s32 %r7780, %r12, 3;
+ setp.gt.s32 %p1766, %r7780, 14;
+ @%p1766 bra $L__BB0_467;
+
+ shl.b32 %r3239, %r12, 5;
+ mov.u32 %r3240, -32;
+ sub.s32 %r296, %r3240, %r3239;
+ setp.ge.s32 %p233, %r11, %r296;
+ @%p233 bra $L__BB0_259;
+
+ mul.f32 %f2623, %f5409, 0f3F22F983;
+ cvt.rni.s32.f32 %r8026, %f2623;
+ cvt.rn.f32.s32 %f2624, %r8026;
+ mov.f32 %f2625, 0fBFC90FDA;
+ fma.rn.f32 %f2626, %f2624, %f2625, %f5409;
+ mov.f32 %f2627, 0fB3A22168;
+ fma.rn.f32 %f2628, %f2624, %f2627, %f2626;
+ mov.f32 %f2629, 0fA7C234C5;
+ fma.rn.f32 %f5337, %f2624, %f2629, %f2628;
+ abs.f32 %f276, %f5409;
+ setp.ltu.f32 %p234, %f276, 0f47CE4780;
+ @%p234 bra $L__BB0_255;
+
+ setp.eq.f32 %p235, %f276, 0f7F800000;
+ @%p235 bra $L__BB0_254;
+ bra.uni $L__BB0_249;
+
+$L__BB0_254:
+ mov.f32 %f2632, 0f00000000;
+ mul.rn.f32 %f5337, %f5409, %f2632;
+ mov.u32 %r8026, 0;
+ bra.uni $L__BB0_255;
+
+$L__BB0_249:
+ mov.b32 %r298, %f5409;
+ shr.u32 %r3242, %r298, 23;
+ and.b32 %r3243, %r3242, 255;
+ add.s32 %r299, %r3243, -128;
+ shl.b32 %r3244, %r298, 8;
+ or.b32 %r300, %r3244, -2147483648;
+ shr.u32 %r301, %r299, 5;
+ mov.u64 %rd2533, 0;
+ mov.u32 %r8023, 0;
+ mov.u64 %rd820, __cudart_i2opi_f;
+ mov.u64 %rd2534, %rd2533;
+
+$L__BB0_250:
+ .pragma "nounroll";
+ shl.b64 %rd819, %rd2533, 2;
+ add.s64 %rd821, %rd820, %rd819;
+ ld.global.nc.u32 %r3245, [%rd821];
+ mad.wide.u32 %rd822, %r3245, %r300, %rd2534;
+ shr.u64 %rd2534, %rd822, 32;
+ add.s64 %rd823, %rd1, %rd819;
+ st.local.u32 [%rd823], %rd822;
+ add.s32 %r8023, %r8023, 1;
+ cvt.s64.s32 %rd2533, %r8023;
+ setp.ne.s32 %p236, %r8023, 6;
+ @%p236 bra $L__BB0_250;
+
+ st.local.u32 [%rd4], %rd2534;
+ mov.u32 %r3246, 4;
+ sub.s32 %r304, %r3246, %r301;
+ mov.u32 %r3247, 6;
+ sub.s32 %r3248, %r3247, %r301;
+ mul.wide.s32 %rd824, %r3248, 4;
+ add.s64 %rd825, %rd1, %rd824;
+ ld.local.u32 %r8024, [%rd825];
+ ld.local.u32 %r8025, [%rd825+-4];
+ and.b32 %r307, %r299, 31;
+ setp.eq.s32 %p237, %r307, 0;
+ @%p237 bra $L__BB0_253;
+
+ mov.u32 %r3249, 32;
+ sub.s32 %r3250, %r3249, %r307;
+ shr.u32 %r3251, %r8025, %r3250;
+ shl.b32 %r3252, %r8024, %r307;
+ add.s32 %r8024, %r3251, %r3252;
+ mul.wide.s32 %rd826, %r304, 4;
+ add.s64 %rd827, %rd1, %rd826;
+ ld.local.u32 %r3253, [%rd827];
+ shr.u32 %r3254, %r3253, %r3250;
+ shl.b32 %r3255, %r8025, %r307;
+ add.s32 %r8025, %r3254, %r3255;
+
+$L__BB0_253:
+ and.b32 %r3256, %r298, -2147483648;
+ shr.u32 %r3257, %r8025, 30;
+ shl.b32 %r3258, %r8024, 2;
+ or.b32 %r3259, %r3257, %r3258;
+ shr.u32 %r3260, %r3259, 31;
+ shr.u32 %r3261, %r8024, 30;
+ add.s32 %r3262, %r3260, %r3261;
+ neg.s32 %r3263, %r3262;
+ setp.eq.s32 %p238, %r3256, 0;
+ selp.b32 %r8026, %r3262, %r3263, %p238;
+ setp.ne.s32 %p239, %r3260, 0;
+ xor.b32 %r3264, %r3256, -2147483648;
+ selp.b32 %r3265, %r3264, %r3256, %p239;
+ selp.b32 %r3266, -1, 0, %p239;
+ xor.b32 %r3267, %r3259, %r3266;
+ shl.b32 %r3268, %r8025, 2;
+ xor.b32 %r3269, %r3268, %r3266;
+ cvt.u64.u32 %rd828, %r3267;
+ cvt.u64.u32 %rd829, %r3269;
+ bfi.b64 %rd830, %rd828, %rd829, 32, 32;
+ cvt.rn.f64.s64 %fd29, %rd830;
+ mul.f64 %fd30, %fd29, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2630, %fd30;
+ setp.eq.s32 %p240, %r3265, 0;
+ neg.f32 %f2631, %f2630;
+ selp.f32 %f5337, %f2630, %f2631, %p240;
+
+$L__BB0_255:
+ and.b32 %r314, %r8026, 1;
+ setp.eq.s32 %p241, %r314, 0;
+ selp.f32 %f280, %f5337, 0f3F800000, %p241;
+ mul.rn.f32 %f281, %f5337, %f5337;
+ mov.f32 %f5338, 0fB94D4153;
+ @%p241 bra $L__BB0_257;
+
+ mov.f32 %f2634, 0fBAB607ED;
+ mov.f32 %f2635, 0f37CBAC00;
+ fma.rn.f32 %f5338, %f2635, %f281, %f2634;
+
+$L__BB0_257:
+ selp.f32 %f2636, 0f3C0885E4, 0f3D2AAABB, %p241;
+ fma.rn.f32 %f2637, %f5338, %f281, %f2636;
+ selp.f32 %f2638, 0fBE2AAAA8, 0fBEFFFFFF, %p241;
+ fma.rn.f32 %f2639, %f2637, %f281, %f2638;
+ mov.f32 %f2640, 0f00000000;
+ fma.rn.f32 %f2641, %f281, %f280, %f2640;
+ fma.rn.f32 %f5281, %f2639, %f2641, %f280;
+ and.b32 %r3271, %r8026, 2;
+ setp.eq.s32 %p243, %r3271, 0;
+ @%p243 bra $L__BB0_259;
+
+ mov.f32 %f2643, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f2643, %f2640;
+
+$L__BB0_259:
+ setp.lt.s32 %p7, %r11, %r296;
+ @%p233 bra $L__BB0_272;
+
+ mul.f32 %f2644, %f5401, 0f3F22F983;
+ cvt.rni.s32.f32 %r8030, %f2644;
+ cvt.rn.f32.s32 %f2645, %r8030;
+ mov.f32 %f2646, 0fBFC90FDA;
+ fma.rn.f32 %f2647, %f2645, %f2646, %f5401;
+ mov.f32 %f2648, 0fB3A22168;
+ fma.rn.f32 %f2649, %f2645, %f2648, %f2647;
+ mov.f32 %f2650, 0fA7C234C5;
+ fma.rn.f32 %f5341, %f2645, %f2650, %f2649;
+ abs.f32 %f289, %f5401;
+ setp.ltu.f32 %p245, %f289, 0f47CE4780;
+ @%p245 bra $L__BB0_268;
+
+ setp.eq.f32 %p246, %f289, 0f7F800000;
+ @%p246 bra $L__BB0_267;
+ bra.uni $L__BB0_262;
+
+$L__BB0_267:
+ mov.f32 %f2653, 0f00000000;
+ mul.rn.f32 %f5341, %f5401, %f2653;
+ mov.u32 %r8030, 0;
+ bra.uni $L__BB0_268;
+
+$L__BB0_262:
+ mov.b32 %r316, %f5401;
+ shr.u32 %r3273, %r316, 23;
+ and.b32 %r3274, %r3273, 255;
+ add.s32 %r317, %r3274, -128;
+ shl.b32 %r3275, %r316, 8;
+ or.b32 %r318, %r3275, -2147483648;
+ shr.u32 %r319, %r317, 5;
+ mov.u64 %rd2535, 0;
+ mov.u32 %r8027, 0;
+ mov.u64 %rd834, __cudart_i2opi_f;
+ mov.u64 %rd2536, %rd2535;
+
+$L__BB0_263:
+ .pragma "nounroll";
+ shl.b64 %rd833, %rd2535, 2;
+ add.s64 %rd835, %rd834, %rd833;
+ ld.global.nc.u32 %r3276, [%rd835];
+ mad.wide.u32 %rd836, %r3276, %r318, %rd2536;
+ shr.u64 %rd2536, %rd836, 32;
+ add.s64 %rd837, %rd1, %rd833;
+ st.local.u32 [%rd837], %rd836;
+ add.s32 %r8027, %r8027, 1;
+ cvt.s64.s32 %rd2535, %r8027;
+ setp.ne.s32 %p247, %r8027, 6;
+ @%p247 bra $L__BB0_263;
+
+ st.local.u32 [%rd4], %rd2536;
+ mov.u32 %r3277, 4;
+ sub.s32 %r322, %r3277, %r319;
+ mov.u32 %r3278, 6;
+ sub.s32 %r3279, %r3278, %r319;
+ mul.wide.s32 %rd838, %r3279, 4;
+ add.s64 %rd839, %rd1, %rd838;
+ ld.local.u32 %r8028, [%rd839];
+ ld.local.u32 %r8029, [%rd839+-4];
+ and.b32 %r325, %r317, 31;
+ setp.eq.s32 %p248, %r325, 0;
+ @%p248 bra $L__BB0_266;
+
+ mov.u32 %r3280, 32;
+ sub.s32 %r3281, %r3280, %r325;
+ shr.u32 %r3282, %r8029, %r3281;
+ shl.b32 %r3283, %r8028, %r325;
+ add.s32 %r8028, %r3282, %r3283;
+ mul.wide.s32 %rd840, %r322, 4;
+ add.s64 %rd841, %rd1, %rd840;
+ ld.local.u32 %r3284, [%rd841];
+ shr.u32 %r3285, %r3284, %r3281;
+ shl.b32 %r3286, %r8029, %r325;
+ add.s32 %r8029, %r3285, %r3286;
+
+$L__BB0_266:
+ and.b32 %r3287, %r316, -2147483648;
+ shr.u32 %r3288, %r8029, 30;
+ shl.b32 %r3289, %r8028, 2;
+ or.b32 %r3290, %r3288, %r3289;
+ shr.u32 %r3291, %r3290, 31;
+ shr.u32 %r3292, %r8028, 30;
+ add.s32 %r3293, %r3291, %r3292;
+ neg.s32 %r3294, %r3293;
+ setp.eq.s32 %p249, %r3287, 0;
+ selp.b32 %r8030, %r3293, %r3294, %p249;
+ setp.ne.s32 %p250, %r3291, 0;
+ xor.b32 %r3295, %r3287, -2147483648;
+ selp.b32 %r3296, %r3295, %r3287, %p250;
+ selp.b32 %r3297, -1, 0, %p250;
+ xor.b32 %r3298, %r3290, %r3297;
+ shl.b32 %r3299, %r8029, 2;
+ xor.b32 %r3300, %r3299, %r3297;
+ cvt.u64.u32 %rd842, %r3298;
+ cvt.u64.u32 %rd843, %r3300;
+ bfi.b64 %rd844, %rd842, %rd843, 32, 32;
+ cvt.rn.f64.s64 %fd31, %rd844;
+ mul.f64 %fd32, %fd31, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f2651, %fd32;
+ setp.eq.s32 %p251, %r3296, 0;
+ neg.f32 %f2652, %f2651;
+ selp.f32 %f5341, %f2651, %f2652, %p251;
+
+$L__BB0_268:
+ add.s32 %r332, %r8030, 1;
+ and.b32 %r333, %r332, 1;
+ setp.eq.s32 %p252, %r333, 0;
+ selp.f32 %f293, %f5341, 0f3F800000, %p252;
+ mul.rn.f32 %f294, %f5341, %f5341;
+ mov.f32 %f5342, 0fB94D4153;
+ @%p252 bra $L__BB0_270;
+
+ mov.f32 %f2655, 0fBAB607ED;
+ mov.f32 %f2656, 0f37CBAC00;
+ fma.rn.f32 %f5342, %f2656, %f294, %f2655;
+
+$L__BB0_270:
+ selp.f32 %f2657, 0f3C0885E4, 0f3D2AAABB, %p252;
+ fma.rn.f32 %f2658, %f5342, %f294, %f2657;
+ selp.f32 %f2659, 0fBE2AAAA8, 0fBEFFFFFF, %p252;
+ fma.rn.f32 %f2660, %f2658, %f294, %f2659;
+ mov.f32 %f2661, 0f00000000;
+ fma.rn.f32 %f2662, %f294, %f293, %f2661;
+ fma.rn.f32 %f5283, %f2660, %f2662, %f293;
+ and.b32 %r3302, %r332, 2;
+ setp.eq.s32 %p254, %r3302, 0;
+ @%p254 bra $L__BB0_272;
+
+ mov.f32 %f2664, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f2664, %f2661;
+
+$L__BB0_272:
+ selp.f32 %f301, %f5283, %f5284, %p7;
+ selp.f32 %f302, %f5281, %f5282, %p7;
+ @%p233 bra $L__BB0_467;
+
+ add.f32 %f5393, %f302, %f301;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_467:
+ setp.lt.s32 %p417, %r12, 2;
+ and.pred %p419, %p33, %p417;
+ @%p419 bra $L__BB0_740;
+ bra.uni $L__BB0_468;
+
+$L__BB0_740:
+ mov.u32 %r7787, %ctaid.x;
+ shl.b32 %r4542, %r12, 5;
+ add.s32 %r4543, %r4542, %r1;
+ mul.hi.s32 %r4544, %r4543, -1840700269;
+ add.s32 %r4545, %r4544, %r4543;
+ shr.u32 %r4546, %r4545, 31;
+ shr.s32 %r4547, %r4545, 2;
+ add.s32 %r4548, %r4547, %r4546;
+ mul.lo.s32 %r4549, %r4548, %r2589;
+ shl.b32 %r4550, %r2587, 2;
+ add.s32 %r4551, %r14, %r4550;
+ add.s32 %r4552, %r4551, %r4549;
+ mul.lo.s32 %r4553, %r4548, 7;
+ sub.s32 %r4554, %r4543, %r4553;
+ mul.lo.s32 %r4555, %r4554, %r2590;
+ add.s32 %r4556, %r4552, %r4555;
+ mul.wide.s32 %rd1349, %r4556, 4;
+ add.s64 %rd1350, %rd3, %rd1349;
+ ld.global.f32 %f855, [%rd1350];
+ add.s32 %r4557, %r4543, 32;
+ mul.hi.s32 %r4558, %r4557, -1840700269;
+ add.s32 %r4559, %r4558, %r4557;
+ shr.u32 %r4560, %r4559, 31;
+ shr.s32 %r4561, %r4559, 2;
+ add.s32 %r4562, %r4561, %r4560;
+ mul.lo.s32 %r4563, %r4562, %r2589;
+ add.s32 %r4564, %r4551, %r4563;
+ mul.lo.s32 %r4565, %r4562, 7;
+ sub.s32 %r4566, %r4557, %r4565;
+ mul.lo.s32 %r4567, %r4566, %r2590;
+ add.s32 %r4568, %r4564, %r4567;
+ mul.wide.s32 %rd1351, %r4568, 4;
+ add.s64 %rd1352, %rd3, %rd1351;
+ ld.global.f32 %f856, [%rd1352];
+ add.s32 %r4569, %r2586, %r14;
+ add.s32 %r4570, %r4569, %r4549;
+ add.s32 %r4571, %r4570, %r4555;
+ mul.wide.s32 %rd1353, %r4571, 4;
+ add.s64 %rd1354, %rd3, %rd1353;
+ ld.global.f32 %f857, [%rd1354];
+ add.s32 %r4572, %r4569, %r4563;
+ add.s32 %r4573, %r4572, %r4567;
+ mul.wide.s32 %rd1355, %r4573, 4;
+ add.s64 %rd1356, %rd3, %rd1355;
+ ld.global.f32 %f858, [%rd1356];
+ mul.wide.s32 %rd1357, %r2587, 4;
+ add.s64 %rd1358, %rd1354, %rd1357;
+ ld.global.f32 %f859, [%rd1358];
+ add.s64 %rd1359, %rd1356, %rd1357;
+ ld.global.f32 %f860, [%rd1359];
+ add.s64 %rd1360, %rd1358, %rd1357;
+ ld.global.f32 %f861, [%rd1360];
+ add.s64 %rd1361, %rd1359, %rd1357;
+ ld.global.f32 %f862, [%rd1361];
+ mul.hi.s32 %r4575, %r4543, 954437177;
+ shr.u32 %r4576, %r4575, 31;
+ shr.s32 %r4577, %r4575, 1;
+ add.s32 %r4578, %r4577, %r4576;
+ mul.lo.s32 %r4579, %r4578, %r2579;
+ mad.lo.s32 %r4580, %r2578, %r7787, %r2576;
+ add.s32 %r4581, %r4580, %r2577;
+ add.s32 %r4582, %r4581, %r4579;
+ mul.lo.s32 %r4583, %r4578, 9;
+ sub.s32 %r4584, %r4543, %r4583;
+ mul.lo.s32 %r4585, %r4584, %r2580;
+ add.s32 %r4586, %r4582, %r4585;
+ mul.wide.s32 %rd1362, %r4586, 4;
+ add.s64 %rd1363, %rd2, %rd1362;
+ ld.global.f32 %f863, [%rd1363];
+ mul.hi.s32 %r4587, %r4557, 954437177;
+ shr.u32 %r4588, %r4587, 31;
+ shr.s32 %r4589, %r4587, 1;
+ add.s32 %r4590, %r4589, %r4588;
+ mul.lo.s32 %r4591, %r4590, %r2579;
+ add.s32 %r4592, %r4581, %r4591;
+ mul.lo.s32 %r4593, %r4590, 9;
+ sub.s32 %r4594, %r4557, %r4593;
+ mul.lo.s32 %r4595, %r4594, %r2580;
+ add.s32 %r4596, %r4592, %r4595;
+ mul.wide.s32 %rd1364, %r4596, 4;
+ add.s64 %rd1365, %rd2, %rd1364;
+ ld.global.f32 %f864, [%rd1365];
+ add.s32 %r4597, %r4581, %r2577;
+ add.s32 %r4598, %r4597, %r4579;
+ add.s32 %r4599, %r4598, %r4585;
+ mul.wide.s32 %rd1366, %r4599, 4;
+ add.s64 %rd1367, %rd2, %rd1366;
+ ld.global.f32 %f865, [%rd1367];
+ add.s32 %r4600, %r4597, %r4591;
+ add.s32 %r4601, %r4600, %r4595;
+ mul.wide.s32 %rd1368, %r4601, 4;
+ add.s64 %rd1369, %rd2, %rd1368;
+ ld.global.f32 %f866, [%rd1369];
+ add.s32 %r4602, %r4580, %r2576;
+ add.s32 %r4603, %r4602, %r4579;
+ add.s32 %r4604, %r4603, %r4585;
+ mul.wide.s32 %rd1370, %r4604, 4;
+ add.s64 %rd1371, %rd2, %rd1370;
+ ld.global.f32 %f867, [%rd1371];
+ add.s32 %r4605, %r4602, %r4591;
+ add.s32 %r4606, %r4605, %r4595;
+ mul.wide.s32 %rd1372, %r4606, 4;
+ add.s64 %rd1373, %rd2, %rd1372;
+ ld.global.f32 %f868, [%rd1373];
+ add.s32 %r4607, %r4602, %r2577;
+ add.s32 %r4608, %r4607, %r4579;
+ add.s32 %r4609, %r4608, %r4585;
+ mul.wide.s32 %rd1374, %r4609, 4;
+ add.s64 %rd1375, %rd2, %rd1374;
+ ld.global.f32 %f869, [%rd1375];
+ add.s32 %r4610, %r4607, %r4591;
+ add.s32 %r4611, %r4610, %r4595;
+ mul.wide.s32 %rd1376, %r4611, 4;
+ add.s64 %rd1377, %rd2, %rd1376;
+ ld.global.f32 %f870, [%rd1377];
+ mul.f32 %f3355, %f863, 0f3F22F983;
+ cvt.rni.s32.f32 %r8162, %f3355;
+ cvt.rn.f32.s32 %f3356, %r8162;
+ mov.f32 %f3357, 0fBFC90FDA;
+ fma.rn.f32 %f3358, %f3356, %f3357, %f863;
+ mov.f32 %f3359, 0fB3A22168;
+ fma.rn.f32 %f3360, %f3356, %f3359, %f3358;
+ mov.f32 %f3361, 0fA7C234C5;
+ fma.rn.f32 %f5544, %f3356, %f3361, %f3360;
+ abs.f32 %f872, %f863;
+ setp.ltu.f32 %p644, %f872, 0f47CE4780;
+ @%p644 bra $L__BB0_748;
+
+ setp.eq.f32 %p645, %f872, 0f7F800000;
+ @%p645 bra $L__BB0_747;
+ bra.uni $L__BB0_742;
+
+$L__BB0_747:
+ mov.f32 %f3364, 0f00000000;
+ mul.rn.f32 %f5544, %f863, %f3364;
+ mov.u32 %r8162, 0;
+ bra.uni $L__BB0_748;
+
+$L__BB0_468:
+ add.s32 %r630, %r12, 4;
+ setp.gt.s32 %p420, %r630, 14;
+ shl.b32 %r3846, %r2587, 2;
+ add.s32 %r631, %r14, %r3846;
+ @%p420 bra $L__BB0_473;
+
+ shl.b32 %r632, %r12, 5;
+ neg.s32 %r3847, %r632;
+ setp.ge.s32 %p421, %r11, %r3847;
+ @%p421 bra $L__BB0_471;
+
+ add.s32 %r3848, %r632, %r1;
+ mul.hi.s32 %r3849, %r3848, -1840700269;
+ add.s32 %r3850, %r3849, %r3848;
+ shr.u32 %r3851, %r3850, 31;
+ shr.s32 %r3852, %r3850, 2;
+ add.s32 %r3853, %r3852, %r3851;
+ mad.lo.s32 %r3854, %r3853, %r2589, %r631;
+ mul.lo.s32 %r3855, %r3853, 7;
+ sub.s32 %r3856, %r3848, %r3855;
+ mad.lo.s32 %r3857, %r3856, %r2590, %r3854;
+ mul.wide.s32 %rd1093, %r3857, 4;
+ add.s64 %rd1094, %rd3, %rd1093;
+ ld.global.f32 %f5607, [%rd1094];
$L__BB0_471:
- add.s32 %r641, %r8353, 1;
- and.b32 %r642, %r641, 1;
- setp.eq.s32 %p421, %r642, 0;
- selp.f32 %f504, %f5322, 0f3F800000, %p421;
- mul.rn.f32 %f505, %f5322, %f5322;
- mov.f32 %f5323, 0fB94D4153;
- @%p421 bra $L__BB0_473;
-
- mov.f32 %f2936, 0fBAB607ED;
- mov.f32 %f2937, 0f37CBAC00;
- fma.rn.f32 %f5323, %f2937, %f505, %f2936;
+ mov.u32 %r3858, -32;
+ sub.s32 %r3859, %r3858, %r632;
+ setp.ge.s32 %p422, %r11, %r3859;
+ @%p422 bra $L__BB0_473;
+
+ add.s32 %r3860, %r632, %r1;
+ add.s32 %r3861, %r3860, 32;
+ mul.hi.s32 %r3862, %r3861, -1840700269;
+ add.s32 %r3863, %r3862, %r3861;
+ shr.u32 %r3864, %r3863, 31;
+ shr.s32 %r3865, %r3863, 2;
+ add.s32 %r3866, %r3865, %r3864;
+ mad.lo.s32 %r3867, %r3866, %r2589, %r631;
+ mul.lo.s32 %r3868, %r3866, 7;
+ sub.s32 %r3869, %r3861, %r3868;
+ mad.lo.s32 %r3870, %r3869, %r2590, %r3867;
+ mul.wide.s32 %rd1095, %r3870, 4;
+ add.s64 %rd1096, %rd3, %rd1095;
+ ld.global.f32 %f5606, [%rd1096];
$L__BB0_473:
- selp.f32 %f2938, 0f3C0885E4, 0f3D2AAABB, %p421;
- fma.rn.f32 %f2939, %f5323, %f505, %f2938;
- selp.f32 %f2940, 0fBE2AAAA8, 0fBEFFFFFF, %p421;
- fma.rn.f32 %f2941, %f2939, %f505, %f2940;
- mov.f32 %f2942, 0f00000000;
- fma.rn.f32 %f2943, %f505, %f504, %f2942;
- fma.rn.f32 %f5324, %f2941, %f2943, %f504;
- and.b32 %r3907, %r641, 2;
- setp.eq.s32 %p423, %r3907, 0;
- @%p423 bra $L__BB0_475;
-
- mov.f32 %f2945, 0fBF800000;
- fma.rn.f32 %f5324, %f5324, %f2945, %f2942;
-
-$L__BB0_475:
- add.f32 %f5325, %f5321, %f5324;
- bra.uni $L__BB0_476;
-
-$L__BB0_56:
- mov.b32 %r2830, %f5348;
- shl.b32 %r2831, %r2830, 8;
- or.b32 %r47, %r2831, -2147483648;
- mov.u64 %rd2498, 0;
+ add.s32 %r633, %r12, 5;
+ setp.gt.s32 %p423, %r633, 14;
+ add.s32 %r634, %r2586, %r14;
+ @%p423 bra $L__BB0_478;
+
+ shl.b32 %r635, %r12, 5;
+ neg.s32 %r3871, %r635;
+ setp.ge.s32 %p424, %r11, %r3871;
+ @%p424 bra $L__BB0_476;
+
+ add.s32 %r3872, %r635, %r1;
+ mul.hi.s32 %r3873, %r3872, -1840700269;
+ add.s32 %r3874, %r3873, %r3872;
+ shr.u32 %r3875, %r3874, 31;
+ shr.s32 %r3876, %r3874, 2;
+ add.s32 %r3877, %r3876, %r3875;
+ mad.lo.s32 %r3878, %r3877, %r2589, %r634;
+ mul.lo.s32 %r3879, %r3877, 7;
+ sub.s32 %r3880, %r3872, %r3879;
+ mad.lo.s32 %r3881, %r3880, %r2590, %r3878;
+ mul.wide.s32 %rd1097, %r3881, 4;
+ add.s64 %rd1098, %rd3, %rd1097;
+ ld.global.f32 %f5406, [%rd1098];
+
+$L__BB0_476:
+ mov.u32 %r3882, -32;
+ sub.s32 %r3883, %r3882, %r635;
+ setp.ge.s32 %p425, %r11, %r3883;
+ @%p425 bra $L__BB0_478;
+
+ add.s32 %r3884, %r635, %r1;
+ add.s32 %r3885, %r3884, 32;
+ mul.hi.s32 %r3886, %r3885, -1840700269;
+ add.s32 %r3887, %r3886, %r3885;
+ shr.u32 %r3888, %r3887, 31;
+ shr.s32 %r3889, %r3887, 2;
+ add.s32 %r3890, %r3889, %r3888;
+ mad.lo.s32 %r3891, %r3890, %r2589, %r634;
+ mul.lo.s32 %r3892, %r3890, 7;
+ sub.s32 %r3893, %r3885, %r3892;
+ mad.lo.s32 %r3894, %r3893, %r2590, %r3891;
+ mul.wide.s32 %rd1099, %r3894, 4;
+ add.s64 %rd1100, %rd3, %rd1099;
+ ld.global.f32 %f5405, [%rd1100];
+
+$L__BB0_478:
+ add.s32 %r636, %r12, 6;
+ setp.gt.s32 %p426, %r636, 14;
+ add.s32 %r637, %r634, %r2587;
+ @%p426 bra $L__BB0_483;
+
+ shl.b32 %r638, %r12, 5;
+ neg.s32 %r3895, %r638;
+ setp.ge.s32 %p427, %r11, %r3895;
+ @%p427 bra $L__BB0_481;
+
+ add.s32 %r3896, %r638, %r1;
+ mul.hi.s32 %r3897, %r3896, -1840700269;
+ add.s32 %r3898, %r3897, %r3896;
+ shr.u32 %r3899, %r3898, 31;
+ shr.s32 %r3900, %r3898, 2;
+ add.s32 %r3901, %r3900, %r3899;
+ mad.lo.s32 %r3902, %r3901, %r2589, %r637;
+ mul.lo.s32 %r3903, %r3901, 7;
+ sub.s32 %r3904, %r3896, %r3903;
+ mad.lo.s32 %r3905, %r3904, %r2590, %r3902;
+ mul.wide.s32 %rd1101, %r3905, 4;
+ add.s64 %rd1102, %rd3, %rd1101;
+ ld.global.f32 %f5404, [%rd1102];
+
+$L__BB0_481:
+ mov.u32 %r3906, -32;
+ sub.s32 %r3907, %r3906, %r638;
+ setp.ge.s32 %p428, %r11, %r3907;
+ @%p428 bra $L__BB0_483;
+
+ add.s32 %r3908, %r638, %r1;
+ add.s32 %r3909, %r3908, 32;
+ mul.hi.s32 %r3910, %r3909, -1840700269;
+ add.s32 %r3911, %r3910, %r3909;
+ shr.u32 %r3912, %r3911, 31;
+ shr.s32 %r3913, %r3911, 2;
+ add.s32 %r3914, %r3913, %r3912;
+ mad.lo.s32 %r3915, %r3914, %r2589, %r637;
+ mul.lo.s32 %r3916, %r3914, 7;
+ sub.s32 %r3917, %r3909, %r3916;
+ mad.lo.s32 %r3918, %r3917, %r2590, %r3915;
+ mul.wide.s32 %rd1103, %r3918, 4;
+ add.s64 %rd1104, %rd3, %rd1103;
+ ld.global.f32 %f5403, [%rd1104];
+
+$L__BB0_483:
+ add.s32 %r639, %r12, 7;
+ setp.gt.s32 %p429, %r639, 14;
+ add.s32 %r640, %r637, %r2587;
+ @%p429 bra $L__BB0_488;
+
+ shl.b32 %r641, %r12, 5;
+ neg.s32 %r3919, %r641;
+ setp.ge.s32 %p430, %r11, %r3919;
+ @%p430 bra $L__BB0_486;
+
+ add.s32 %r3920, %r641, %r1;
+ mul.hi.s32 %r3921, %r3920, -1840700269;
+ add.s32 %r3922, %r3921, %r3920;
+ shr.u32 %r3923, %r3922, 31;
+ shr.s32 %r3924, %r3922, 2;
+ add.s32 %r3925, %r3924, %r3923;
+ mad.lo.s32 %r3926, %r3925, %r2589, %r640;
+ mul.lo.s32 %r3927, %r3925, 7;
+ sub.s32 %r3928, %r3920, %r3927;
+ mad.lo.s32 %r3929, %r3928, %r2590, %r3926;
+ mul.wide.s32 %rd1105, %r3929, 4;
+ add.s64 %rd1106, %rd3, %rd1105;
+ ld.global.f32 %f5402, [%rd1106];
+
+$L__BB0_486:
+ mov.u32 %r3930, -32;
+ sub.s32 %r3931, %r3930, %r641;
+ setp.ge.s32 %p431, %r11, %r3931;
+ @%p431 bra $L__BB0_488;
+
+ add.s32 %r3932, %r641, %r1;
+ add.s32 %r3933, %r3932, 32;
+ mul.hi.s32 %r3934, %r3933, -1840700269;
+ add.s32 %r3935, %r3934, %r3933;
+ shr.u32 %r3936, %r3935, 31;
+ shr.s32 %r3937, %r3935, 2;
+ add.s32 %r3938, %r3937, %r3936;
+ mad.lo.s32 %r3939, %r3938, %r2589, %r640;
+ mul.lo.s32 %r3940, %r3938, 7;
+ sub.s32 %r3941, %r3933, %r3940;
+ mad.lo.s32 %r3942, %r3941, %r2590, %r3939;
+ mul.wide.s32 %rd1107, %r3942, 4;
+ add.s64 %rd1108, %rd3, %rd1107;
+ ld.global.f32 %f5401, [%rd1108];
+
+$L__BB0_488:
+ mov.u32 %r7781, %ctaid.x;
+ mul.lo.s32 %r642, %r2578, %r7781;
+ add.s32 %r3944, %r2576, %r642;
+ add.s32 %r643, %r3944, %r2577;
+ @%p420 bra $L__BB0_493;
+
+ shl.b32 %r644, %r12, 5;
+ neg.s32 %r3945, %r644;
+ setp.ge.s32 %p433, %r11, %r3945;
+ @%p433 bra $L__BB0_491;
+
+ add.s32 %r3946, %r644, %r1;
+ mul.hi.s32 %r3947, %r3946, 954437177;
+ shr.u32 %r3948, %r3947, 31;
+ shr.s32 %r3949, %r3947, 1;
+ add.s32 %r3950, %r3949, %r3948;
+ mad.lo.s32 %r3951, %r3950, %r2579, %r643;
+ mul.lo.s32 %r3952, %r3950, 9;
+ sub.s32 %r3953, %r3946, %r3952;
+ mad.lo.s32 %r3954, %r3953, %r2580, %r3951;
+ mul.wide.s32 %rd1109, %r3954, 4;
+ add.s64 %rd1110, %rd2, %rd1109;
+ ld.global.f32 %f5416, [%rd1110];
+
+$L__BB0_491:
+ mov.u32 %r3955, -32;
+ sub.s32 %r3956, %r3955, %r644;
+ setp.ge.s32 %p434, %r11, %r3956;
+ @%p434 bra $L__BB0_493;
+
+ add.s32 %r3957, %r644, %r1;
+ add.s32 %r3958, %r3957, 32;
+ mul.hi.s32 %r3959, %r3958, 954437177;
+ shr.u32 %r3960, %r3959, 31;
+ shr.s32 %r3961, %r3959, 1;
+ add.s32 %r3962, %r3961, %r3960;
+ mad.lo.s32 %r3963, %r3962, %r2579, %r643;
+ mul.lo.s32 %r3964, %r3962, 9;
+ sub.s32 %r3965, %r3958, %r3964;
+ mad.lo.s32 %r3966, %r3965, %r2580, %r3963;
+ mul.wide.s32 %rd1111, %r3966, 4;
+ add.s64 %rd1112, %rd2, %rd1111;
+ ld.global.f32 %f5415, [%rd1112];
+
+$L__BB0_493:
+ add.s32 %r645, %r643, %r2577;
+ @%p423 bra $L__BB0_498;
+
+ shl.b32 %r646, %r12, 5;
+ neg.s32 %r3967, %r646;
+ setp.ge.s32 %p436, %r11, %r3967;
+ @%p436 bra $L__BB0_496;
+
+ add.s32 %r3968, %r646, %r1;
+ mul.hi.s32 %r3969, %r3968, 954437177;
+ shr.u32 %r3970, %r3969, 31;
+ shr.s32 %r3971, %r3969, 1;
+ add.s32 %r3972, %r3971, %r3970;
+ mad.lo.s32 %r3973, %r3972, %r2579, %r645;
+ mul.lo.s32 %r3974, %r3972, 9;
+ sub.s32 %r3975, %r3968, %r3974;
+ mad.lo.s32 %r3976, %r3975, %r2580, %r3973;
+ mul.wide.s32 %rd1113, %r3976, 4;
+ add.s64 %rd1114, %rd2, %rd1113;
+ ld.global.f32 %f5414, [%rd1114];
+
+$L__BB0_496:
+ mov.u32 %r3977, -32;
+ sub.s32 %r3978, %r3977, %r646;
+ setp.ge.s32 %p437, %r11, %r3978;
+ @%p437 bra $L__BB0_498;
+
+ add.s32 %r3979, %r646, %r1;
+ add.s32 %r3980, %r3979, 32;
+ mul.hi.s32 %r3981, %r3980, 954437177;
+ shr.u32 %r3982, %r3981, 31;
+ shr.s32 %r3983, %r3981, 1;
+ add.s32 %r3984, %r3983, %r3982;
+ mad.lo.s32 %r3985, %r3984, %r2579, %r645;
+ mul.lo.s32 %r3986, %r3984, 9;
+ sub.s32 %r3987, %r3980, %r3986;
+ mad.lo.s32 %r3988, %r3987, %r2580, %r3985;
+ mul.wide.s32 %rd1115, %r3988, 4;
+ add.s64 %rd1116, %rd2, %rd1115;
+ ld.global.f32 %f5413, [%rd1116];
+
+$L__BB0_498:
+ shl.b32 %r3989, %r2576, 1;
+ add.s32 %r647, %r3989, %r642;
+ @%p426 bra $L__BB0_503;
+
+ shl.b32 %r648, %r12, 5;
+ neg.s32 %r3990, %r648;
+ setp.ge.s32 %p439, %r11, %r3990;
+ @%p439 bra $L__BB0_501;
+
+ add.s32 %r3991, %r648, %r1;
+ mul.hi.s32 %r3992, %r3991, 954437177;
+ shr.u32 %r3993, %r3992, 31;
+ shr.s32 %r3994, %r3992, 1;
+ add.s32 %r3995, %r3994, %r3993;
+ mad.lo.s32 %r3996, %r3995, %r2579, %r647;
+ mul.lo.s32 %r3997, %r3995, 9;
+ sub.s32 %r3998, %r3991, %r3997;
+ mad.lo.s32 %r3999, %r3998, %r2580, %r3996;
+ mul.wide.s32 %rd1117, %r3999, 4;
+ add.s64 %rd1118, %rd2, %rd1117;
+ ld.global.f32 %f5412, [%rd1118];
+
+$L__BB0_501:
+ mov.u32 %r4000, -32;
+ sub.s32 %r4001, %r4000, %r648;
+ setp.ge.s32 %p440, %r11, %r4001;
+ @%p440 bra $L__BB0_503;
+
+ add.s32 %r4002, %r648, %r1;
+ add.s32 %r4003, %r4002, 32;
+ mul.hi.s32 %r4004, %r4003, 954437177;
+ shr.u32 %r4005, %r4004, 31;
+ shr.s32 %r4006, %r4004, 1;
+ add.s32 %r4007, %r4006, %r4005;
+ mad.lo.s32 %r4008, %r4007, %r2579, %r647;
+ mul.lo.s32 %r4009, %r4007, 9;
+ sub.s32 %r4010, %r4003, %r4009;
+ mad.lo.s32 %r4011, %r4010, %r2580, %r4008;
+ mul.wide.s32 %rd1119, %r4011, 4;
+ add.s64 %rd1120, %rd2, %rd1119;
+ ld.global.f32 %f5411, [%rd1120];
+
+$L__BB0_503:
+ add.s32 %r649, %r647, %r2577;
+ @%p429 bra $L__BB0_508;
+
+ shl.b32 %r650, %r12, 5;
+ neg.s32 %r4012, %r650;
+ setp.ge.s32 %p442, %r11, %r4012;
+ @%p442 bra $L__BB0_506;
+
+ add.s32 %r4013, %r650, %r1;
+ mul.hi.s32 %r4014, %r4013, 954437177;
+ shr.u32 %r4015, %r4014, 31;
+ shr.s32 %r4016, %r4014, 1;
+ add.s32 %r4017, %r4016, %r4015;
+ mad.lo.s32 %r4018, %r4017, %r2579, %r649;
+ mul.lo.s32 %r4019, %r4017, 9;
+ sub.s32 %r4020, %r4013, %r4019;
+ mad.lo.s32 %r4021, %r4020, %r2580, %r4018;
+ mul.wide.s32 %rd1121, %r4021, 4;
+ add.s64 %rd1122, %rd2, %rd1121;
+ ld.global.f32 %f5410, [%rd1122];
+
+$L__BB0_506:
+ mov.u32 %r4022, -32;
+ sub.s32 %r4023, %r4022, %r650;
+ setp.ge.s32 %p443, %r11, %r4023;
+ @%p443 bra $L__BB0_508;
+
+ add.s32 %r4024, %r650, %r1;
+ add.s32 %r4025, %r4024, 32;
+ mul.hi.s32 %r4026, %r4025, 954437177;
+ shr.u32 %r4027, %r4026, 31;
+ shr.s32 %r4028, %r4026, 1;
+ add.s32 %r4029, %r4028, %r4027;
+ mad.lo.s32 %r4030, %r4029, %r2579, %r649;
+ mul.lo.s32 %r4031, %r4029, 9;
+ sub.s32 %r4032, %r4025, %r4031;
+ mad.lo.s32 %r4033, %r4032, %r2580, %r4030;
+ mul.wide.s32 %rd1123, %r4033, 4;
+ add.s64 %rd1124, %rd2, %rd1123;
+ ld.global.f32 %f5409, [%rd1124];
+
+$L__BB0_508:
+ @%p420 bra $L__BB0_537;
+
+ shl.b32 %r4034, %r12, 5;
+ neg.s32 %r651, %r4034;
+ setp.ge.s32 %p445, %r11, %r651;
+ @%p445 bra $L__BB0_522;
+
+ mul.f32 %f3004, %f5416, 0f3F22F983;
+ cvt.rni.s32.f32 %r8098, %f3004;
+ cvt.rn.f32.s32 %f3005, %r8098;
+ mov.f32 %f3006, 0fBFC90FDA;
+ fma.rn.f32 %f3007, %f3005, %f3006, %f5416;
+ mov.f32 %f3008, 0fB3A22168;
+ fma.rn.f32 %f3009, %f3005, %f3008, %f3007;
+ mov.f32 %f3010, 0fA7C234C5;
+ fma.rn.f32 %f5445, %f3005, %f3010, %f3009;
+ abs.f32 %f589, %f5416;
+ setp.ltu.f32 %p446, %f589, 0f47CE4780;
+ @%p446 bra $L__BB0_518;
+
+ setp.eq.f32 %p447, %f589, 0f7F800000;
+ @%p447 bra $L__BB0_517;
+ bra.uni $L__BB0_512;
+
+$L__BB0_517:
+ mov.f32 %f3013, 0f00000000;
+ mul.rn.f32 %f5445, %f5416, %f3013;
+ mov.u32 %r8098, 0;
+ bra.uni $L__BB0_518;
+
+$L__BB0_742:
+ mov.b32 %r956, %f863;
+ shr.u32 %r4613, %r956, 23;
+ and.b32 %r4614, %r4613, 255;
+ add.s32 %r957, %r4614, -128;
+ shl.b32 %r4615, %r956, 8;
+ or.b32 %r958, %r4615, -2147483648;
+ shr.u32 %r959, %r957, 5;
+ mov.u64 %rd2601, 0;
+ mov.u32 %r8159, 0;
+ mov.u64 %rd1381, __cudart_i2opi_f;
+ mov.u64 %rd2602, %rd2601;
+
+$L__BB0_743:
+ .pragma "nounroll";
+ shl.b64 %rd1380, %rd2601, 2;
+ add.s64 %rd1382, %rd1381, %rd1380;
+ ld.global.nc.u32 %r4616, [%rd1382];
+ mad.wide.u32 %rd1383, %r4616, %r958, %rd2602;
+ shr.u64 %rd2602, %rd1383, 32;
+ add.s64 %rd1384, %rd1, %rd1380;
+ st.local.u32 [%rd1384], %rd1383;
+ add.s32 %r8159, %r8159, 1;
+ cvt.s64.s32 %rd2601, %r8159;
+ setp.ne.s32 %p646, %r8159, 6;
+ @%p646 bra $L__BB0_743;
+
+ st.local.u32 [%rd4], %rd2602;
+ mov.u32 %r4617, 4;
+ sub.s32 %r962, %r4617, %r959;
+ mov.u32 %r4618, 6;
+ sub.s32 %r4619, %r4618, %r959;
+ mul.wide.s32 %rd1385, %r4619, 4;
+ add.s64 %rd1386, %rd1, %rd1385;
+ ld.local.u32 %r8160, [%rd1386];
+ ld.local.u32 %r8161, [%rd1386+-4];
+ and.b32 %r965, %r957, 31;
+ setp.eq.s32 %p647, %r965, 0;
+ @%p647 bra $L__BB0_746;
+
+ mov.u32 %r4620, 32;
+ sub.s32 %r4621, %r4620, %r965;
+ shr.u32 %r4622, %r8161, %r4621;
+ shl.b32 %r4623, %r8160, %r965;
+ add.s32 %r8160, %r4622, %r4623;
+ mul.wide.s32 %rd1387, %r962, 4;
+ add.s64 %rd1388, %rd1, %rd1387;
+ ld.local.u32 %r4624, [%rd1388];
+ shr.u32 %r4625, %r4624, %r4621;
+ shl.b32 %r4626, %r8161, %r965;
+ add.s32 %r8161, %r4625, %r4626;
+
+$L__BB0_746:
+ and.b32 %r4627, %r956, -2147483648;
+ shr.u32 %r4628, %r8161, 30;
+ shl.b32 %r4629, %r8160, 2;
+ or.b32 %r4630, %r4628, %r4629;
+ shr.u32 %r4631, %r4630, 31;
+ shr.u32 %r4632, %r8160, 30;
+ add.s32 %r4633, %r4631, %r4632;
+ neg.s32 %r4634, %r4633;
+ setp.eq.s32 %p648, %r4627, 0;
+ selp.b32 %r8162, %r4633, %r4634, %p648;
+ setp.ne.s32 %p649, %r4631, 0;
+ xor.b32 %r4635, %r4627, -2147483648;
+ selp.b32 %r4636, %r4635, %r4627, %p649;
+ selp.b32 %r4637, -1, 0, %p649;
+ xor.b32 %r4638, %r4630, %r4637;
+ shl.b32 %r4639, %r8161, 2;
+ xor.b32 %r4640, %r4639, %r4637;
+ cvt.u64.u32 %rd1389, %r4638;
+ cvt.u64.u32 %rd1390, %r4640;
+ bfi.b64 %rd1391, %rd1389, %rd1390, 32, 32;
+ cvt.rn.f64.s64 %fd97, %rd1391;
+ mul.f64 %fd98, %fd97, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3362, %fd98;
+ setp.eq.s32 %p650, %r4636, 0;
+ neg.f32 %f3363, %f3362;
+ selp.f32 %f5544, %f3362, %f3363, %p650;
+
+$L__BB0_748:
+ and.b32 %r972, %r8162, 1;
+ setp.eq.s32 %p651, %r972, 0;
+ selp.f32 %f876, %f5544, 0f3F800000, %p651;
+ mul.rn.f32 %f877, %f5544, %f5544;
+ mov.f32 %f5545, 0fB94D4153;
+ @%p651 bra $L__BB0_750;
+
+ mov.f32 %f3366, 0fBAB607ED;
+ mov.f32 %f3367, 0f37CBAC00;
+ fma.rn.f32 %f5545, %f3367, %f877, %f3366;
+
+$L__BB0_750:
+ selp.f32 %f3368, 0f3C0885E4, 0f3D2AAABB, %p651;
+ fma.rn.f32 %f3369, %f5545, %f877, %f3368;
+ selp.f32 %f3370, 0fBE2AAAA8, 0fBEFFFFFF, %p651;
+ fma.rn.f32 %f3371, %f3369, %f877, %f3370;
+ mov.f32 %f3372, 0f00000000;
+ fma.rn.f32 %f3373, %f877, %f876, %f3372;
+ fma.rn.f32 %f5546, %f3371, %f3373, %f876;
+ and.b32 %r4642, %r8162, 2;
+ setp.eq.s32 %p653, %r4642, 0;
+ @%p653 bra $L__BB0_752;
+
+ mov.f32 %f3375, 0fBF800000;
+ fma.rn.f32 %f5546, %f5546, %f3375, %f3372;
+
+$L__BB0_752:
+ mul.f32 %f3376, %f855, 0f3F22F983;
+ cvt.rni.s32.f32 %r8166, %f3376;
+ cvt.rn.f32.s32 %f3377, %r8166;
+ mov.f32 %f3378, 0fBFC90FDA;
+ fma.rn.f32 %f3379, %f3377, %f3378, %f855;
+ mov.f32 %f3380, 0fB3A22168;
+ fma.rn.f32 %f3381, %f3377, %f3380, %f3379;
+ mov.f32 %f3382, 0fA7C234C5;
+ fma.rn.f32 %f5547, %f3377, %f3382, %f3381;
+ abs.f32 %f884, %f855;
+ setp.ltu.f32 %p654, %f884, 0f47CE4780;
+ @%p654 bra $L__BB0_760;
+
+ setp.eq.f32 %p655, %f884, 0f7F800000;
+ @%p655 bra $L__BB0_759;
+ bra.uni $L__BB0_754;
+
+$L__BB0_759:
+ mov.f32 %f3385, 0f00000000;
+ mul.rn.f32 %f5547, %f855, %f3385;
+ mov.u32 %r8166, 0;
+ bra.uni $L__BB0_760;
+
+$L__BB0_754:
+ mov.b32 %r974, %f855;
+ shr.u32 %r4644, %r974, 23;
+ and.b32 %r4645, %r4644, 255;
+ add.s32 %r975, %r4645, -128;
+ shl.b32 %r4646, %r974, 8;
+ or.b32 %r976, %r4646, -2147483648;
+ shr.u32 %r977, %r975, 5;
+ mov.u64 %rd2603, 0;
+ mov.u32 %r8163, 0;
+ mov.u64 %rd1395, __cudart_i2opi_f;
+ mov.u64 %rd2604, %rd2603;
+
+$L__BB0_755:
+ .pragma "nounroll";
+ shl.b64 %rd1394, %rd2603, 2;
+ add.s64 %rd1396, %rd1395, %rd1394;
+ ld.global.nc.u32 %r4647, [%rd1396];
+ mad.wide.u32 %rd1397, %r4647, %r976, %rd2604;
+ shr.u64 %rd2604, %rd1397, 32;
+ add.s64 %rd1398, %rd1, %rd1394;
+ st.local.u32 [%rd1398], %rd1397;
+ add.s32 %r8163, %r8163, 1;
+ cvt.s64.s32 %rd2603, %r8163;
+ setp.ne.s32 %p656, %r8163, 6;
+ @%p656 bra $L__BB0_755;
+
+ st.local.u32 [%rd4], %rd2604;
+ mov.u32 %r4648, 4;
+ sub.s32 %r980, %r4648, %r977;
+ mov.u32 %r4649, 6;
+ sub.s32 %r4650, %r4649, %r977;
+ mul.wide.s32 %rd1399, %r4650, 4;
+ add.s64 %rd1400, %rd1, %rd1399;
+ ld.local.u32 %r8164, [%rd1400];
+ ld.local.u32 %r8165, [%rd1400+-4];
+ and.b32 %r983, %r975, 31;
+ setp.eq.s32 %p657, %r983, 0;
+ @%p657 bra $L__BB0_758;
+
+ mov.u32 %r4651, 32;
+ sub.s32 %r4652, %r4651, %r983;
+ shr.u32 %r4653, %r8165, %r4652;
+ shl.b32 %r4654, %r8164, %r983;
+ add.s32 %r8164, %r4653, %r4654;
+ mul.wide.s32 %rd1401, %r980, 4;
+ add.s64 %rd1402, %rd1, %rd1401;
+ ld.local.u32 %r4655, [%rd1402];
+ shr.u32 %r4656, %r4655, %r4652;
+ shl.b32 %r4657, %r8165, %r983;
+ add.s32 %r8165, %r4656, %r4657;
+
+$L__BB0_758:
+ and.b32 %r4658, %r974, -2147483648;
+ shr.u32 %r4659, %r8165, 30;
+ shl.b32 %r4660, %r8164, 2;
+ or.b32 %r4661, %r4659, %r4660;
+ shr.u32 %r4662, %r4661, 31;
+ shr.u32 %r4663, %r8164, 30;
+ add.s32 %r4664, %r4662, %r4663;
+ neg.s32 %r4665, %r4664;
+ setp.eq.s32 %p658, %r4658, 0;
+ selp.b32 %r8166, %r4664, %r4665, %p658;
+ setp.ne.s32 %p659, %r4662, 0;
+ xor.b32 %r4666, %r4658, -2147483648;
+ selp.b32 %r4667, %r4666, %r4658, %p659;
+ selp.b32 %r4668, -1, 0, %p659;
+ xor.b32 %r4669, %r4661, %r4668;
+ shl.b32 %r4670, %r8165, 2;
+ xor.b32 %r4671, %r4670, %r4668;
+ cvt.u64.u32 %rd1403, %r4669;
+ cvt.u64.u32 %rd1404, %r4671;
+ bfi.b64 %rd1405, %rd1403, %rd1404, 32, 32;
+ cvt.rn.f64.s64 %fd99, %rd1405;
+ mul.f64 %fd100, %fd99, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3383, %fd100;
+ setp.eq.s32 %p660, %r4667, 0;
+ neg.f32 %f3384, %f3383;
+ selp.f32 %f5547, %f3383, %f3384, %p660;
+
+$L__BB0_760:
+ add.s32 %r990, %r8166, 1;
+ and.b32 %r991, %r990, 1;
+ setp.eq.s32 %p661, %r991, 0;
+ selp.f32 %f888, %f5547, 0f3F800000, %p661;
+ mul.rn.f32 %f889, %f5547, %f5547;
+ mov.f32 %f5548, 0fB94D4153;
+ @%p661 bra $L__BB0_762;
+
+ mov.f32 %f3387, 0fBAB607ED;
+ mov.f32 %f3388, 0f37CBAC00;
+ fma.rn.f32 %f5548, %f3388, %f889, %f3387;
+
+$L__BB0_762:
+ selp.f32 %f3389, 0f3C0885E4, 0f3D2AAABB, %p661;
+ fma.rn.f32 %f3390, %f5548, %f889, %f3389;
+ selp.f32 %f3391, 0fBE2AAAA8, 0fBEFFFFFF, %p661;
+ fma.rn.f32 %f3392, %f3390, %f889, %f3391;
+ mov.f32 %f3393, 0f00000000;
+ fma.rn.f32 %f3394, %f889, %f888, %f3393;
+ fma.rn.f32 %f5549, %f3392, %f3394, %f888;
+ and.b32 %r4673, %r990, 2;
+ setp.eq.s32 %p663, %r4673, 0;
+ @%p663 bra $L__BB0_764;
+
+ mov.f32 %f3396, 0fBF800000;
+ fma.rn.f32 %f5549, %f5549, %f3396, %f3393;
+
+$L__BB0_764:
+ add.f32 %f5599, %f5546, %f5549;
+ mul.f32 %f3397, %f864, 0f3F22F983;
+ cvt.rni.s32.f32 %r8170, %f3397;
+ cvt.rn.f32.s32 %f3398, %r8170;
+ mov.f32 %f3399, 0fBFC90FDA;
+ fma.rn.f32 %f3400, %f3398, %f3399, %f864;
+ mov.f32 %f3401, 0fB3A22168;
+ fma.rn.f32 %f3402, %f3398, %f3401, %f3400;
+ mov.f32 %f3403, 0fA7C234C5;
+ fma.rn.f32 %f5550, %f3398, %f3403, %f3402;
+ abs.f32 %f897, %f864;
+ setp.ltu.f32 %p664, %f897, 0f47CE4780;
+ @%p664 bra $L__BB0_772;
+
+ setp.eq.f32 %p665, %f897, 0f7F800000;
+ @%p665 bra $L__BB0_771;
+ bra.uni $L__BB0_766;
+
+$L__BB0_771:
+ mov.f32 %f3406, 0f00000000;
+ mul.rn.f32 %f5550, %f864, %f3406;
+ mov.u32 %r8170, 0;
+ bra.uni $L__BB0_772;
+
+$L__BB0_766:
+ mov.b32 %r993, %f864;
+ shr.u32 %r4675, %r993, 23;
+ and.b32 %r4676, %r4675, 255;
+ add.s32 %r994, %r4676, -128;
+ shl.b32 %r4677, %r993, 8;
+ or.b32 %r995, %r4677, -2147483648;
+ shr.u32 %r996, %r994, 5;
+ mov.u64 %rd2605, 0;
+ mov.u32 %r8167, 0;
+ mov.u64 %rd1409, __cudart_i2opi_f;
+ mov.u64 %rd2606, %rd2605;
+
+$L__BB0_767:
+ .pragma "nounroll";
+ shl.b64 %rd1408, %rd2605, 2;
+ add.s64 %rd1410, %rd1409, %rd1408;
+ ld.global.nc.u32 %r4678, [%rd1410];
+ mad.wide.u32 %rd1411, %r4678, %r995, %rd2606;
+ shr.u64 %rd2606, %rd1411, 32;
+ add.s64 %rd1412, %rd1, %rd1408;
+ st.local.u32 [%rd1412], %rd1411;
+ add.s32 %r8167, %r8167, 1;
+ cvt.s64.s32 %rd2605, %r8167;
+ setp.ne.s32 %p666, %r8167, 6;
+ @%p666 bra $L__BB0_767;
+
+ st.local.u32 [%rd4], %rd2606;
+ mov.u32 %r4679, 4;
+ sub.s32 %r999, %r4679, %r996;
+ mov.u32 %r4680, 6;
+ sub.s32 %r4681, %r4680, %r996;
+ mul.wide.s32 %rd1413, %r4681, 4;
+ add.s64 %rd1414, %rd1, %rd1413;
+ ld.local.u32 %r8168, [%rd1414];
+ ld.local.u32 %r8169, [%rd1414+-4];
+ and.b32 %r1002, %r994, 31;
+ setp.eq.s32 %p667, %r1002, 0;
+ @%p667 bra $L__BB0_770;
+
+ mov.u32 %r4682, 32;
+ sub.s32 %r4683, %r4682, %r1002;
+ shr.u32 %r4684, %r8169, %r4683;
+ shl.b32 %r4685, %r8168, %r1002;
+ add.s32 %r8168, %r4684, %r4685;
+ mul.wide.s32 %rd1415, %r999, 4;
+ add.s64 %rd1416, %rd1, %rd1415;
+ ld.local.u32 %r4686, [%rd1416];
+ shr.u32 %r4687, %r4686, %r4683;
+ shl.b32 %r4688, %r8169, %r1002;
+ add.s32 %r8169, %r4687, %r4688;
+
+$L__BB0_770:
+ and.b32 %r4689, %r993, -2147483648;
+ shr.u32 %r4690, %r8169, 30;
+ shl.b32 %r4691, %r8168, 2;
+ or.b32 %r4692, %r4690, %r4691;
+ shr.u32 %r4693, %r4692, 31;
+ shr.u32 %r4694, %r8168, 30;
+ add.s32 %r4695, %r4693, %r4694;
+ neg.s32 %r4696, %r4695;
+ setp.eq.s32 %p668, %r4689, 0;
+ selp.b32 %r8170, %r4695, %r4696, %p668;
+ setp.ne.s32 %p669, %r4693, 0;
+ xor.b32 %r4697, %r4689, -2147483648;
+ selp.b32 %r4698, %r4697, %r4689, %p669;
+ selp.b32 %r4699, -1, 0, %p669;
+ xor.b32 %r4700, %r4692, %r4699;
+ shl.b32 %r4701, %r8169, 2;
+ xor.b32 %r4702, %r4701, %r4699;
+ cvt.u64.u32 %rd1417, %r4700;
+ cvt.u64.u32 %rd1418, %r4702;
+ bfi.b64 %rd1419, %rd1417, %rd1418, 32, 32;
+ cvt.rn.f64.s64 %fd101, %rd1419;
+ mul.f64 %fd102, %fd101, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3404, %fd102;
+ setp.eq.s32 %p670, %r4698, 0;
+ neg.f32 %f3405, %f3404;
+ selp.f32 %f5550, %f3404, %f3405, %p670;
+
+$L__BB0_772:
+ and.b32 %r1009, %r8170, 1;
+ setp.eq.s32 %p671, %r1009, 0;
+ selp.f32 %f901, %f5550, 0f3F800000, %p671;
+ mul.rn.f32 %f902, %f5550, %f5550;
+ mov.f32 %f5551, 0fB94D4153;
+ @%p671 bra $L__BB0_774;
+
+ mov.f32 %f3408, 0fBAB607ED;
+ mov.f32 %f3409, 0f37CBAC00;
+ fma.rn.f32 %f5551, %f3409, %f902, %f3408;
+
+$L__BB0_774:
+ selp.f32 %f3410, 0f3C0885E4, 0f3D2AAABB, %p671;
+ fma.rn.f32 %f3411, %f5551, %f902, %f3410;
+ selp.f32 %f3412, 0fBE2AAAA8, 0fBEFFFFFF, %p671;
+ fma.rn.f32 %f3413, %f3411, %f902, %f3412;
+ mov.f32 %f3414, 0f00000000;
+ fma.rn.f32 %f3415, %f902, %f901, %f3414;
+ fma.rn.f32 %f5552, %f3413, %f3415, %f901;
+ and.b32 %r4704, %r8170, 2;
+ setp.eq.s32 %p673, %r4704, 0;
+ @%p673 bra $L__BB0_776;
+
+ mov.f32 %f3417, 0fBF800000;
+ fma.rn.f32 %f5552, %f5552, %f3417, %f3414;
+
+$L__BB0_776:
+ mul.f32 %f3418, %f856, 0f3F22F983;
+ cvt.rni.s32.f32 %r8174, %f3418;
+ cvt.rn.f32.s32 %f3419, %r8174;
+ mov.f32 %f3420, 0fBFC90FDA;
+ fma.rn.f32 %f3421, %f3419, %f3420, %f856;
+ mov.f32 %f3422, 0fB3A22168;
+ fma.rn.f32 %f3423, %f3419, %f3422, %f3421;
+ mov.f32 %f3424, 0fA7C234C5;
+ fma.rn.f32 %f5553, %f3419, %f3424, %f3423;
+ abs.f32 %f909, %f856;
+ setp.ltu.f32 %p674, %f909, 0f47CE4780;
+ @%p674 bra $L__BB0_784;
+
+ setp.eq.f32 %p675, %f909, 0f7F800000;
+ @%p675 bra $L__BB0_783;
+ bra.uni $L__BB0_778;
+
+$L__BB0_783:
+ mov.f32 %f3427, 0f00000000;
+ mul.rn.f32 %f5553, %f856, %f3427;
+ mov.u32 %r8174, 0;
+ bra.uni $L__BB0_784;
+
+$L__BB0_778:
+ mov.b32 %r1011, %f856;
+ shr.u32 %r4706, %r1011, 23;
+ and.b32 %r4707, %r4706, 255;
+ add.s32 %r1012, %r4707, -128;
+ shl.b32 %r4708, %r1011, 8;
+ or.b32 %r1013, %r4708, -2147483648;
+ shr.u32 %r1014, %r1012, 5;
+ mov.u64 %rd2607, 0;
+ mov.u32 %r8171, 0;
+ mov.u64 %rd1423, __cudart_i2opi_f;
+ mov.u64 %rd2608, %rd2607;
+
+$L__BB0_779:
+ .pragma "nounroll";
+ shl.b64 %rd1422, %rd2607, 2;
+ add.s64 %rd1424, %rd1423, %rd1422;
+ ld.global.nc.u32 %r4709, [%rd1424];
+ mad.wide.u32 %rd1425, %r4709, %r1013, %rd2608;
+ shr.u64 %rd2608, %rd1425, 32;
+ add.s64 %rd1426, %rd1, %rd1422;
+ st.local.u32 [%rd1426], %rd1425;
+ add.s32 %r8171, %r8171, 1;
+ cvt.s64.s32 %rd2607, %r8171;
+ setp.ne.s32 %p676, %r8171, 6;
+ @%p676 bra $L__BB0_779;
+
+ st.local.u32 [%rd4], %rd2608;
+ mov.u32 %r4710, 4;
+ sub.s32 %r1017, %r4710, %r1014;
+ mov.u32 %r4711, 6;
+ sub.s32 %r4712, %r4711, %r1014;
+ mul.wide.s32 %rd1427, %r4712, 4;
+ add.s64 %rd1428, %rd1, %rd1427;
+ ld.local.u32 %r8172, [%rd1428];
+ ld.local.u32 %r8173, [%rd1428+-4];
+ and.b32 %r1020, %r1012, 31;
+ setp.eq.s32 %p677, %r1020, 0;
+ @%p677 bra $L__BB0_782;
+
+ mov.u32 %r4713, 32;
+ sub.s32 %r4714, %r4713, %r1020;
+ shr.u32 %r4715, %r8173, %r4714;
+ shl.b32 %r4716, %r8172, %r1020;
+ add.s32 %r8172, %r4715, %r4716;
+ mul.wide.s32 %rd1429, %r1017, 4;
+ add.s64 %rd1430, %rd1, %rd1429;
+ ld.local.u32 %r4717, [%rd1430];
+ shr.u32 %r4718, %r4717, %r4714;
+ shl.b32 %r4719, %r8173, %r1020;
+ add.s32 %r8173, %r4718, %r4719;
+
+$L__BB0_782:
+ and.b32 %r4720, %r1011, -2147483648;
+ shr.u32 %r4721, %r8173, 30;
+ shl.b32 %r4722, %r8172, 2;
+ or.b32 %r4723, %r4721, %r4722;
+ shr.u32 %r4724, %r4723, 31;
+ shr.u32 %r4725, %r8172, 30;
+ add.s32 %r4726, %r4724, %r4725;
+ neg.s32 %r4727, %r4726;
+ setp.eq.s32 %p678, %r4720, 0;
+ selp.b32 %r8174, %r4726, %r4727, %p678;
+ setp.ne.s32 %p679, %r4724, 0;
+ xor.b32 %r4728, %r4720, -2147483648;
+ selp.b32 %r4729, %r4728, %r4720, %p679;
+ selp.b32 %r4730, -1, 0, %p679;
+ xor.b32 %r4731, %r4723, %r4730;
+ shl.b32 %r4732, %r8173, 2;
+ xor.b32 %r4733, %r4732, %r4730;
+ cvt.u64.u32 %rd1431, %r4731;
+ cvt.u64.u32 %rd1432, %r4733;
+ bfi.b64 %rd1433, %rd1431, %rd1432, 32, 32;
+ cvt.rn.f64.s64 %fd103, %rd1433;
+ mul.f64 %fd104, %fd103, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3425, %fd104;
+ setp.eq.s32 %p680, %r4729, 0;
+ neg.f32 %f3426, %f3425;
+ selp.f32 %f5553, %f3425, %f3426, %p680;
+
+$L__BB0_784:
+ add.s32 %r1027, %r8174, 1;
+ and.b32 %r1028, %r1027, 1;
+ setp.eq.s32 %p681, %r1028, 0;
+ selp.f32 %f913, %f5553, 0f3F800000, %p681;
+ mul.rn.f32 %f914, %f5553, %f5553;
+ mov.f32 %f5554, 0fB94D4153;
+ @%p681 bra $L__BB0_786;
+
+ mov.f32 %f3429, 0fBAB607ED;
+ mov.f32 %f3430, 0f37CBAC00;
+ fma.rn.f32 %f5554, %f3430, %f914, %f3429;
+
+$L__BB0_786:
+ selp.f32 %f3431, 0f3C0885E4, 0f3D2AAABB, %p681;
+ fma.rn.f32 %f3432, %f5554, %f914, %f3431;
+ selp.f32 %f3433, 0fBE2AAAA8, 0fBEFFFFFF, %p681;
+ fma.rn.f32 %f3434, %f3432, %f914, %f3433;
+ mov.f32 %f3435, 0f00000000;
+ fma.rn.f32 %f3436, %f914, %f913, %f3435;
+ fma.rn.f32 %f5555, %f3434, %f3436, %f913;
+ and.b32 %r4735, %r1027, 2;
+ setp.eq.s32 %p683, %r4735, 0;
+ @%p683 bra $L__BB0_788;
+
+ mov.f32 %f3438, 0fBF800000;
+ fma.rn.f32 %f5555, %f5555, %f3438, %f3435;
+
+$L__BB0_788:
+ add.f32 %f5598, %f5552, %f5555;
+ mul.f32 %f3439, %f865, 0f3F22F983;
+ cvt.rni.s32.f32 %r8178, %f3439;
+ cvt.rn.f32.s32 %f3440, %r8178;
+ mov.f32 %f3441, 0fBFC90FDA;
+ fma.rn.f32 %f3442, %f3440, %f3441, %f865;
+ mov.f32 %f3443, 0fB3A22168;
+ fma.rn.f32 %f3444, %f3440, %f3443, %f3442;
+ mov.f32 %f3445, 0fA7C234C5;
+ fma.rn.f32 %f5556, %f3440, %f3445, %f3444;
+ abs.f32 %f922, %f865;
+ setp.ltu.f32 %p684, %f922, 0f47CE4780;
+ @%p684 bra $L__BB0_796;
+
+ setp.eq.f32 %p685, %f922, 0f7F800000;
+ @%p685 bra $L__BB0_795;
+ bra.uni $L__BB0_790;
+
+$L__BB0_795:
+ mov.f32 %f3448, 0f00000000;
+ mul.rn.f32 %f5556, %f865, %f3448;
+ mov.u32 %r8178, 0;
+ bra.uni $L__BB0_796;
+
+$L__BB0_790:
+ mov.b32 %r1030, %f865;
+ shr.u32 %r4737, %r1030, 23;
+ and.b32 %r4738, %r4737, 255;
+ add.s32 %r1031, %r4738, -128;
+ shl.b32 %r4739, %r1030, 8;
+ or.b32 %r1032, %r4739, -2147483648;
+ shr.u32 %r1033, %r1031, 5;
+ mov.u64 %rd2609, 0;
+ mov.u32 %r8175, 0;
+ mov.u64 %rd1437, __cudart_i2opi_f;
+ mov.u64 %rd2610, %rd2609;
+
+$L__BB0_791:
+ .pragma "nounroll";
+ shl.b64 %rd1436, %rd2609, 2;
+ add.s64 %rd1438, %rd1437, %rd1436;
+ ld.global.nc.u32 %r4740, [%rd1438];
+ mad.wide.u32 %rd1439, %r4740, %r1032, %rd2610;
+ shr.u64 %rd2610, %rd1439, 32;
+ add.s64 %rd1440, %rd1, %rd1436;
+ st.local.u32 [%rd1440], %rd1439;
+ add.s32 %r8175, %r8175, 1;
+ cvt.s64.s32 %rd2609, %r8175;
+ setp.ne.s32 %p686, %r8175, 6;
+ @%p686 bra $L__BB0_791;
+
+ st.local.u32 [%rd4], %rd2610;
+ mov.u32 %r4741, 4;
+ sub.s32 %r1036, %r4741, %r1033;
+ mov.u32 %r4742, 6;
+ sub.s32 %r4743, %r4742, %r1033;
+ mul.wide.s32 %rd1441, %r4743, 4;
+ add.s64 %rd1442, %rd1, %rd1441;
+ ld.local.u32 %r8176, [%rd1442];
+ ld.local.u32 %r8177, [%rd1442+-4];
+ and.b32 %r1039, %r1031, 31;
+ setp.eq.s32 %p687, %r1039, 0;
+ @%p687 bra $L__BB0_794;
+
+ mov.u32 %r4744, 32;
+ sub.s32 %r4745, %r4744, %r1039;
+ shr.u32 %r4746, %r8177, %r4745;
+ shl.b32 %r4747, %r8176, %r1039;
+ add.s32 %r8176, %r4746, %r4747;
+ mul.wide.s32 %rd1443, %r1036, 4;
+ add.s64 %rd1444, %rd1, %rd1443;
+ ld.local.u32 %r4748, [%rd1444];
+ shr.u32 %r4749, %r4748, %r4745;
+ shl.b32 %r4750, %r8177, %r1039;
+ add.s32 %r8177, %r4749, %r4750;
+
+$L__BB0_794:
+ and.b32 %r4751, %r1030, -2147483648;
+ shr.u32 %r4752, %r8177, 30;
+ shl.b32 %r4753, %r8176, 2;
+ or.b32 %r4754, %r4752, %r4753;
+ shr.u32 %r4755, %r4754, 31;
+ shr.u32 %r4756, %r8176, 30;
+ add.s32 %r4757, %r4755, %r4756;
+ neg.s32 %r4758, %r4757;
+ setp.eq.s32 %p688, %r4751, 0;
+ selp.b32 %r8178, %r4757, %r4758, %p688;
+ setp.ne.s32 %p689, %r4755, 0;
+ xor.b32 %r4759, %r4751, -2147483648;
+ selp.b32 %r4760, %r4759, %r4751, %p689;
+ selp.b32 %r4761, -1, 0, %p689;
+ xor.b32 %r4762, %r4754, %r4761;
+ shl.b32 %r4763, %r8177, 2;
+ xor.b32 %r4764, %r4763, %r4761;
+ cvt.u64.u32 %rd1445, %r4762;
+ cvt.u64.u32 %rd1446, %r4764;
+ bfi.b64 %rd1447, %rd1445, %rd1446, 32, 32;
+ cvt.rn.f64.s64 %fd105, %rd1447;
+ mul.f64 %fd106, %fd105, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3446, %fd106;
+ setp.eq.s32 %p690, %r4760, 0;
+ neg.f32 %f3447, %f3446;
+ selp.f32 %f5556, %f3446, %f3447, %p690;
+
+$L__BB0_796:
+ and.b32 %r1046, %r8178, 1;
+ setp.eq.s32 %p691, %r1046, 0;
+ selp.f32 %f926, %f5556, 0f3F800000, %p691;
+ mul.rn.f32 %f927, %f5556, %f5556;
+ mov.f32 %f5557, 0fB94D4153;
+ @%p691 bra $L__BB0_798;
+
+ mov.f32 %f3450, 0fBAB607ED;
+ mov.f32 %f3451, 0f37CBAC00;
+ fma.rn.f32 %f5557, %f3451, %f927, %f3450;
+
+$L__BB0_798:
+ selp.f32 %f3452, 0f3C0885E4, 0f3D2AAABB, %p691;
+ fma.rn.f32 %f3453, %f5557, %f927, %f3452;
+ selp.f32 %f3454, 0fBE2AAAA8, 0fBEFFFFFF, %p691;
+ fma.rn.f32 %f3455, %f3453, %f927, %f3454;
+ mov.f32 %f3456, 0f00000000;
+ fma.rn.f32 %f3457, %f927, %f926, %f3456;
+ fma.rn.f32 %f5558, %f3455, %f3457, %f926;
+ and.b32 %r4766, %r8178, 2;
+ setp.eq.s32 %p693, %r4766, 0;
+ @%p693 bra $L__BB0_800;
+
+ mov.f32 %f3459, 0fBF800000;
+ fma.rn.f32 %f5558, %f5558, %f3459, %f3456;
+
+$L__BB0_800:
+ mul.f32 %f3460, %f857, 0f3F22F983;
+ cvt.rni.s32.f32 %r8182, %f3460;
+ cvt.rn.f32.s32 %f3461, %r8182;
+ mov.f32 %f3462, 0fBFC90FDA;
+ fma.rn.f32 %f3463, %f3461, %f3462, %f857;
+ mov.f32 %f3464, 0fB3A22168;
+ fma.rn.f32 %f3465, %f3461, %f3464, %f3463;
+ mov.f32 %f3466, 0fA7C234C5;
+ fma.rn.f32 %f5559, %f3461, %f3466, %f3465;
+ abs.f32 %f934, %f857;
+ setp.ltu.f32 %p694, %f934, 0f47CE4780;
+ @%p694 bra $L__BB0_808;
+
+ setp.eq.f32 %p695, %f934, 0f7F800000;
+ @%p695 bra $L__BB0_807;
+ bra.uni $L__BB0_802;
+
+$L__BB0_807:
+ mov.f32 %f3469, 0f00000000;
+ mul.rn.f32 %f5559, %f857, %f3469;
+ mov.u32 %r8182, 0;
+ bra.uni $L__BB0_808;
+
+$L__BB0_802:
+ mov.b32 %r1048, %f857;
+ shr.u32 %r4768, %r1048, 23;
+ and.b32 %r4769, %r4768, 255;
+ add.s32 %r1049, %r4769, -128;
+ shl.b32 %r4770, %r1048, 8;
+ or.b32 %r1050, %r4770, -2147483648;
+ shr.u32 %r1051, %r1049, 5;
+ mov.u64 %rd2611, 0;
+ mov.u32 %r8179, 0;
+ mov.u64 %rd1451, __cudart_i2opi_f;
+ mov.u64 %rd2612, %rd2611;
+
+$L__BB0_803:
+ .pragma "nounroll";
+ shl.b64 %rd1450, %rd2611, 2;
+ add.s64 %rd1452, %rd1451, %rd1450;
+ ld.global.nc.u32 %r4771, [%rd1452];
+ mad.wide.u32 %rd1453, %r4771, %r1050, %rd2612;
+ shr.u64 %rd2612, %rd1453, 32;
+ add.s64 %rd1454, %rd1, %rd1450;
+ st.local.u32 [%rd1454], %rd1453;
+ add.s32 %r8179, %r8179, 1;
+ cvt.s64.s32 %rd2611, %r8179;
+ setp.ne.s32 %p696, %r8179, 6;
+ @%p696 bra $L__BB0_803;
+
+ st.local.u32 [%rd4], %rd2612;
+ mov.u32 %r4772, 4;
+ sub.s32 %r1054, %r4772, %r1051;
+ mov.u32 %r4773, 6;
+ sub.s32 %r4774, %r4773, %r1051;
+ mul.wide.s32 %rd1455, %r4774, 4;
+ add.s64 %rd1456, %rd1, %rd1455;
+ ld.local.u32 %r8180, [%rd1456];
+ ld.local.u32 %r8181, [%rd1456+-4];
+ and.b32 %r1057, %r1049, 31;
+ setp.eq.s32 %p697, %r1057, 0;
+ @%p697 bra $L__BB0_806;
+
+ mov.u32 %r4775, 32;
+ sub.s32 %r4776, %r4775, %r1057;
+ shr.u32 %r4777, %r8181, %r4776;
+ shl.b32 %r4778, %r8180, %r1057;
+ add.s32 %r8180, %r4777, %r4778;
+ mul.wide.s32 %rd1457, %r1054, 4;
+ add.s64 %rd1458, %rd1, %rd1457;
+ ld.local.u32 %r4779, [%rd1458];
+ shr.u32 %r4780, %r4779, %r4776;
+ shl.b32 %r4781, %r8181, %r1057;
+ add.s32 %r8181, %r4780, %r4781;
+
+$L__BB0_806:
+ and.b32 %r4782, %r1048, -2147483648;
+ shr.u32 %r4783, %r8181, 30;
+ shl.b32 %r4784, %r8180, 2;
+ or.b32 %r4785, %r4783, %r4784;
+ shr.u32 %r4786, %r4785, 31;
+ shr.u32 %r4787, %r8180, 30;
+ add.s32 %r4788, %r4786, %r4787;
+ neg.s32 %r4789, %r4788;
+ setp.eq.s32 %p698, %r4782, 0;
+ selp.b32 %r8182, %r4788, %r4789, %p698;
+ setp.ne.s32 %p699, %r4786, 0;
+ xor.b32 %r4790, %r4782, -2147483648;
+ selp.b32 %r4791, %r4790, %r4782, %p699;
+ selp.b32 %r4792, -1, 0, %p699;
+ xor.b32 %r4793, %r4785, %r4792;
+ shl.b32 %r4794, %r8181, 2;
+ xor.b32 %r4795, %r4794, %r4792;
+ cvt.u64.u32 %rd1459, %r4793;
+ cvt.u64.u32 %rd1460, %r4795;
+ bfi.b64 %rd1461, %rd1459, %rd1460, 32, 32;
+ cvt.rn.f64.s64 %fd107, %rd1461;
+ mul.f64 %fd108, %fd107, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3467, %fd108;
+ setp.eq.s32 %p700, %r4791, 0;
+ neg.f32 %f3468, %f3467;
+ selp.f32 %f5559, %f3467, %f3468, %p700;
+
+$L__BB0_808:
+ add.s32 %r1064, %r8182, 1;
+ and.b32 %r1065, %r1064, 1;
+ setp.eq.s32 %p701, %r1065, 0;
+ selp.f32 %f938, %f5559, 0f3F800000, %p701;
+ mul.rn.f32 %f939, %f5559, %f5559;
+ mov.f32 %f5560, 0fB94D4153;
+ @%p701 bra $L__BB0_810;
+
+ mov.f32 %f3471, 0fBAB607ED;
+ mov.f32 %f3472, 0f37CBAC00;
+ fma.rn.f32 %f5560, %f3472, %f939, %f3471;
+
+$L__BB0_810:
+ selp.f32 %f3473, 0f3C0885E4, 0f3D2AAABB, %p701;
+ fma.rn.f32 %f3474, %f5560, %f939, %f3473;
+ selp.f32 %f3475, 0fBE2AAAA8, 0fBEFFFFFF, %p701;
+ fma.rn.f32 %f3476, %f3474, %f939, %f3475;
+ mov.f32 %f3477, 0f00000000;
+ fma.rn.f32 %f3478, %f939, %f938, %f3477;
+ fma.rn.f32 %f5561, %f3476, %f3478, %f938;
+ and.b32 %r4797, %r1064, 2;
+ setp.eq.s32 %p703, %r4797, 0;
+ @%p703 bra $L__BB0_812;
+
+ mov.f32 %f3480, 0fBF800000;
+ fma.rn.f32 %f5561, %f5561, %f3480, %f3477;
+
+$L__BB0_812:
+ add.f32 %f5597, %f5558, %f5561;
+ mul.f32 %f3481, %f866, 0f3F22F983;
+ cvt.rni.s32.f32 %r8186, %f3481;
+ cvt.rn.f32.s32 %f3482, %r8186;
+ mov.f32 %f3483, 0fBFC90FDA;
+ fma.rn.f32 %f3484, %f3482, %f3483, %f866;
+ mov.f32 %f3485, 0fB3A22168;
+ fma.rn.f32 %f3486, %f3482, %f3485, %f3484;
+ mov.f32 %f3487, 0fA7C234C5;
+ fma.rn.f32 %f5562, %f3482, %f3487, %f3486;
+ abs.f32 %f947, %f866;
+ setp.ltu.f32 %p704, %f947, 0f47CE4780;
+ @%p704 bra $L__BB0_820;
+
+ setp.eq.f32 %p705, %f947, 0f7F800000;
+ @%p705 bra $L__BB0_819;
+ bra.uni $L__BB0_814;
+
+$L__BB0_819:
+ mov.f32 %f3490, 0f00000000;
+ mul.rn.f32 %f5562, %f866, %f3490;
+ mov.u32 %r8186, 0;
+ bra.uni $L__BB0_820;
+
+$L__BB0_814:
+ mov.b32 %r1067, %f866;
+ shr.u32 %r4799, %r1067, 23;
+ and.b32 %r4800, %r4799, 255;
+ add.s32 %r1068, %r4800, -128;
+ shl.b32 %r4801, %r1067, 8;
+ or.b32 %r1069, %r4801, -2147483648;
+ shr.u32 %r1070, %r1068, 5;
+ mov.u64 %rd2613, 0;
+ mov.u32 %r8183, 0;
+ mov.u64 %rd1465, __cudart_i2opi_f;
+ mov.u64 %rd2614, %rd2613;
+
+$L__BB0_815:
+ .pragma "nounroll";
+ shl.b64 %rd1464, %rd2613, 2;
+ add.s64 %rd1466, %rd1465, %rd1464;
+ ld.global.nc.u32 %r4802, [%rd1466];
+ mad.wide.u32 %rd1467, %r4802, %r1069, %rd2614;
+ shr.u64 %rd2614, %rd1467, 32;
+ add.s64 %rd1468, %rd1, %rd1464;
+ st.local.u32 [%rd1468], %rd1467;
+ add.s32 %r8183, %r8183, 1;
+ cvt.s64.s32 %rd2613, %r8183;
+ setp.ne.s32 %p706, %r8183, 6;
+ @%p706 bra $L__BB0_815;
+
+ st.local.u32 [%rd4], %rd2614;
+ mov.u32 %r4803, 4;
+ sub.s32 %r1073, %r4803, %r1070;
+ mov.u32 %r4804, 6;
+ sub.s32 %r4805, %r4804, %r1070;
+ mul.wide.s32 %rd1469, %r4805, 4;
+ add.s64 %rd1470, %rd1, %rd1469;
+ ld.local.u32 %r8184, [%rd1470];
+ ld.local.u32 %r8185, [%rd1470+-4];
+ and.b32 %r1076, %r1068, 31;
+ setp.eq.s32 %p707, %r1076, 0;
+ @%p707 bra $L__BB0_818;
+
+ mov.u32 %r4806, 32;
+ sub.s32 %r4807, %r4806, %r1076;
+ shr.u32 %r4808, %r8185, %r4807;
+ shl.b32 %r4809, %r8184, %r1076;
+ add.s32 %r8184, %r4808, %r4809;
+ mul.wide.s32 %rd1471, %r1073, 4;
+ add.s64 %rd1472, %rd1, %rd1471;
+ ld.local.u32 %r4810, [%rd1472];
+ shr.u32 %r4811, %r4810, %r4807;
+ shl.b32 %r4812, %r8185, %r1076;
+ add.s32 %r8185, %r4811, %r4812;
+
+$L__BB0_818:
+ and.b32 %r4813, %r1067, -2147483648;
+ shr.u32 %r4814, %r8185, 30;
+ shl.b32 %r4815, %r8184, 2;
+ or.b32 %r4816, %r4814, %r4815;
+ shr.u32 %r4817, %r4816, 31;
+ shr.u32 %r4818, %r8184, 30;
+ add.s32 %r4819, %r4817, %r4818;
+ neg.s32 %r4820, %r4819;
+ setp.eq.s32 %p708, %r4813, 0;
+ selp.b32 %r8186, %r4819, %r4820, %p708;
+ setp.ne.s32 %p709, %r4817, 0;
+ xor.b32 %r4821, %r4813, -2147483648;
+ selp.b32 %r4822, %r4821, %r4813, %p709;
+ selp.b32 %r4823, -1, 0, %p709;
+ xor.b32 %r4824, %r4816, %r4823;
+ shl.b32 %r4825, %r8185, 2;
+ xor.b32 %r4826, %r4825, %r4823;
+ cvt.u64.u32 %rd1473, %r4824;
+ cvt.u64.u32 %rd1474, %r4826;
+ bfi.b64 %rd1475, %rd1473, %rd1474, 32, 32;
+ cvt.rn.f64.s64 %fd109, %rd1475;
+ mul.f64 %fd110, %fd109, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3488, %fd110;
+ setp.eq.s32 %p710, %r4822, 0;
+ neg.f32 %f3489, %f3488;
+ selp.f32 %f5562, %f3488, %f3489, %p710;
+
+$L__BB0_820:
+ and.b32 %r1083, %r8186, 1;
+ setp.eq.s32 %p711, %r1083, 0;
+ selp.f32 %f951, %f5562, 0f3F800000, %p711;
+ mul.rn.f32 %f952, %f5562, %f5562;
+ mov.f32 %f5563, 0fB94D4153;
+ @%p711 bra $L__BB0_822;
+
+ mov.f32 %f3492, 0fBAB607ED;
+ mov.f32 %f3493, 0f37CBAC00;
+ fma.rn.f32 %f5563, %f3493, %f952, %f3492;
+
+$L__BB0_822:
+ selp.f32 %f3494, 0f3C0885E4, 0f3D2AAABB, %p711;
+ fma.rn.f32 %f3495, %f5563, %f952, %f3494;
+ selp.f32 %f3496, 0fBE2AAAA8, 0fBEFFFFFF, %p711;
+ fma.rn.f32 %f3497, %f3495, %f952, %f3496;
+ mov.f32 %f3498, 0f00000000;
+ fma.rn.f32 %f3499, %f952, %f951, %f3498;
+ fma.rn.f32 %f5564, %f3497, %f3499, %f951;
+ and.b32 %r4828, %r8186, 2;
+ setp.eq.s32 %p713, %r4828, 0;
+ @%p713 bra $L__BB0_824;
+
+ mov.f32 %f3501, 0fBF800000;
+ fma.rn.f32 %f5564, %f5564, %f3501, %f3498;
+
+$L__BB0_824:
+ mul.f32 %f3502, %f858, 0f3F22F983;
+ cvt.rni.s32.f32 %r8190, %f3502;
+ cvt.rn.f32.s32 %f3503, %r8190;
+ mov.f32 %f3504, 0fBFC90FDA;
+ fma.rn.f32 %f3505, %f3503, %f3504, %f858;
+ mov.f32 %f3506, 0fB3A22168;
+ fma.rn.f32 %f3507, %f3503, %f3506, %f3505;
+ mov.f32 %f3508, 0fA7C234C5;
+ fma.rn.f32 %f5565, %f3503, %f3508, %f3507;
+ abs.f32 %f959, %f858;
+ setp.ltu.f32 %p714, %f959, 0f47CE4780;
+ @%p714 bra $L__BB0_832;
+
+ setp.eq.f32 %p715, %f959, 0f7F800000;
+ @%p715 bra $L__BB0_831;
+ bra.uni $L__BB0_826;
+
+$L__BB0_831:
+ mov.f32 %f3511, 0f00000000;
+ mul.rn.f32 %f5565, %f858, %f3511;
+ mov.u32 %r8190, 0;
+ bra.uni $L__BB0_832;
+
+$L__BB0_826:
+ mov.b32 %r1085, %f858;
+ shr.u32 %r4830, %r1085, 23;
+ and.b32 %r4831, %r4830, 255;
+ add.s32 %r1086, %r4831, -128;
+ shl.b32 %r4832, %r1085, 8;
+ or.b32 %r1087, %r4832, -2147483648;
+ shr.u32 %r1088, %r1086, 5;
+ mov.u64 %rd2615, 0;
+ mov.u32 %r8187, 0;
+ mov.u64 %rd1479, __cudart_i2opi_f;
+ mov.u64 %rd2616, %rd2615;
+
+$L__BB0_827:
+ .pragma "nounroll";
+ shl.b64 %rd1478, %rd2615, 2;
+ add.s64 %rd1480, %rd1479, %rd1478;
+ ld.global.nc.u32 %r4833, [%rd1480];
+ mad.wide.u32 %rd1481, %r4833, %r1087, %rd2616;
+ shr.u64 %rd2616, %rd1481, 32;
+ add.s64 %rd1482, %rd1, %rd1478;
+ st.local.u32 [%rd1482], %rd1481;
+ add.s32 %r8187, %r8187, 1;
+ cvt.s64.s32 %rd2615, %r8187;
+ setp.ne.s32 %p716, %r8187, 6;
+ @%p716 bra $L__BB0_827;
+
+ st.local.u32 [%rd4], %rd2616;
+ mov.u32 %r4834, 4;
+ sub.s32 %r1091, %r4834, %r1088;
+ mov.u32 %r4835, 6;
+ sub.s32 %r4836, %r4835, %r1088;
+ mul.wide.s32 %rd1483, %r4836, 4;
+ add.s64 %rd1484, %rd1, %rd1483;
+ ld.local.u32 %r8188, [%rd1484];
+ ld.local.u32 %r8189, [%rd1484+-4];
+ and.b32 %r1094, %r1086, 31;
+ setp.eq.s32 %p717, %r1094, 0;
+ @%p717 bra $L__BB0_830;
+
+ mov.u32 %r4837, 32;
+ sub.s32 %r4838, %r4837, %r1094;
+ shr.u32 %r4839, %r8189, %r4838;
+ shl.b32 %r4840, %r8188, %r1094;
+ add.s32 %r8188, %r4839, %r4840;
+ mul.wide.s32 %rd1485, %r1091, 4;
+ add.s64 %rd1486, %rd1, %rd1485;
+ ld.local.u32 %r4841, [%rd1486];
+ shr.u32 %r4842, %r4841, %r4838;
+ shl.b32 %r4843, %r8189, %r1094;
+ add.s32 %r8189, %r4842, %r4843;
+
+$L__BB0_830:
+ and.b32 %r4844, %r1085, -2147483648;
+ shr.u32 %r4845, %r8189, 30;
+ shl.b32 %r4846, %r8188, 2;
+ or.b32 %r4847, %r4845, %r4846;
+ shr.u32 %r4848, %r4847, 31;
+ shr.u32 %r4849, %r8188, 30;
+ add.s32 %r4850, %r4848, %r4849;
+ neg.s32 %r4851, %r4850;
+ setp.eq.s32 %p718, %r4844, 0;
+ selp.b32 %r8190, %r4850, %r4851, %p718;
+ setp.ne.s32 %p719, %r4848, 0;
+ xor.b32 %r4852, %r4844, -2147483648;
+ selp.b32 %r4853, %r4852, %r4844, %p719;
+ selp.b32 %r4854, -1, 0, %p719;
+ xor.b32 %r4855, %r4847, %r4854;
+ shl.b32 %r4856, %r8189, 2;
+ xor.b32 %r4857, %r4856, %r4854;
+ cvt.u64.u32 %rd1487, %r4855;
+ cvt.u64.u32 %rd1488, %r4857;
+ bfi.b64 %rd1489, %rd1487, %rd1488, 32, 32;
+ cvt.rn.f64.s64 %fd111, %rd1489;
+ mul.f64 %fd112, %fd111, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3509, %fd112;
+ setp.eq.s32 %p720, %r4853, 0;
+ neg.f32 %f3510, %f3509;
+ selp.f32 %f5565, %f3509, %f3510, %p720;
+
+$L__BB0_832:
+ add.s32 %r1101, %r8190, 1;
+ and.b32 %r1102, %r1101, 1;
+ setp.eq.s32 %p721, %r1102, 0;
+ selp.f32 %f963, %f5565, 0f3F800000, %p721;
+ mul.rn.f32 %f964, %f5565, %f5565;
+ mov.f32 %f5566, 0fB94D4153;
+ @%p721 bra $L__BB0_834;
+
+ mov.f32 %f3513, 0fBAB607ED;
+ mov.f32 %f3514, 0f37CBAC00;
+ fma.rn.f32 %f5566, %f3514, %f964, %f3513;
+
+$L__BB0_834:
+ selp.f32 %f3515, 0f3C0885E4, 0f3D2AAABB, %p721;
+ fma.rn.f32 %f3516, %f5566, %f964, %f3515;
+ selp.f32 %f3517, 0fBE2AAAA8, 0fBEFFFFFF, %p721;
+ fma.rn.f32 %f3518, %f3516, %f964, %f3517;
+ mov.f32 %f3519, 0f00000000;
+ fma.rn.f32 %f3520, %f964, %f963, %f3519;
+ fma.rn.f32 %f5567, %f3518, %f3520, %f963;
+ and.b32 %r4859, %r1101, 2;
+ setp.eq.s32 %p723, %r4859, 0;
+ @%p723 bra $L__BB0_836;
+
+ mov.f32 %f3522, 0fBF800000;
+ fma.rn.f32 %f5567, %f5567, %f3522, %f3519;
+
+$L__BB0_836:
+ add.f32 %f5596, %f5564, %f5567;
+ mul.f32 %f3523, %f867, 0f3F22F983;
+ cvt.rni.s32.f32 %r8194, %f3523;
+ cvt.rn.f32.s32 %f3524, %r8194;
+ mov.f32 %f3525, 0fBFC90FDA;
+ fma.rn.f32 %f3526, %f3524, %f3525, %f867;
+ mov.f32 %f3527, 0fB3A22168;
+ fma.rn.f32 %f3528, %f3524, %f3527, %f3526;
+ mov.f32 %f3529, 0fA7C234C5;
+ fma.rn.f32 %f5568, %f3524, %f3529, %f3528;
+ abs.f32 %f972, %f867;
+ setp.ltu.f32 %p724, %f972, 0f47CE4780;
+ @%p724 bra $L__BB0_844;
+
+ setp.eq.f32 %p725, %f972, 0f7F800000;
+ @%p725 bra $L__BB0_843;
+ bra.uni $L__BB0_838;
+
+$L__BB0_843:
+ mov.f32 %f3532, 0f00000000;
+ mul.rn.f32 %f5568, %f867, %f3532;
+ mov.u32 %r8194, 0;
+ bra.uni $L__BB0_844;
+
+$L__BB0_838:
+ mov.b32 %r1104, %f867;
+ shr.u32 %r4861, %r1104, 23;
+ and.b32 %r4862, %r4861, 255;
+ add.s32 %r1105, %r4862, -128;
+ shl.b32 %r4863, %r1104, 8;
+ or.b32 %r1106, %r4863, -2147483648;
+ shr.u32 %r1107, %r1105, 5;
+ mov.u64 %rd2617, 0;
+ mov.u32 %r8191, 0;
+ mov.u64 %rd1493, __cudart_i2opi_f;
+ mov.u64 %rd2618, %rd2617;
+
+$L__BB0_839:
+ .pragma "nounroll";
+ shl.b64 %rd1492, %rd2617, 2;
+ add.s64 %rd1494, %rd1493, %rd1492;
+ ld.global.nc.u32 %r4864, [%rd1494];
+ mad.wide.u32 %rd1495, %r4864, %r1106, %rd2618;
+ shr.u64 %rd2618, %rd1495, 32;
+ add.s64 %rd1496, %rd1, %rd1492;
+ st.local.u32 [%rd1496], %rd1495;
+ add.s32 %r8191, %r8191, 1;
+ cvt.s64.s32 %rd2617, %r8191;
+ setp.ne.s32 %p726, %r8191, 6;
+ @%p726 bra $L__BB0_839;
+
+ st.local.u32 [%rd4], %rd2618;
+ mov.u32 %r4865, 4;
+ sub.s32 %r1110, %r4865, %r1107;
+ mov.u32 %r4866, 6;
+ sub.s32 %r4867, %r4866, %r1107;
+ mul.wide.s32 %rd1497, %r4867, 4;
+ add.s64 %rd1498, %rd1, %rd1497;
+ ld.local.u32 %r8192, [%rd1498];
+ ld.local.u32 %r8193, [%rd1498+-4];
+ and.b32 %r1113, %r1105, 31;
+ setp.eq.s32 %p727, %r1113, 0;
+ @%p727 bra $L__BB0_842;
+
+ mov.u32 %r4868, 32;
+ sub.s32 %r4869, %r4868, %r1113;
+ shr.u32 %r4870, %r8193, %r4869;
+ shl.b32 %r4871, %r8192, %r1113;
+ add.s32 %r8192, %r4870, %r4871;
+ mul.wide.s32 %rd1499, %r1110, 4;
+ add.s64 %rd1500, %rd1, %rd1499;
+ ld.local.u32 %r4872, [%rd1500];
+ shr.u32 %r4873, %r4872, %r4869;
+ shl.b32 %r4874, %r8193, %r1113;
+ add.s32 %r8193, %r4873, %r4874;
+
+$L__BB0_842:
+ and.b32 %r4875, %r1104, -2147483648;
+ shr.u32 %r4876, %r8193, 30;
+ shl.b32 %r4877, %r8192, 2;
+ or.b32 %r4878, %r4876, %r4877;
+ shr.u32 %r4879, %r4878, 31;
+ shr.u32 %r4880, %r8192, 30;
+ add.s32 %r4881, %r4879, %r4880;
+ neg.s32 %r4882, %r4881;
+ setp.eq.s32 %p728, %r4875, 0;
+ selp.b32 %r8194, %r4881, %r4882, %p728;
+ setp.ne.s32 %p729, %r4879, 0;
+ xor.b32 %r4883, %r4875, -2147483648;
+ selp.b32 %r4884, %r4883, %r4875, %p729;
+ selp.b32 %r4885, -1, 0, %p729;
+ xor.b32 %r4886, %r4878, %r4885;
+ shl.b32 %r4887, %r8193, 2;
+ xor.b32 %r4888, %r4887, %r4885;
+ cvt.u64.u32 %rd1501, %r4886;
+ cvt.u64.u32 %rd1502, %r4888;
+ bfi.b64 %rd1503, %rd1501, %rd1502, 32, 32;
+ cvt.rn.f64.s64 %fd113, %rd1503;
+ mul.f64 %fd114, %fd113, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3530, %fd114;
+ setp.eq.s32 %p730, %r4884, 0;
+ neg.f32 %f3531, %f3530;
+ selp.f32 %f5568, %f3530, %f3531, %p730;
+
+$L__BB0_844:
+ and.b32 %r1120, %r8194, 1;
+ setp.eq.s32 %p731, %r1120, 0;
+ selp.f32 %f976, %f5568, 0f3F800000, %p731;
+ mul.rn.f32 %f977, %f5568, %f5568;
+ mov.f32 %f5569, 0fB94D4153;
+ @%p731 bra $L__BB0_846;
+
+ mov.f32 %f3534, 0fBAB607ED;
+ mov.f32 %f3535, 0f37CBAC00;
+ fma.rn.f32 %f5569, %f3535, %f977, %f3534;
+
+$L__BB0_846:
+ selp.f32 %f3536, 0f3C0885E4, 0f3D2AAABB, %p731;
+ fma.rn.f32 %f3537, %f5569, %f977, %f3536;
+ selp.f32 %f3538, 0fBE2AAAA8, 0fBEFFFFFF, %p731;
+ fma.rn.f32 %f3539, %f3537, %f977, %f3538;
+ mov.f32 %f3540, 0f00000000;
+ fma.rn.f32 %f3541, %f977, %f976, %f3540;
+ fma.rn.f32 %f5570, %f3539, %f3541, %f976;
+ and.b32 %r4890, %r8194, 2;
+ setp.eq.s32 %p733, %r4890, 0;
+ @%p733 bra $L__BB0_848;
+
+ mov.f32 %f3543, 0fBF800000;
+ fma.rn.f32 %f5570, %f5570, %f3543, %f3540;
+
+$L__BB0_848:
+ mul.f32 %f3544, %f859, 0f3F22F983;
+ cvt.rni.s32.f32 %r8198, %f3544;
+ cvt.rn.f32.s32 %f3545, %r8198;
+ mov.f32 %f3546, 0fBFC90FDA;
+ fma.rn.f32 %f3547, %f3545, %f3546, %f859;
+ mov.f32 %f3548, 0fB3A22168;
+ fma.rn.f32 %f3549, %f3545, %f3548, %f3547;
+ mov.f32 %f3550, 0fA7C234C5;
+ fma.rn.f32 %f5571, %f3545, %f3550, %f3549;
+ abs.f32 %f984, %f859;
+ setp.ltu.f32 %p734, %f984, 0f47CE4780;
+ @%p734 bra $L__BB0_856;
+
+ setp.eq.f32 %p735, %f984, 0f7F800000;
+ @%p735 bra $L__BB0_855;
+ bra.uni $L__BB0_850;
+
+$L__BB0_855:
+ mov.f32 %f3553, 0f00000000;
+ mul.rn.f32 %f5571, %f859, %f3553;
+ mov.u32 %r8198, 0;
+ bra.uni $L__BB0_856;
+
+$L__BB0_850:
+ mov.b32 %r1122, %f859;
+ shr.u32 %r4892, %r1122, 23;
+ and.b32 %r4893, %r4892, 255;
+ add.s32 %r1123, %r4893, -128;
+ shl.b32 %r4894, %r1122, 8;
+ or.b32 %r1124, %r4894, -2147483648;
+ shr.u32 %r1125, %r1123, 5;
+ mov.u64 %rd2619, 0;
+ mov.u32 %r8195, 0;
+ mov.u64 %rd1507, __cudart_i2opi_f;
+ mov.u64 %rd2620, %rd2619;
+
+$L__BB0_851:
+ .pragma "nounroll";
+ shl.b64 %rd1506, %rd2619, 2;
+ add.s64 %rd1508, %rd1507, %rd1506;
+ ld.global.nc.u32 %r4895, [%rd1508];
+ mad.wide.u32 %rd1509, %r4895, %r1124, %rd2620;
+ shr.u64 %rd2620, %rd1509, 32;
+ add.s64 %rd1510, %rd1, %rd1506;
+ st.local.u32 [%rd1510], %rd1509;
+ add.s32 %r8195, %r8195, 1;
+ cvt.s64.s32 %rd2619, %r8195;
+ setp.ne.s32 %p736, %r8195, 6;
+ @%p736 bra $L__BB0_851;
+
+ st.local.u32 [%rd4], %rd2620;
+ mov.u32 %r4896, 4;
+ sub.s32 %r1128, %r4896, %r1125;
+ mov.u32 %r4897, 6;
+ sub.s32 %r4898, %r4897, %r1125;
+ mul.wide.s32 %rd1511, %r4898, 4;
+ add.s64 %rd1512, %rd1, %rd1511;
+ ld.local.u32 %r8196, [%rd1512];
+ ld.local.u32 %r8197, [%rd1512+-4];
+ and.b32 %r1131, %r1123, 31;
+ setp.eq.s32 %p737, %r1131, 0;
+ @%p737 bra $L__BB0_854;
+
+ mov.u32 %r4899, 32;
+ sub.s32 %r4900, %r4899, %r1131;
+ shr.u32 %r4901, %r8197, %r4900;
+ shl.b32 %r4902, %r8196, %r1131;
+ add.s32 %r8196, %r4901, %r4902;
+ mul.wide.s32 %rd1513, %r1128, 4;
+ add.s64 %rd1514, %rd1, %rd1513;
+ ld.local.u32 %r4903, [%rd1514];
+ shr.u32 %r4904, %r4903, %r4900;
+ shl.b32 %r4905, %r8197, %r1131;
+ add.s32 %r8197, %r4904, %r4905;
+
+$L__BB0_854:
+ and.b32 %r4906, %r1122, -2147483648;
+ shr.u32 %r4907, %r8197, 30;
+ shl.b32 %r4908, %r8196, 2;
+ or.b32 %r4909, %r4907, %r4908;
+ shr.u32 %r4910, %r4909, 31;
+ shr.u32 %r4911, %r8196, 30;
+ add.s32 %r4912, %r4910, %r4911;
+ neg.s32 %r4913, %r4912;
+ setp.eq.s32 %p738, %r4906, 0;
+ selp.b32 %r8198, %r4912, %r4913, %p738;
+ setp.ne.s32 %p739, %r4910, 0;
+ xor.b32 %r4914, %r4906, -2147483648;
+ selp.b32 %r4915, %r4914, %r4906, %p739;
+ selp.b32 %r4916, -1, 0, %p739;
+ xor.b32 %r4917, %r4909, %r4916;
+ shl.b32 %r4918, %r8197, 2;
+ xor.b32 %r4919, %r4918, %r4916;
+ cvt.u64.u32 %rd1515, %r4917;
+ cvt.u64.u32 %rd1516, %r4919;
+ bfi.b64 %rd1517, %rd1515, %rd1516, 32, 32;
+ cvt.rn.f64.s64 %fd115, %rd1517;
+ mul.f64 %fd116, %fd115, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3551, %fd116;
+ setp.eq.s32 %p740, %r4915, 0;
+ neg.f32 %f3552, %f3551;
+ selp.f32 %f5571, %f3551, %f3552, %p740;
+
+$L__BB0_856:
+ add.s32 %r1138, %r8198, 1;
+ and.b32 %r1139, %r1138, 1;
+ setp.eq.s32 %p741, %r1139, 0;
+ selp.f32 %f988, %f5571, 0f3F800000, %p741;
+ mul.rn.f32 %f989, %f5571, %f5571;
+ mov.f32 %f5572, 0fB94D4153;
+ @%p741 bra $L__BB0_858;
+
+ mov.f32 %f3555, 0fBAB607ED;
+ mov.f32 %f3556, 0f37CBAC00;
+ fma.rn.f32 %f5572, %f3556, %f989, %f3555;
+
+$L__BB0_858:
+ selp.f32 %f3557, 0f3C0885E4, 0f3D2AAABB, %p741;
+ fma.rn.f32 %f3558, %f5572, %f989, %f3557;
+ selp.f32 %f3559, 0fBE2AAAA8, 0fBEFFFFFF, %p741;
+ fma.rn.f32 %f3560, %f3558, %f989, %f3559;
+ mov.f32 %f3561, 0f00000000;
+ fma.rn.f32 %f3562, %f989, %f988, %f3561;
+ fma.rn.f32 %f5573, %f3560, %f3562, %f988;
+ and.b32 %r4921, %r1138, 2;
+ setp.eq.s32 %p743, %r4921, 0;
+ @%p743 bra $L__BB0_860;
+
+ mov.f32 %f3564, 0fBF800000;
+ fma.rn.f32 %f5573, %f5573, %f3564, %f3561;
+
+$L__BB0_860:
+ add.f32 %f5595, %f5570, %f5573;
+ mul.f32 %f3565, %f868, 0f3F22F983;
+ cvt.rni.s32.f32 %r8202, %f3565;
+ cvt.rn.f32.s32 %f3566, %r8202;
+ mov.f32 %f3567, 0fBFC90FDA;
+ fma.rn.f32 %f3568, %f3566, %f3567, %f868;
+ mov.f32 %f3569, 0fB3A22168;
+ fma.rn.f32 %f3570, %f3566, %f3569, %f3568;
+ mov.f32 %f3571, 0fA7C234C5;
+ fma.rn.f32 %f5574, %f3566, %f3571, %f3570;
+ abs.f32 %f997, %f868;
+ setp.ltu.f32 %p744, %f997, 0f47CE4780;
+ @%p744 bra $L__BB0_868;
+
+ setp.eq.f32 %p745, %f997, 0f7F800000;
+ @%p745 bra $L__BB0_867;
+ bra.uni $L__BB0_862;
+
+$L__BB0_867:
+ mov.f32 %f3574, 0f00000000;
+ mul.rn.f32 %f5574, %f868, %f3574;
+ mov.u32 %r8202, 0;
+ bra.uni $L__BB0_868;
+
+$L__BB0_862:
+ mov.b32 %r1141, %f868;
+ shr.u32 %r4923, %r1141, 23;
+ and.b32 %r4924, %r4923, 255;
+ add.s32 %r1142, %r4924, -128;
+ shl.b32 %r4925, %r1141, 8;
+ or.b32 %r1143, %r4925, -2147483648;
+ shr.u32 %r1144, %r1142, 5;
+ mov.u64 %rd2621, 0;
+ mov.u32 %r8199, 0;
+ mov.u64 %rd1521, __cudart_i2opi_f;
+ mov.u64 %rd2622, %rd2621;
+
+$L__BB0_863:
+ .pragma "nounroll";
+ shl.b64 %rd1520, %rd2621, 2;
+ add.s64 %rd1522, %rd1521, %rd1520;
+ ld.global.nc.u32 %r4926, [%rd1522];
+ mad.wide.u32 %rd1523, %r4926, %r1143, %rd2622;
+ shr.u64 %rd2622, %rd1523, 32;
+ add.s64 %rd1524, %rd1, %rd1520;
+ st.local.u32 [%rd1524], %rd1523;
+ add.s32 %r8199, %r8199, 1;
+ cvt.s64.s32 %rd2621, %r8199;
+ setp.ne.s32 %p746, %r8199, 6;
+ @%p746 bra $L__BB0_863;
+
+ st.local.u32 [%rd4], %rd2622;
+ mov.u32 %r4927, 4;
+ sub.s32 %r1147, %r4927, %r1144;
+ mov.u32 %r4928, 6;
+ sub.s32 %r4929, %r4928, %r1144;
+ mul.wide.s32 %rd1525, %r4929, 4;
+ add.s64 %rd1526, %rd1, %rd1525;
+ ld.local.u32 %r8200, [%rd1526];
+ ld.local.u32 %r8201, [%rd1526+-4];
+ and.b32 %r1150, %r1142, 31;
+ setp.eq.s32 %p747, %r1150, 0;
+ @%p747 bra $L__BB0_866;
+
+ mov.u32 %r4930, 32;
+ sub.s32 %r4931, %r4930, %r1150;
+ shr.u32 %r4932, %r8201, %r4931;
+ shl.b32 %r4933, %r8200, %r1150;
+ add.s32 %r8200, %r4932, %r4933;
+ mul.wide.s32 %rd1527, %r1147, 4;
+ add.s64 %rd1528, %rd1, %rd1527;
+ ld.local.u32 %r4934, [%rd1528];
+ shr.u32 %r4935, %r4934, %r4931;
+ shl.b32 %r4936, %r8201, %r1150;
+ add.s32 %r8201, %r4935, %r4936;
+
+$L__BB0_866:
+ and.b32 %r4937, %r1141, -2147483648;
+ shr.u32 %r4938, %r8201, 30;
+ shl.b32 %r4939, %r8200, 2;
+ or.b32 %r4940, %r4938, %r4939;
+ shr.u32 %r4941, %r4940, 31;
+ shr.u32 %r4942, %r8200, 30;
+ add.s32 %r4943, %r4941, %r4942;
+ neg.s32 %r4944, %r4943;
+ setp.eq.s32 %p748, %r4937, 0;
+ selp.b32 %r8202, %r4943, %r4944, %p748;
+ setp.ne.s32 %p749, %r4941, 0;
+ xor.b32 %r4945, %r4937, -2147483648;
+ selp.b32 %r4946, %r4945, %r4937, %p749;
+ selp.b32 %r4947, -1, 0, %p749;
+ xor.b32 %r4948, %r4940, %r4947;
+ shl.b32 %r4949, %r8201, 2;
+ xor.b32 %r4950, %r4949, %r4947;
+ cvt.u64.u32 %rd1529, %r4948;
+ cvt.u64.u32 %rd1530, %r4950;
+ bfi.b64 %rd1531, %rd1529, %rd1530, 32, 32;
+ cvt.rn.f64.s64 %fd117, %rd1531;
+ mul.f64 %fd118, %fd117, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3572, %fd118;
+ setp.eq.s32 %p750, %r4946, 0;
+ neg.f32 %f3573, %f3572;
+ selp.f32 %f5574, %f3572, %f3573, %p750;
+
+$L__BB0_868:
+ and.b32 %r1157, %r8202, 1;
+ setp.eq.s32 %p751, %r1157, 0;
+ selp.f32 %f1001, %f5574, 0f3F800000, %p751;
+ mul.rn.f32 %f1002, %f5574, %f5574;
+ mov.f32 %f5575, 0fB94D4153;
+ @%p751 bra $L__BB0_870;
+
+ mov.f32 %f3576, 0fBAB607ED;
+ mov.f32 %f3577, 0f37CBAC00;
+ fma.rn.f32 %f5575, %f3577, %f1002, %f3576;
+
+$L__BB0_870:
+ selp.f32 %f3578, 0f3C0885E4, 0f3D2AAABB, %p751;
+ fma.rn.f32 %f3579, %f5575, %f1002, %f3578;
+ selp.f32 %f3580, 0fBE2AAAA8, 0fBEFFFFFF, %p751;
+ fma.rn.f32 %f3581, %f3579, %f1002, %f3580;
+ mov.f32 %f3582, 0f00000000;
+ fma.rn.f32 %f3583, %f1002, %f1001, %f3582;
+ fma.rn.f32 %f5576, %f3581, %f3583, %f1001;
+ and.b32 %r4952, %r8202, 2;
+ setp.eq.s32 %p753, %r4952, 0;
+ @%p753 bra $L__BB0_872;
+
+ mov.f32 %f3585, 0fBF800000;
+ fma.rn.f32 %f5576, %f5576, %f3585, %f3582;
+
+$L__BB0_872:
+ mul.f32 %f3586, %f860, 0f3F22F983;
+ cvt.rni.s32.f32 %r8206, %f3586;
+ cvt.rn.f32.s32 %f3587, %r8206;
+ mov.f32 %f3588, 0fBFC90FDA;
+ fma.rn.f32 %f3589, %f3587, %f3588, %f860;
+ mov.f32 %f3590, 0fB3A22168;
+ fma.rn.f32 %f3591, %f3587, %f3590, %f3589;
+ mov.f32 %f3592, 0fA7C234C5;
+ fma.rn.f32 %f5577, %f3587, %f3592, %f3591;
+ abs.f32 %f1009, %f860;
+ setp.ltu.f32 %p754, %f1009, 0f47CE4780;
+ @%p754 bra $L__BB0_880;
+
+ setp.eq.f32 %p755, %f1009, 0f7F800000;
+ @%p755 bra $L__BB0_879;
+ bra.uni $L__BB0_874;
+
+$L__BB0_879:
+ mov.f32 %f3595, 0f00000000;
+ mul.rn.f32 %f5577, %f860, %f3595;
+ mov.u32 %r8206, 0;
+ bra.uni $L__BB0_880;
+
+$L__BB0_874:
+ mov.b32 %r1159, %f860;
+ shr.u32 %r4954, %r1159, 23;
+ and.b32 %r4955, %r4954, 255;
+ add.s32 %r1160, %r4955, -128;
+ shl.b32 %r4956, %r1159, 8;
+ or.b32 %r1161, %r4956, -2147483648;
+ shr.u32 %r1162, %r1160, 5;
+ mov.u64 %rd2623, 0;
+ mov.u32 %r8203, 0;
+ mov.u64 %rd1535, __cudart_i2opi_f;
+ mov.u64 %rd2624, %rd2623;
+
+$L__BB0_875:
+ .pragma "nounroll";
+ shl.b64 %rd1534, %rd2623, 2;
+ add.s64 %rd1536, %rd1535, %rd1534;
+ ld.global.nc.u32 %r4957, [%rd1536];
+ mad.wide.u32 %rd1537, %r4957, %r1161, %rd2624;
+ shr.u64 %rd2624, %rd1537, 32;
+ add.s64 %rd1538, %rd1, %rd1534;
+ st.local.u32 [%rd1538], %rd1537;
+ add.s32 %r8203, %r8203, 1;
+ cvt.s64.s32 %rd2623, %r8203;
+ setp.ne.s32 %p756, %r8203, 6;
+ @%p756 bra $L__BB0_875;
+
+ st.local.u32 [%rd4], %rd2624;
+ mov.u32 %r4958, 4;
+ sub.s32 %r1165, %r4958, %r1162;
+ mov.u32 %r4959, 6;
+ sub.s32 %r4960, %r4959, %r1162;
+ mul.wide.s32 %rd1539, %r4960, 4;
+ add.s64 %rd1540, %rd1, %rd1539;
+ ld.local.u32 %r8204, [%rd1540];
+ ld.local.u32 %r8205, [%rd1540+-4];
+ and.b32 %r1168, %r1160, 31;
+ setp.eq.s32 %p757, %r1168, 0;
+ @%p757 bra $L__BB0_878;
+
+ mov.u32 %r4961, 32;
+ sub.s32 %r4962, %r4961, %r1168;
+ shr.u32 %r4963, %r8205, %r4962;
+ shl.b32 %r4964, %r8204, %r1168;
+ add.s32 %r8204, %r4963, %r4964;
+ mul.wide.s32 %rd1541, %r1165, 4;
+ add.s64 %rd1542, %rd1, %rd1541;
+ ld.local.u32 %r4965, [%rd1542];
+ shr.u32 %r4966, %r4965, %r4962;
+ shl.b32 %r4967, %r8205, %r1168;
+ add.s32 %r8205, %r4966, %r4967;
+
+$L__BB0_878:
+ and.b32 %r4968, %r1159, -2147483648;
+ shr.u32 %r4969, %r8205, 30;
+ shl.b32 %r4970, %r8204, 2;
+ or.b32 %r4971, %r4969, %r4970;
+ shr.u32 %r4972, %r4971, 31;
+ shr.u32 %r4973, %r8204, 30;
+ add.s32 %r4974, %r4972, %r4973;
+ neg.s32 %r4975, %r4974;
+ setp.eq.s32 %p758, %r4968, 0;
+ selp.b32 %r8206, %r4974, %r4975, %p758;
+ setp.ne.s32 %p759, %r4972, 0;
+ xor.b32 %r4976, %r4968, -2147483648;
+ selp.b32 %r4977, %r4976, %r4968, %p759;
+ selp.b32 %r4978, -1, 0, %p759;
+ xor.b32 %r4979, %r4971, %r4978;
+ shl.b32 %r4980, %r8205, 2;
+ xor.b32 %r4981, %r4980, %r4978;
+ cvt.u64.u32 %rd1543, %r4979;
+ cvt.u64.u32 %rd1544, %r4981;
+ bfi.b64 %rd1545, %rd1543, %rd1544, 32, 32;
+ cvt.rn.f64.s64 %fd119, %rd1545;
+ mul.f64 %fd120, %fd119, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3593, %fd120;
+ setp.eq.s32 %p760, %r4977, 0;
+ neg.f32 %f3594, %f3593;
+ selp.f32 %f5577, %f3593, %f3594, %p760;
+
+$L__BB0_880:
+ add.s32 %r1175, %r8206, 1;
+ and.b32 %r1176, %r1175, 1;
+ setp.eq.s32 %p761, %r1176, 0;
+ selp.f32 %f1013, %f5577, 0f3F800000, %p761;
+ mul.rn.f32 %f1014, %f5577, %f5577;
+ mov.f32 %f5578, 0fB94D4153;
+ @%p761 bra $L__BB0_882;
+
+ mov.f32 %f3597, 0fBAB607ED;
+ mov.f32 %f3598, 0f37CBAC00;
+ fma.rn.f32 %f5578, %f3598, %f1014, %f3597;
+
+$L__BB0_882:
+ selp.f32 %f3599, 0f3C0885E4, 0f3D2AAABB, %p761;
+ fma.rn.f32 %f3600, %f5578, %f1014, %f3599;
+ selp.f32 %f3601, 0fBE2AAAA8, 0fBEFFFFFF, %p761;
+ fma.rn.f32 %f3602, %f3600, %f1014, %f3601;
+ mov.f32 %f3603, 0f00000000;
+ fma.rn.f32 %f3604, %f1014, %f1013, %f3603;
+ fma.rn.f32 %f5579, %f3602, %f3604, %f1013;
+ and.b32 %r4983, %r1175, 2;
+ setp.eq.s32 %p763, %r4983, 0;
+ @%p763 bra $L__BB0_884;
+
+ mov.f32 %f3606, 0fBF800000;
+ fma.rn.f32 %f5579, %f5579, %f3606, %f3603;
+
+$L__BB0_884:
+ add.f32 %f5594, %f5576, %f5579;
+ mul.f32 %f3607, %f869, 0f3F22F983;
+ cvt.rni.s32.f32 %r8210, %f3607;
+ cvt.rn.f32.s32 %f3608, %r8210;
+ mov.f32 %f3609, 0fBFC90FDA;
+ fma.rn.f32 %f3610, %f3608, %f3609, %f869;
+ mov.f32 %f3611, 0fB3A22168;
+ fma.rn.f32 %f3612, %f3608, %f3611, %f3610;
+ mov.f32 %f3613, 0fA7C234C5;
+ fma.rn.f32 %f5580, %f3608, %f3613, %f3612;
+ abs.f32 %f1022, %f869;
+ setp.ltu.f32 %p764, %f1022, 0f47CE4780;
+ @%p764 bra $L__BB0_892;
+
+ setp.eq.f32 %p765, %f1022, 0f7F800000;
+ @%p765 bra $L__BB0_891;
+ bra.uni $L__BB0_886;
+
+$L__BB0_891:
+ mov.f32 %f3616, 0f00000000;
+ mul.rn.f32 %f5580, %f869, %f3616;
+ mov.u32 %r8210, 0;
+ bra.uni $L__BB0_892;
+
+$L__BB0_886:
+ mov.b32 %r1178, %f869;
+ shr.u32 %r4985, %r1178, 23;
+ and.b32 %r4986, %r4985, 255;
+ add.s32 %r1179, %r4986, -128;
+ shl.b32 %r4987, %r1178, 8;
+ or.b32 %r1180, %r4987, -2147483648;
+ shr.u32 %r1181, %r1179, 5;
+ mov.u64 %rd2625, 0;
+ mov.u32 %r8207, 0;
+ mov.u64 %rd1549, __cudart_i2opi_f;
+ mov.u64 %rd2626, %rd2625;
+
+$L__BB0_887:
+ .pragma "nounroll";
+ shl.b64 %rd1548, %rd2625, 2;
+ add.s64 %rd1550, %rd1549, %rd1548;
+ ld.global.nc.u32 %r4988, [%rd1550];
+ mad.wide.u32 %rd1551, %r4988, %r1180, %rd2626;
+ shr.u64 %rd2626, %rd1551, 32;
+ add.s64 %rd1552, %rd1, %rd1548;
+ st.local.u32 [%rd1552], %rd1551;
+ add.s32 %r8207, %r8207, 1;
+ cvt.s64.s32 %rd2625, %r8207;
+ setp.ne.s32 %p766, %r8207, 6;
+ @%p766 bra $L__BB0_887;
+
+ st.local.u32 [%rd4], %rd2626;
+ mov.u32 %r4989, 4;
+ sub.s32 %r1184, %r4989, %r1181;
+ mov.u32 %r4990, 6;
+ sub.s32 %r4991, %r4990, %r1181;
+ mul.wide.s32 %rd1553, %r4991, 4;
+ add.s64 %rd1554, %rd1, %rd1553;
+ ld.local.u32 %r8208, [%rd1554];
+ ld.local.u32 %r8209, [%rd1554+-4];
+ and.b32 %r1187, %r1179, 31;
+ setp.eq.s32 %p767, %r1187, 0;
+ @%p767 bra $L__BB0_890;
+
+ mov.u32 %r4992, 32;
+ sub.s32 %r4993, %r4992, %r1187;
+ shr.u32 %r4994, %r8209, %r4993;
+ shl.b32 %r4995, %r8208, %r1187;
+ add.s32 %r8208, %r4994, %r4995;
+ mul.wide.s32 %rd1555, %r1184, 4;
+ add.s64 %rd1556, %rd1, %rd1555;
+ ld.local.u32 %r4996, [%rd1556];
+ shr.u32 %r4997, %r4996, %r4993;
+ shl.b32 %r4998, %r8209, %r1187;
+ add.s32 %r8209, %r4997, %r4998;
+
+$L__BB0_890:
+ and.b32 %r4999, %r1178, -2147483648;
+ shr.u32 %r5000, %r8209, 30;
+ shl.b32 %r5001, %r8208, 2;
+ or.b32 %r5002, %r5000, %r5001;
+ shr.u32 %r5003, %r5002, 31;
+ shr.u32 %r5004, %r8208, 30;
+ add.s32 %r5005, %r5003, %r5004;
+ neg.s32 %r5006, %r5005;
+ setp.eq.s32 %p768, %r4999, 0;
+ selp.b32 %r8210, %r5005, %r5006, %p768;
+ setp.ne.s32 %p769, %r5003, 0;
+ xor.b32 %r5007, %r4999, -2147483648;
+ selp.b32 %r5008, %r5007, %r4999, %p769;
+ selp.b32 %r5009, -1, 0, %p769;
+ xor.b32 %r5010, %r5002, %r5009;
+ shl.b32 %r5011, %r8209, 2;
+ xor.b32 %r5012, %r5011, %r5009;
+ cvt.u64.u32 %rd1557, %r5010;
+ cvt.u64.u32 %rd1558, %r5012;
+ bfi.b64 %rd1559, %rd1557, %rd1558, 32, 32;
+ cvt.rn.f64.s64 %fd121, %rd1559;
+ mul.f64 %fd122, %fd121, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3614, %fd122;
+ setp.eq.s32 %p770, %r5008, 0;
+ neg.f32 %f3615, %f3614;
+ selp.f32 %f5580, %f3614, %f3615, %p770;
+
+$L__BB0_892:
+ and.b32 %r1194, %r8210, 1;
+ setp.eq.s32 %p771, %r1194, 0;
+ selp.f32 %f1026, %f5580, 0f3F800000, %p771;
+ mul.rn.f32 %f1027, %f5580, %f5580;
+ mov.f32 %f5581, 0fB94D4153;
+ @%p771 bra $L__BB0_894;
+
+ mov.f32 %f3618, 0fBAB607ED;
+ mov.f32 %f3619, 0f37CBAC00;
+ fma.rn.f32 %f5581, %f3619, %f1027, %f3618;
+
+$L__BB0_894:
+ selp.f32 %f3620, 0f3C0885E4, 0f3D2AAABB, %p771;
+ fma.rn.f32 %f3621, %f5581, %f1027, %f3620;
+ selp.f32 %f3622, 0fBE2AAAA8, 0fBEFFFFFF, %p771;
+ fma.rn.f32 %f3623, %f3621, %f1027, %f3622;
+ mov.f32 %f3624, 0f00000000;
+ fma.rn.f32 %f3625, %f1027, %f1026, %f3624;
+ fma.rn.f32 %f5582, %f3623, %f3625, %f1026;
+ and.b32 %r5014, %r8210, 2;
+ setp.eq.s32 %p773, %r5014, 0;
+ @%p773 bra $L__BB0_896;
+
+ mov.f32 %f3627, 0fBF800000;
+ fma.rn.f32 %f5582, %f5582, %f3627, %f3624;
+
+$L__BB0_896:
+ mul.f32 %f3628, %f861, 0f3F22F983;
+ cvt.rni.s32.f32 %r8214, %f3628;
+ cvt.rn.f32.s32 %f3629, %r8214;
+ mov.f32 %f3630, 0fBFC90FDA;
+ fma.rn.f32 %f3631, %f3629, %f3630, %f861;
+ mov.f32 %f3632, 0fB3A22168;
+ fma.rn.f32 %f3633, %f3629, %f3632, %f3631;
+ mov.f32 %f3634, 0fA7C234C5;
+ fma.rn.f32 %f5583, %f3629, %f3634, %f3633;
+ abs.f32 %f1034, %f861;
+ setp.ltu.f32 %p774, %f1034, 0f47CE4780;
+ @%p774 bra $L__BB0_904;
+
+ setp.eq.f32 %p775, %f1034, 0f7F800000;
+ @%p775 bra $L__BB0_903;
+ bra.uni $L__BB0_898;
+
+$L__BB0_903:
+ mov.f32 %f3637, 0f00000000;
+ mul.rn.f32 %f5583, %f861, %f3637;
+ mov.u32 %r8214, 0;
+ bra.uni $L__BB0_904;
+
+$L__BB0_898:
+ mov.b32 %r1196, %f861;
+ shr.u32 %r5016, %r1196, 23;
+ and.b32 %r5017, %r5016, 255;
+ add.s32 %r1197, %r5017, -128;
+ shl.b32 %r5018, %r1196, 8;
+ or.b32 %r1198, %r5018, -2147483648;
+ shr.u32 %r1199, %r1197, 5;
+ mov.u64 %rd2627, 0;
+ mov.u32 %r8211, 0;
+ mov.u64 %rd1563, __cudart_i2opi_f;
+ mov.u64 %rd2628, %rd2627;
+
+$L__BB0_899:
+ .pragma "nounroll";
+ shl.b64 %rd1562, %rd2627, 2;
+ add.s64 %rd1564, %rd1563, %rd1562;
+ ld.global.nc.u32 %r5019, [%rd1564];
+ mad.wide.u32 %rd1565, %r5019, %r1198, %rd2628;
+ shr.u64 %rd2628, %rd1565, 32;
+ add.s64 %rd1566, %rd1, %rd1562;
+ st.local.u32 [%rd1566], %rd1565;
+ add.s32 %r8211, %r8211, 1;
+ cvt.s64.s32 %rd2627, %r8211;
+ setp.ne.s32 %p776, %r8211, 6;
+ @%p776 bra $L__BB0_899;
+
+ st.local.u32 [%rd4], %rd2628;
+ mov.u32 %r5020, 4;
+ sub.s32 %r1202, %r5020, %r1199;
+ mov.u32 %r5021, 6;
+ sub.s32 %r5022, %r5021, %r1199;
+ mul.wide.s32 %rd1567, %r5022, 4;
+ add.s64 %rd1568, %rd1, %rd1567;
+ ld.local.u32 %r8212, [%rd1568];
+ ld.local.u32 %r8213, [%rd1568+-4];
+ and.b32 %r1205, %r1197, 31;
+ setp.eq.s32 %p777, %r1205, 0;
+ @%p777 bra $L__BB0_902;
+
+ mov.u32 %r5023, 32;
+ sub.s32 %r5024, %r5023, %r1205;
+ shr.u32 %r5025, %r8213, %r5024;
+ shl.b32 %r5026, %r8212, %r1205;
+ add.s32 %r8212, %r5025, %r5026;
+ mul.wide.s32 %rd1569, %r1202, 4;
+ add.s64 %rd1570, %rd1, %rd1569;
+ ld.local.u32 %r5027, [%rd1570];
+ shr.u32 %r5028, %r5027, %r5024;
+ shl.b32 %r5029, %r8213, %r1205;
+ add.s32 %r8213, %r5028, %r5029;
+
+$L__BB0_902:
+ and.b32 %r5030, %r1196, -2147483648;
+ shr.u32 %r5031, %r8213, 30;
+ shl.b32 %r5032, %r8212, 2;
+ or.b32 %r5033, %r5031, %r5032;
+ shr.u32 %r5034, %r5033, 31;
+ shr.u32 %r5035, %r8212, 30;
+ add.s32 %r5036, %r5034, %r5035;
+ neg.s32 %r5037, %r5036;
+ setp.eq.s32 %p778, %r5030, 0;
+ selp.b32 %r8214, %r5036, %r5037, %p778;
+ setp.ne.s32 %p779, %r5034, 0;
+ xor.b32 %r5038, %r5030, -2147483648;
+ selp.b32 %r5039, %r5038, %r5030, %p779;
+ selp.b32 %r5040, -1, 0, %p779;
+ xor.b32 %r5041, %r5033, %r5040;
+ shl.b32 %r5042, %r8213, 2;
+ xor.b32 %r5043, %r5042, %r5040;
+ cvt.u64.u32 %rd1571, %r5041;
+ cvt.u64.u32 %rd1572, %r5043;
+ bfi.b64 %rd1573, %rd1571, %rd1572, 32, 32;
+ cvt.rn.f64.s64 %fd123, %rd1573;
+ mul.f64 %fd124, %fd123, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3635, %fd124;
+ setp.eq.s32 %p780, %r5039, 0;
+ neg.f32 %f3636, %f3635;
+ selp.f32 %f5583, %f3635, %f3636, %p780;
+
+$L__BB0_904:
+ add.s32 %r1212, %r8214, 1;
+ and.b32 %r1213, %r1212, 1;
+ setp.eq.s32 %p781, %r1213, 0;
+ selp.f32 %f1038, %f5583, 0f3F800000, %p781;
+ mul.rn.f32 %f1039, %f5583, %f5583;
+ mov.f32 %f5584, 0fB94D4153;
+ @%p781 bra $L__BB0_906;
+
+ mov.f32 %f3639, 0fBAB607ED;
+ mov.f32 %f3640, 0f37CBAC00;
+ fma.rn.f32 %f5584, %f3640, %f1039, %f3639;
+
+$L__BB0_906:
+ selp.f32 %f3641, 0f3C0885E4, 0f3D2AAABB, %p781;
+ fma.rn.f32 %f3642, %f5584, %f1039, %f3641;
+ selp.f32 %f3643, 0fBE2AAAA8, 0fBEFFFFFF, %p781;
+ fma.rn.f32 %f3644, %f3642, %f1039, %f3643;
+ mov.f32 %f3645, 0f00000000;
+ fma.rn.f32 %f3646, %f1039, %f1038, %f3645;
+ fma.rn.f32 %f5585, %f3644, %f3646, %f1038;
+ and.b32 %r5045, %r1212, 2;
+ setp.eq.s32 %p783, %r5045, 0;
+ @%p783 bra $L__BB0_908;
+
+ mov.f32 %f3648, 0fBF800000;
+ fma.rn.f32 %f5585, %f5585, %f3648, %f3645;
+
+$L__BB0_908:
+ add.f32 %f5593, %f5582, %f5585;
+ mul.f32 %f3649, %f870, 0f3F22F983;
+ cvt.rni.s32.f32 %r8218, %f3649;
+ cvt.rn.f32.s32 %f3650, %r8218;
+ mov.f32 %f3651, 0fBFC90FDA;
+ fma.rn.f32 %f3652, %f3650, %f3651, %f870;
+ mov.f32 %f3653, 0fB3A22168;
+ fma.rn.f32 %f3654, %f3650, %f3653, %f3652;
+ mov.f32 %f3655, 0fA7C234C5;
+ fma.rn.f32 %f5586, %f3650, %f3655, %f3654;
+ abs.f32 %f1047, %f870;
+ setp.ltu.f32 %p784, %f1047, 0f47CE4780;
+ @%p784 bra $L__BB0_916;
+
+ setp.eq.f32 %p785, %f1047, 0f7F800000;
+ @%p785 bra $L__BB0_915;
+ bra.uni $L__BB0_910;
+
+$L__BB0_915:
+ mov.f32 %f3658, 0f00000000;
+ mul.rn.f32 %f5586, %f870, %f3658;
+ mov.u32 %r8218, 0;
+ bra.uni $L__BB0_916;
+
+$L__BB0_910:
+ mov.b32 %r1215, %f870;
+ shr.u32 %r5047, %r1215, 23;
+ and.b32 %r5048, %r5047, 255;
+ add.s32 %r1216, %r5048, -128;
+ shl.b32 %r5049, %r1215, 8;
+ or.b32 %r1217, %r5049, -2147483648;
+ shr.u32 %r1218, %r1216, 5;
+ mov.u64 %rd2629, 0;
+ mov.u32 %r8215, 0;
+ mov.u64 %rd1577, __cudart_i2opi_f;
+ mov.u64 %rd2630, %rd2629;
+
+$L__BB0_911:
+ .pragma "nounroll";
+ shl.b64 %rd1576, %rd2629, 2;
+ add.s64 %rd1578, %rd1577, %rd1576;
+ ld.global.nc.u32 %r5050, [%rd1578];
+ mad.wide.u32 %rd1579, %r5050, %r1217, %rd2630;
+ shr.u64 %rd2630, %rd1579, 32;
+ add.s64 %rd1580, %rd1, %rd1576;
+ st.local.u32 [%rd1580], %rd1579;
+ add.s32 %r8215, %r8215, 1;
+ cvt.s64.s32 %rd2629, %r8215;
+ setp.ne.s32 %p786, %r8215, 6;
+ @%p786 bra $L__BB0_911;
+
+ st.local.u32 [%rd4], %rd2630;
+ mov.u32 %r5051, 4;
+ sub.s32 %r1221, %r5051, %r1218;
+ mov.u32 %r5052, 6;
+ sub.s32 %r5053, %r5052, %r1218;
+ mul.wide.s32 %rd1581, %r5053, 4;
+ add.s64 %rd1582, %rd1, %rd1581;
+ ld.local.u32 %r8216, [%rd1582];
+ ld.local.u32 %r8217, [%rd1582+-4];
+ and.b32 %r1224, %r1216, 31;
+ setp.eq.s32 %p787, %r1224, 0;
+ @%p787 bra $L__BB0_914;
+
+ mov.u32 %r5054, 32;
+ sub.s32 %r5055, %r5054, %r1224;
+ shr.u32 %r5056, %r8217, %r5055;
+ shl.b32 %r5057, %r8216, %r1224;
+ add.s32 %r8216, %r5056, %r5057;
+ mul.wide.s32 %rd1583, %r1221, 4;
+ add.s64 %rd1584, %rd1, %rd1583;
+ ld.local.u32 %r5058, [%rd1584];
+ shr.u32 %r5059, %r5058, %r5055;
+ shl.b32 %r5060, %r8217, %r1224;
+ add.s32 %r8217, %r5059, %r5060;
+
+$L__BB0_914:
+ and.b32 %r5061, %r1215, -2147483648;
+ shr.u32 %r5062, %r8217, 30;
+ shl.b32 %r5063, %r8216, 2;
+ or.b32 %r5064, %r5062, %r5063;
+ shr.u32 %r5065, %r5064, 31;
+ shr.u32 %r5066, %r8216, 30;
+ add.s32 %r5067, %r5065, %r5066;
+ neg.s32 %r5068, %r5067;
+ setp.eq.s32 %p788, %r5061, 0;
+ selp.b32 %r8218, %r5067, %r5068, %p788;
+ setp.ne.s32 %p789, %r5065, 0;
+ xor.b32 %r5069, %r5061, -2147483648;
+ selp.b32 %r5070, %r5069, %r5061, %p789;
+ selp.b32 %r5071, -1, 0, %p789;
+ xor.b32 %r5072, %r5064, %r5071;
+ shl.b32 %r5073, %r8217, 2;
+ xor.b32 %r5074, %r5073, %r5071;
+ cvt.u64.u32 %rd1585, %r5072;
+ cvt.u64.u32 %rd1586, %r5074;
+ bfi.b64 %rd1587, %rd1585, %rd1586, 32, 32;
+ cvt.rn.f64.s64 %fd125, %rd1587;
+ mul.f64 %fd126, %fd125, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3656, %fd126;
+ setp.eq.s32 %p790, %r5070, 0;
+ neg.f32 %f3657, %f3656;
+ selp.f32 %f5586, %f3656, %f3657, %p790;
+
+$L__BB0_916:
+ and.b32 %r1231, %r8218, 1;
+ setp.eq.s32 %p791, %r1231, 0;
+ selp.f32 %f1051, %f5586, 0f3F800000, %p791;
+ mul.rn.f32 %f1052, %f5586, %f5586;
+ mov.f32 %f5587, 0fB94D4153;
+ @%p791 bra $L__BB0_918;
+
+ mov.f32 %f3660, 0fBAB607ED;
+ mov.f32 %f3661, 0f37CBAC00;
+ fma.rn.f32 %f5587, %f3661, %f1052, %f3660;
+
+$L__BB0_918:
+ selp.f32 %f3662, 0f3C0885E4, 0f3D2AAABB, %p791;
+ fma.rn.f32 %f3663, %f5587, %f1052, %f3662;
+ selp.f32 %f3664, 0fBE2AAAA8, 0fBEFFFFFF, %p791;
+ fma.rn.f32 %f3665, %f3663, %f1052, %f3664;
+ mov.f32 %f3666, 0f00000000;
+ fma.rn.f32 %f3667, %f1052, %f1051, %f3666;
+ fma.rn.f32 %f5588, %f3665, %f3667, %f1051;
+ and.b32 %r5076, %r8218, 2;
+ setp.eq.s32 %p793, %r5076, 0;
+ @%p793 bra $L__BB0_920;
+
+ mov.f32 %f3669, 0fBF800000;
+ fma.rn.f32 %f5588, %f5588, %f3669, %f3666;
+
+$L__BB0_920:
+ mul.f32 %f3670, %f862, 0f3F22F983;
+ cvt.rni.s32.f32 %r8222, %f3670;
+ cvt.rn.f32.s32 %f3671, %r8222;
+ mov.f32 %f3672, 0fBFC90FDA;
+ fma.rn.f32 %f3673, %f3671, %f3672, %f862;
+ mov.f32 %f3674, 0fB3A22168;
+ fma.rn.f32 %f3675, %f3671, %f3674, %f3673;
+ mov.f32 %f3676, 0fA7C234C5;
+ fma.rn.f32 %f5589, %f3671, %f3676, %f3675;
+ abs.f32 %f1059, %f862;
+ setp.ltu.f32 %p794, %f1059, 0f47CE4780;
+ @%p794 bra $L__BB0_928;
+
+ setp.eq.f32 %p795, %f1059, 0f7F800000;
+ @%p795 bra $L__BB0_927;
+ bra.uni $L__BB0_922;
+
+$L__BB0_927:
+ mov.f32 %f3679, 0f00000000;
+ mul.rn.f32 %f5589, %f862, %f3679;
+ mov.u32 %r8222, 0;
+ bra.uni $L__BB0_928;
+
+$L__BB0_922:
+ mov.b32 %r1233, %f862;
+ shr.u32 %r5078, %r1233, 23;
+ and.b32 %r5079, %r5078, 255;
+ add.s32 %r1234, %r5079, -128;
+ shl.b32 %r5080, %r1233, 8;
+ or.b32 %r1235, %r5080, -2147483648;
+ shr.u32 %r1236, %r1234, 5;
+ mov.u64 %rd2631, 0;
+ mov.u32 %r8219, 0;
+ mov.u64 %rd1591, __cudart_i2opi_f;
+ mov.u64 %rd2632, %rd2631;
+
+$L__BB0_923:
+ .pragma "nounroll";
+ shl.b64 %rd1590, %rd2631, 2;
+ add.s64 %rd1592, %rd1591, %rd1590;
+ ld.global.nc.u32 %r5081, [%rd1592];
+ mad.wide.u32 %rd1593, %r5081, %r1235, %rd2632;
+ shr.u64 %rd2632, %rd1593, 32;
+ add.s64 %rd1594, %rd1, %rd1590;
+ st.local.u32 [%rd1594], %rd1593;
+ add.s32 %r8219, %r8219, 1;
+ cvt.s64.s32 %rd2631, %r8219;
+ setp.ne.s32 %p796, %r8219, 6;
+ @%p796 bra $L__BB0_923;
+
+ st.local.u32 [%rd4], %rd2632;
+ mov.u32 %r5082, 4;
+ sub.s32 %r1239, %r5082, %r1236;
+ mov.u32 %r5083, 6;
+ sub.s32 %r5084, %r5083, %r1236;
+ mul.wide.s32 %rd1595, %r5084, 4;
+ add.s64 %rd1596, %rd1, %rd1595;
+ ld.local.u32 %r8220, [%rd1596];
+ ld.local.u32 %r8221, [%rd1596+-4];
+ and.b32 %r1242, %r1234, 31;
+ setp.eq.s32 %p797, %r1242, 0;
+ @%p797 bra $L__BB0_926;
+
+ mov.u32 %r5085, 32;
+ sub.s32 %r5086, %r5085, %r1242;
+ shr.u32 %r5087, %r8221, %r5086;
+ shl.b32 %r5088, %r8220, %r1242;
+ add.s32 %r8220, %r5087, %r5088;
+ mul.wide.s32 %rd1597, %r1239, 4;
+ add.s64 %rd1598, %rd1, %rd1597;
+ ld.local.u32 %r5089, [%rd1598];
+ shr.u32 %r5090, %r5089, %r5086;
+ shl.b32 %r5091, %r8221, %r1242;
+ add.s32 %r8221, %r5090, %r5091;
+
+$L__BB0_926:
+ and.b32 %r5092, %r1233, -2147483648;
+ shr.u32 %r5093, %r8221, 30;
+ shl.b32 %r5094, %r8220, 2;
+ or.b32 %r5095, %r5093, %r5094;
+ shr.u32 %r5096, %r5095, 31;
+ shr.u32 %r5097, %r8220, 30;
+ add.s32 %r5098, %r5096, %r5097;
+ neg.s32 %r5099, %r5098;
+ setp.eq.s32 %p798, %r5092, 0;
+ selp.b32 %r8222, %r5098, %r5099, %p798;
+ setp.ne.s32 %p799, %r5096, 0;
+ xor.b32 %r5100, %r5092, -2147483648;
+ selp.b32 %r5101, %r5100, %r5092, %p799;
+ selp.b32 %r5102, -1, 0, %p799;
+ xor.b32 %r5103, %r5095, %r5102;
+ shl.b32 %r5104, %r8221, 2;
+ xor.b32 %r5105, %r5104, %r5102;
+ cvt.u64.u32 %rd1599, %r5103;
+ cvt.u64.u32 %rd1600, %r5105;
+ bfi.b64 %rd1601, %rd1599, %rd1600, 32, 32;
+ cvt.rn.f64.s64 %fd127, %rd1601;
+ mul.f64 %fd128, %fd127, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3677, %fd128;
+ setp.eq.s32 %p800, %r5101, 0;
+ neg.f32 %f3678, %f3677;
+ selp.f32 %f5589, %f3677, %f3678, %p800;
+
+$L__BB0_928:
+ add.s32 %r1249, %r8222, 1;
+ and.b32 %r1250, %r1249, 1;
+ setp.eq.s32 %p801, %r1250, 0;
+ selp.f32 %f1063, %f5589, 0f3F800000, %p801;
+ mul.rn.f32 %f1064, %f5589, %f5589;
+ mov.f32 %f5590, 0fB94D4153;
+ @%p801 bra $L__BB0_930;
+
+ mov.f32 %f3681, 0fBAB607ED;
+ mov.f32 %f3682, 0f37CBAC00;
+ fma.rn.f32 %f5590, %f3682, %f1064, %f3681;
+
+$L__BB0_930:
+ selp.f32 %f3683, 0f3C0885E4, 0f3D2AAABB, %p801;
+ fma.rn.f32 %f3684, %f5590, %f1064, %f3683;
+ selp.f32 %f3685, 0fBE2AAAA8, 0fBEFFFFFF, %p801;
+ fma.rn.f32 %f3686, %f3684, %f1064, %f3685;
+ mov.f32 %f3687, 0f00000000;
+ fma.rn.f32 %f3688, %f1064, %f1063, %f3687;
+ fma.rn.f32 %f5591, %f3686, %f3688, %f1063;
+ and.b32 %r5107, %r1249, 2;
+ setp.eq.s32 %p803, %r5107, 0;
+ @%p803 bra $L__BB0_932;
+
+ mov.f32 %f3690, 0fBF800000;
+ fma.rn.f32 %f5591, %f5591, %f3690, %f3687;
+
+$L__BB0_932:
+ add.f32 %f5592, %f5588, %f5591;
+ bra.uni $L__BB0_933;
+
+$L__BB0_512:
+ mov.b32 %r653, %f5416;
+ shr.u32 %r4036, %r653, 23;
+ and.b32 %r4037, %r4036, 255;
+ add.s32 %r654, %r4037, -128;
+ shl.b32 %r4038, %r653, 8;
+ or.b32 %r655, %r4038, -2147483648;
+ shr.u32 %r656, %r654, 5;
+ mov.u64 %rd2569, 0;
+ mov.u32 %r8095, 0;
+ mov.u64 %rd1128, __cudart_i2opi_f;
+ mov.u64 %rd2570, %rd2569;
+
+$L__BB0_513:
+ .pragma "nounroll";
+ shl.b64 %rd1127, %rd2569, 2;
+ add.s64 %rd1129, %rd1128, %rd1127;
+ ld.global.nc.u32 %r4039, [%rd1129];
+ mad.wide.u32 %rd1130, %r4039, %r655, %rd2570;
+ shr.u64 %rd2570, %rd1130, 32;
+ add.s64 %rd1131, %rd1, %rd1127;
+ st.local.u32 [%rd1131], %rd1130;
+ add.s32 %r8095, %r8095, 1;
+ cvt.s64.s32 %rd2569, %r8095;
+ setp.ne.s32 %p448, %r8095, 6;
+ @%p448 bra $L__BB0_513;
+
+ st.local.u32 [%rd4], %rd2570;
+ mov.u32 %r4040, 4;
+ sub.s32 %r659, %r4040, %r656;
+ mov.u32 %r4041, 6;
+ sub.s32 %r4042, %r4041, %r656;
+ mul.wide.s32 %rd1132, %r4042, 4;
+ add.s64 %rd1133, %rd1, %rd1132;
+ ld.local.u32 %r8096, [%rd1133];
+ ld.local.u32 %r8097, [%rd1133+-4];
+ and.b32 %r662, %r654, 31;
+ setp.eq.s32 %p449, %r662, 0;
+ @%p449 bra $L__BB0_516;
+
+ mov.u32 %r4043, 32;
+ sub.s32 %r4044, %r4043, %r662;
+ shr.u32 %r4045, %r8097, %r4044;
+ shl.b32 %r4046, %r8096, %r662;
+ add.s32 %r8096, %r4045, %r4046;
+ mul.wide.s32 %rd1134, %r659, 4;
+ add.s64 %rd1135, %rd1, %rd1134;
+ ld.local.u32 %r4047, [%rd1135];
+ shr.u32 %r4048, %r4047, %r4044;
+ shl.b32 %r4049, %r8097, %r662;
+ add.s32 %r8097, %r4048, %r4049;
+
+$L__BB0_516:
+ and.b32 %r4050, %r653, -2147483648;
+ shr.u32 %r4051, %r8097, 30;
+ shl.b32 %r4052, %r8096, 2;
+ or.b32 %r4053, %r4051, %r4052;
+ shr.u32 %r4054, %r4053, 31;
+ shr.u32 %r4055, %r8096, 30;
+ add.s32 %r4056, %r4054, %r4055;
+ neg.s32 %r4057, %r4056;
+ setp.eq.s32 %p450, %r4050, 0;
+ selp.b32 %r8098, %r4056, %r4057, %p450;
+ setp.ne.s32 %p451, %r4054, 0;
+ xor.b32 %r4058, %r4050, -2147483648;
+ selp.b32 %r4059, %r4058, %r4050, %p451;
+ selp.b32 %r4060, -1, 0, %p451;
+ xor.b32 %r4061, %r4053, %r4060;
+ shl.b32 %r4062, %r8097, 2;
+ xor.b32 %r4063, %r4062, %r4060;
+ cvt.u64.u32 %rd1136, %r4061;
+ cvt.u64.u32 %rd1137, %r4063;
+ bfi.b64 %rd1138, %rd1136, %rd1137, 32, 32;
+ cvt.rn.f64.s64 %fd65, %rd1138;
+ mul.f64 %fd66, %fd65, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3011, %fd66;
+ setp.eq.s32 %p452, %r4059, 0;
+ neg.f32 %f3012, %f3011;
+ selp.f32 %f5445, %f3011, %f3012, %p452;
+
+$L__BB0_518:
+ and.b32 %r669, %r8098, 1;
+ setp.eq.s32 %p453, %r669, 0;
+ selp.f32 %f593, %f5445, 0f3F800000, %p453;
+ mul.rn.f32 %f594, %f5445, %f5445;
+ mov.f32 %f5446, 0fB94D4153;
+ @%p453 bra $L__BB0_520;
+
+ mov.f32 %f3015, 0fBAB607ED;
+ mov.f32 %f3016, 0f37CBAC00;
+ fma.rn.f32 %f5446, %f3016, %f594, %f3015;
+
+$L__BB0_520:
+ selp.f32 %f3017, 0f3C0885E4, 0f3D2AAABB, %p453;
+ fma.rn.f32 %f3018, %f5446, %f594, %f3017;
+ selp.f32 %f3019, 0fBE2AAAA8, 0fBEFFFFFF, %p453;
+ fma.rn.f32 %f3020, %f3018, %f594, %f3019;
+ mov.f32 %f3021, 0f00000000;
+ fma.rn.f32 %f3022, %f594, %f593, %f3021;
+ fma.rn.f32 %f5281, %f3020, %f3022, %f593;
+ and.b32 %r4065, %r8098, 2;
+ setp.eq.s32 %p455, %r4065, 0;
+ @%p455 bra $L__BB0_522;
+
+ mov.f32 %f3024, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3024, %f3021;
+
+$L__BB0_522:
+ setp.lt.s32 %p8, %r11, %r651;
+ @%p445 bra $L__BB0_535;
+
+ mul.f32 %f3025, %f5607, 0f3F22F983;
+ cvt.rni.s32.f32 %r8102, %f3025;
+ cvt.rn.f32.s32 %f3026, %r8102;
+ mov.f32 %f3027, 0fBFC90FDA;
+ fma.rn.f32 %f3028, %f3026, %f3027, %f5607;
+ mov.f32 %f3029, 0fB3A22168;
+ fma.rn.f32 %f3030, %f3026, %f3029, %f3028;
+ mov.f32 %f3031, 0fA7C234C5;
+ fma.rn.f32 %f5449, %f3026, %f3031, %f3030;
+ abs.f32 %f602, %f5607;
+ setp.ltu.f32 %p457, %f602, 0f47CE4780;
+ @%p457 bra $L__BB0_531;
+
+ setp.eq.f32 %p458, %f602, 0f7F800000;
+ @%p458 bra $L__BB0_530;
+ bra.uni $L__BB0_525;
+
+$L__BB0_530:
+ mov.f32 %f3034, 0f00000000;
+ mul.rn.f32 %f5449, %f5607, %f3034;
+ mov.u32 %r8102, 0;
+ bra.uni $L__BB0_531;
+
+$L__BB0_525:
+ mov.b32 %r671, %f5607;
+ shr.u32 %r4067, %r671, 23;
+ and.b32 %r4068, %r4067, 255;
+ add.s32 %r672, %r4068, -128;
+ shl.b32 %r4069, %r671, 8;
+ or.b32 %r673, %r4069, -2147483648;
+ shr.u32 %r674, %r672, 5;
+ mov.u64 %rd2571, 0;
+ mov.u32 %r8099, 0;
+ mov.u64 %rd1142, __cudart_i2opi_f;
+ mov.u64 %rd2572, %rd2571;
+
+$L__BB0_526:
+ .pragma "nounroll";
+ shl.b64 %rd1141, %rd2571, 2;
+ add.s64 %rd1143, %rd1142, %rd1141;
+ ld.global.nc.u32 %r4070, [%rd1143];
+ mad.wide.u32 %rd1144, %r4070, %r673, %rd2572;
+ shr.u64 %rd2572, %rd1144, 32;
+ add.s64 %rd1145, %rd1, %rd1141;
+ st.local.u32 [%rd1145], %rd1144;
+ add.s32 %r8099, %r8099, 1;
+ cvt.s64.s32 %rd2571, %r8099;
+ setp.ne.s32 %p459, %r8099, 6;
+ @%p459 bra $L__BB0_526;
+
+ st.local.u32 [%rd4], %rd2572;
+ mov.u32 %r4071, 4;
+ sub.s32 %r677, %r4071, %r674;
+ mov.u32 %r4072, 6;
+ sub.s32 %r4073, %r4072, %r674;
+ mul.wide.s32 %rd1146, %r4073, 4;
+ add.s64 %rd1147, %rd1, %rd1146;
+ ld.local.u32 %r8100, [%rd1147];
+ ld.local.u32 %r8101, [%rd1147+-4];
+ and.b32 %r680, %r672, 31;
+ setp.eq.s32 %p460, %r680, 0;
+ @%p460 bra $L__BB0_529;
+
+ mov.u32 %r4074, 32;
+ sub.s32 %r4075, %r4074, %r680;
+ shr.u32 %r4076, %r8101, %r4075;
+ shl.b32 %r4077, %r8100, %r680;
+ add.s32 %r8100, %r4076, %r4077;
+ mul.wide.s32 %rd1148, %r677, 4;
+ add.s64 %rd1149, %rd1, %rd1148;
+ ld.local.u32 %r4078, [%rd1149];
+ shr.u32 %r4079, %r4078, %r4075;
+ shl.b32 %r4080, %r8101, %r680;
+ add.s32 %r8101, %r4079, %r4080;
+
+$L__BB0_529:
+ and.b32 %r4081, %r671, -2147483648;
+ shr.u32 %r4082, %r8101, 30;
+ shl.b32 %r4083, %r8100, 2;
+ or.b32 %r4084, %r4082, %r4083;
+ shr.u32 %r4085, %r4084, 31;
+ shr.u32 %r4086, %r8100, 30;
+ add.s32 %r4087, %r4085, %r4086;
+ neg.s32 %r4088, %r4087;
+ setp.eq.s32 %p461, %r4081, 0;
+ selp.b32 %r8102, %r4087, %r4088, %p461;
+ setp.ne.s32 %p462, %r4085, 0;
+ xor.b32 %r4089, %r4081, -2147483648;
+ selp.b32 %r4090, %r4089, %r4081, %p462;
+ selp.b32 %r4091, -1, 0, %p462;
+ xor.b32 %r4092, %r4084, %r4091;
+ shl.b32 %r4093, %r8101, 2;
+ xor.b32 %r4094, %r4093, %r4091;
+ cvt.u64.u32 %rd1150, %r4092;
+ cvt.u64.u32 %rd1151, %r4094;
+ bfi.b64 %rd1152, %rd1150, %rd1151, 32, 32;
+ cvt.rn.f64.s64 %fd67, %rd1152;
+ mul.f64 %fd68, %fd67, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3032, %fd68;
+ setp.eq.s32 %p463, %r4090, 0;
+ neg.f32 %f3033, %f3032;
+ selp.f32 %f5449, %f3032, %f3033, %p463;
+
+$L__BB0_531:
+ add.s32 %r687, %r8102, 1;
+ and.b32 %r688, %r687, 1;
+ setp.eq.s32 %p464, %r688, 0;
+ selp.f32 %f606, %f5449, 0f3F800000, %p464;
+ mul.rn.f32 %f607, %f5449, %f5449;
+ mov.f32 %f5450, 0fB94D4153;
+ @%p464 bra $L__BB0_533;
+
+ mov.f32 %f3036, 0fBAB607ED;
+ mov.f32 %f3037, 0f37CBAC00;
+ fma.rn.f32 %f5450, %f3037, %f607, %f3036;
+
+$L__BB0_533:
+ selp.f32 %f3038, 0f3C0885E4, 0f3D2AAABB, %p464;
+ fma.rn.f32 %f3039, %f5450, %f607, %f3038;
+ selp.f32 %f3040, 0fBE2AAAA8, 0fBEFFFFFF, %p464;
+ fma.rn.f32 %f3041, %f3039, %f607, %f3040;
+ mov.f32 %f3042, 0f00000000;
+ fma.rn.f32 %f3043, %f607, %f606, %f3042;
+ fma.rn.f32 %f5283, %f3041, %f3043, %f606;
+ and.b32 %r4096, %r687, 2;
+ setp.eq.s32 %p466, %r4096, 0;
+ @%p466 bra $L__BB0_535;
+
+ mov.f32 %f3045, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3045, %f3042;
+
+$L__BB0_535:
+ selp.f32 %f614, %f5283, %f5284, %p8;
+ selp.f32 %f615, %f5281, %f5282, %p8;
+ @%p445 bra $L__BB0_537;
+
+ add.f32 %f5599, %f615, %f614;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_537:
+ @%p420 bra $L__BB0_566;
+
+ shl.b32 %r4097, %r12, 5;
+ mov.u32 %r4098, -32;
+ sub.s32 %r689, %r4098, %r4097;
+ setp.ge.s32 %p470, %r11, %r689;
+ @%p470 bra $L__BB0_551;
+
+ mul.f32 %f3048, %f5415, 0f3F22F983;
+ cvt.rni.s32.f32 %r8106, %f3048;
+ cvt.rn.f32.s32 %f3049, %r8106;
+ mov.f32 %f3050, 0fBFC90FDA;
+ fma.rn.f32 %f3051, %f3049, %f3050, %f5415;
+ mov.f32 %f3052, 0fB3A22168;
+ fma.rn.f32 %f3053, %f3049, %f3052, %f3051;
+ mov.f32 %f3054, 0fA7C234C5;
+ fma.rn.f32 %f5458, %f3049, %f3054, %f3053;
+ abs.f32 %f623, %f5415;
+ setp.ltu.f32 %p471, %f623, 0f47CE4780;
+ @%p471 bra $L__BB0_547;
+
+ setp.eq.f32 %p472, %f623, 0f7F800000;
+ @%p472 bra $L__BB0_546;
+ bra.uni $L__BB0_541;
+
+$L__BB0_546:
+ mov.f32 %f3057, 0f00000000;
+ mul.rn.f32 %f5458, %f5415, %f3057;
+ mov.u32 %r8106, 0;
+ bra.uni $L__BB0_547;
+
+$L__BB0_541:
+ mov.b32 %r691, %f5415;
+ shr.u32 %r4100, %r691, 23;
+ and.b32 %r4101, %r4100, 255;
+ add.s32 %r692, %r4101, -128;
+ shl.b32 %r4102, %r691, 8;
+ or.b32 %r693, %r4102, -2147483648;
+ shr.u32 %r694, %r692, 5;
+ mov.u64 %rd2573, 0;
+ mov.u32 %r8103, 0;
+ mov.u64 %rd1156, __cudart_i2opi_f;
+ mov.u64 %rd2574, %rd2573;
+
+$L__BB0_542:
+ .pragma "nounroll";
+ shl.b64 %rd1155, %rd2573, 2;
+ add.s64 %rd1157, %rd1156, %rd1155;
+ ld.global.nc.u32 %r4103, [%rd1157];
+ mad.wide.u32 %rd1158, %r4103, %r693, %rd2574;
+ shr.u64 %rd2574, %rd1158, 32;
+ add.s64 %rd1159, %rd1, %rd1155;
+ st.local.u32 [%rd1159], %rd1158;
+ add.s32 %r8103, %r8103, 1;
+ cvt.s64.s32 %rd2573, %r8103;
+ setp.ne.s32 %p473, %r8103, 6;
+ @%p473 bra $L__BB0_542;
+
+ st.local.u32 [%rd4], %rd2574;
+ mov.u32 %r4104, 4;
+ sub.s32 %r697, %r4104, %r694;
+ mov.u32 %r4105, 6;
+ sub.s32 %r4106, %r4105, %r694;
+ mul.wide.s32 %rd1160, %r4106, 4;
+ add.s64 %rd1161, %rd1, %rd1160;
+ ld.local.u32 %r8104, [%rd1161];
+ ld.local.u32 %r8105, [%rd1161+-4];
+ and.b32 %r700, %r692, 31;
+ setp.eq.s32 %p474, %r700, 0;
+ @%p474 bra $L__BB0_545;
+
+ mov.u32 %r4107, 32;
+ sub.s32 %r4108, %r4107, %r700;
+ shr.u32 %r4109, %r8105, %r4108;
+ shl.b32 %r4110, %r8104, %r700;
+ add.s32 %r8104, %r4109, %r4110;
+ mul.wide.s32 %rd1162, %r697, 4;
+ add.s64 %rd1163, %rd1, %rd1162;
+ ld.local.u32 %r4111, [%rd1163];
+ shr.u32 %r4112, %r4111, %r4108;
+ shl.b32 %r4113, %r8105, %r700;
+ add.s32 %r8105, %r4112, %r4113;
+
+$L__BB0_545:
+ and.b32 %r4114, %r691, -2147483648;
+ shr.u32 %r4115, %r8105, 30;
+ shl.b32 %r4116, %r8104, 2;
+ or.b32 %r4117, %r4115, %r4116;
+ shr.u32 %r4118, %r4117, 31;
+ shr.u32 %r4119, %r8104, 30;
+ add.s32 %r4120, %r4118, %r4119;
+ neg.s32 %r4121, %r4120;
+ setp.eq.s32 %p475, %r4114, 0;
+ selp.b32 %r8106, %r4120, %r4121, %p475;
+ setp.ne.s32 %p476, %r4118, 0;
+ xor.b32 %r4122, %r4114, -2147483648;
+ selp.b32 %r4123, %r4122, %r4114, %p476;
+ selp.b32 %r4124, -1, 0, %p476;
+ xor.b32 %r4125, %r4117, %r4124;
+ shl.b32 %r4126, %r8105, 2;
+ xor.b32 %r4127, %r4126, %r4124;
+ cvt.u64.u32 %rd1164, %r4125;
+ cvt.u64.u32 %rd1165, %r4127;
+ bfi.b64 %rd1166, %rd1164, %rd1165, 32, 32;
+ cvt.rn.f64.s64 %fd69, %rd1166;
+ mul.f64 %fd70, %fd69, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3055, %fd70;
+ setp.eq.s32 %p477, %r4123, 0;
+ neg.f32 %f3056, %f3055;
+ selp.f32 %f5458, %f3055, %f3056, %p477;
+
+$L__BB0_547:
+ and.b32 %r707, %r8106, 1;
+ setp.eq.s32 %p478, %r707, 0;
+ selp.f32 %f627, %f5458, 0f3F800000, %p478;
+ mul.rn.f32 %f628, %f5458, %f5458;
+ mov.f32 %f5459, 0fB94D4153;
+ @%p478 bra $L__BB0_549;
+
+ mov.f32 %f3059, 0fBAB607ED;
+ mov.f32 %f3060, 0f37CBAC00;
+ fma.rn.f32 %f5459, %f3060, %f628, %f3059;
+
+$L__BB0_549:
+ selp.f32 %f3061, 0f3C0885E4, 0f3D2AAABB, %p478;
+ fma.rn.f32 %f3062, %f5459, %f628, %f3061;
+ selp.f32 %f3063, 0fBE2AAAA8, 0fBEFFFFFF, %p478;
+ fma.rn.f32 %f3064, %f3062, %f628, %f3063;
+ mov.f32 %f3065, 0f00000000;
+ fma.rn.f32 %f3066, %f628, %f627, %f3065;
+ fma.rn.f32 %f5281, %f3064, %f3066, %f627;
+ and.b32 %r4129, %r8106, 2;
+ setp.eq.s32 %p480, %r4129, 0;
+ @%p480 bra $L__BB0_551;
+
+ mov.f32 %f3068, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3068, %f3065;
+
+$L__BB0_551:
+ setp.lt.s32 %p9, %r11, %r689;
+ @%p470 bra $L__BB0_564;
+
+ mul.f32 %f3069, %f5606, 0f3F22F983;
+ cvt.rni.s32.f32 %r8110, %f3069;
+ cvt.rn.f32.s32 %f3070, %r8110;
+ mov.f32 %f3071, 0fBFC90FDA;
+ fma.rn.f32 %f3072, %f3070, %f3071, %f5606;
+ mov.f32 %f3073, 0fB3A22168;
+ fma.rn.f32 %f3074, %f3070, %f3073, %f3072;
+ mov.f32 %f3075, 0fA7C234C5;
+ fma.rn.f32 %f5462, %f3070, %f3075, %f3074;
+ abs.f32 %f636, %f5606;
+ setp.ltu.f32 %p482, %f636, 0f47CE4780;
+ @%p482 bra $L__BB0_560;
+
+ setp.eq.f32 %p483, %f636, 0f7F800000;
+ @%p483 bra $L__BB0_559;
+ bra.uni $L__BB0_554;
+
+$L__BB0_559:
+ mov.f32 %f3078, 0f00000000;
+ mul.rn.f32 %f5462, %f5606, %f3078;
+ mov.u32 %r8110, 0;
+ bra.uni $L__BB0_560;
+
+$L__BB0_554:
+ mov.b32 %r709, %f5606;
+ shr.u32 %r4131, %r709, 23;
+ and.b32 %r4132, %r4131, 255;
+ add.s32 %r710, %r4132, -128;
+ shl.b32 %r4133, %r709, 8;
+ or.b32 %r711, %r4133, -2147483648;
+ shr.u32 %r712, %r710, 5;
+ mov.u64 %rd2575, 0;
+ mov.u32 %r8107, 0;
+ mov.u64 %rd1170, __cudart_i2opi_f;
+ mov.u64 %rd2576, %rd2575;
+
+$L__BB0_555:
+ .pragma "nounroll";
+ shl.b64 %rd1169, %rd2575, 2;
+ add.s64 %rd1171, %rd1170, %rd1169;
+ ld.global.nc.u32 %r4134, [%rd1171];
+ mad.wide.u32 %rd1172, %r4134, %r711, %rd2576;
+ shr.u64 %rd2576, %rd1172, 32;
+ add.s64 %rd1173, %rd1, %rd1169;
+ st.local.u32 [%rd1173], %rd1172;
+ add.s32 %r8107, %r8107, 1;
+ cvt.s64.s32 %rd2575, %r8107;
+ setp.ne.s32 %p484, %r8107, 6;
+ @%p484 bra $L__BB0_555;
+
+ st.local.u32 [%rd4], %rd2576;
+ mov.u32 %r4135, 4;
+ sub.s32 %r715, %r4135, %r712;
+ mov.u32 %r4136, 6;
+ sub.s32 %r4137, %r4136, %r712;
+ mul.wide.s32 %rd1174, %r4137, 4;
+ add.s64 %rd1175, %rd1, %rd1174;
+ ld.local.u32 %r8108, [%rd1175];
+ ld.local.u32 %r8109, [%rd1175+-4];
+ and.b32 %r718, %r710, 31;
+ setp.eq.s32 %p485, %r718, 0;
+ @%p485 bra $L__BB0_558;
+
+ mov.u32 %r4138, 32;
+ sub.s32 %r4139, %r4138, %r718;
+ shr.u32 %r4140, %r8109, %r4139;
+ shl.b32 %r4141, %r8108, %r718;
+ add.s32 %r8108, %r4140, %r4141;
+ mul.wide.s32 %rd1176, %r715, 4;
+ add.s64 %rd1177, %rd1, %rd1176;
+ ld.local.u32 %r4142, [%rd1177];
+ shr.u32 %r4143, %r4142, %r4139;
+ shl.b32 %r4144, %r8109, %r718;
+ add.s32 %r8109, %r4143, %r4144;
+
+$L__BB0_558:
+ and.b32 %r4145, %r709, -2147483648;
+ shr.u32 %r4146, %r8109, 30;
+ shl.b32 %r4147, %r8108, 2;
+ or.b32 %r4148, %r4146, %r4147;
+ shr.u32 %r4149, %r4148, 31;
+ shr.u32 %r4150, %r8108, 30;
+ add.s32 %r4151, %r4149, %r4150;
+ neg.s32 %r4152, %r4151;
+ setp.eq.s32 %p486, %r4145, 0;
+ selp.b32 %r8110, %r4151, %r4152, %p486;
+ setp.ne.s32 %p487, %r4149, 0;
+ xor.b32 %r4153, %r4145, -2147483648;
+ selp.b32 %r4154, %r4153, %r4145, %p487;
+ selp.b32 %r4155, -1, 0, %p487;
+ xor.b32 %r4156, %r4148, %r4155;
+ shl.b32 %r4157, %r8109, 2;
+ xor.b32 %r4158, %r4157, %r4155;
+ cvt.u64.u32 %rd1178, %r4156;
+ cvt.u64.u32 %rd1179, %r4158;
+ bfi.b64 %rd1180, %rd1178, %rd1179, 32, 32;
+ cvt.rn.f64.s64 %fd71, %rd1180;
+ mul.f64 %fd72, %fd71, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3076, %fd72;
+ setp.eq.s32 %p488, %r4154, 0;
+ neg.f32 %f3077, %f3076;
+ selp.f32 %f5462, %f3076, %f3077, %p488;
+
+$L__BB0_560:
+ add.s32 %r725, %r8110, 1;
+ and.b32 %r726, %r725, 1;
+ setp.eq.s32 %p489, %r726, 0;
+ selp.f32 %f640, %f5462, 0f3F800000, %p489;
+ mul.rn.f32 %f641, %f5462, %f5462;
+ mov.f32 %f5463, 0fB94D4153;
+ @%p489 bra $L__BB0_562;
+
+ mov.f32 %f3080, 0fBAB607ED;
+ mov.f32 %f3081, 0f37CBAC00;
+ fma.rn.f32 %f5463, %f3081, %f641, %f3080;
+
+$L__BB0_562:
+ selp.f32 %f3082, 0f3C0885E4, 0f3D2AAABB, %p489;
+ fma.rn.f32 %f3083, %f5463, %f641, %f3082;
+ selp.f32 %f3084, 0fBE2AAAA8, 0fBEFFFFFF, %p489;
+ fma.rn.f32 %f3085, %f3083, %f641, %f3084;
+ mov.f32 %f3086, 0f00000000;
+ fma.rn.f32 %f3087, %f641, %f640, %f3086;
+ fma.rn.f32 %f5283, %f3085, %f3087, %f640;
+ and.b32 %r4160, %r725, 2;
+ setp.eq.s32 %p491, %r4160, 0;
+ @%p491 bra $L__BB0_564;
+
+ mov.f32 %f3089, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3089, %f3086;
+
+$L__BB0_564:
+ selp.f32 %f648, %f5283, %f5284, %p9;
+ selp.f32 %f649, %f5281, %f5282, %p9;
+ @%p470 bra $L__BB0_566;
+
+ add.f32 %f5598, %f649, %f648;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_566:
+ @%p423 bra $L__BB0_595;
+
+ shl.b32 %r4161, %r12, 5;
+ neg.s32 %r727, %r4161;
+ setp.ge.s32 %p495, %r11, %r727;
+ @%p495 bra $L__BB0_580;
+
+ mul.f32 %f3092, %f5414, 0f3F22F983;
+ cvt.rni.s32.f32 %r8114, %f3092;
+ cvt.rn.f32.s32 %f3093, %r8114;
+ mov.f32 %f3094, 0fBFC90FDA;
+ fma.rn.f32 %f3095, %f3093, %f3094, %f5414;
+ mov.f32 %f3096, 0fB3A22168;
+ fma.rn.f32 %f3097, %f3093, %f3096, %f3095;
+ mov.f32 %f3098, 0fA7C234C5;
+ fma.rn.f32 %f5471, %f3093, %f3098, %f3097;
+ abs.f32 %f657, %f5414;
+ setp.ltu.f32 %p496, %f657, 0f47CE4780;
+ @%p496 bra $L__BB0_576;
+
+ setp.eq.f32 %p497, %f657, 0f7F800000;
+ @%p497 bra $L__BB0_575;
+ bra.uni $L__BB0_570;
+
+$L__BB0_575:
+ mov.f32 %f3101, 0f00000000;
+ mul.rn.f32 %f5471, %f5414, %f3101;
+ mov.u32 %r8114, 0;
+ bra.uni $L__BB0_576;
+
+$L__BB0_570:
+ mov.b32 %r729, %f5414;
+ shr.u32 %r4163, %r729, 23;
+ and.b32 %r4164, %r4163, 255;
+ add.s32 %r730, %r4164, -128;
+ shl.b32 %r4165, %r729, 8;
+ or.b32 %r731, %r4165, -2147483648;
+ shr.u32 %r732, %r730, 5;
+ mov.u64 %rd2577, 0;
+ mov.u32 %r8111, 0;
+ mov.u64 %rd1184, __cudart_i2opi_f;
+ mov.u64 %rd2578, %rd2577;
+
+$L__BB0_571:
+ .pragma "nounroll";
+ shl.b64 %rd1183, %rd2577, 2;
+ add.s64 %rd1185, %rd1184, %rd1183;
+ ld.global.nc.u32 %r4166, [%rd1185];
+ mad.wide.u32 %rd1186, %r4166, %r731, %rd2578;
+ shr.u64 %rd2578, %rd1186, 32;
+ add.s64 %rd1187, %rd1, %rd1183;
+ st.local.u32 [%rd1187], %rd1186;
+ add.s32 %r8111, %r8111, 1;
+ cvt.s64.s32 %rd2577, %r8111;
+ setp.ne.s32 %p498, %r8111, 6;
+ @%p498 bra $L__BB0_571;
+
+ st.local.u32 [%rd4], %rd2578;
+ mov.u32 %r4167, 4;
+ sub.s32 %r735, %r4167, %r732;
+ mov.u32 %r4168, 6;
+ sub.s32 %r4169, %r4168, %r732;
+ mul.wide.s32 %rd1188, %r4169, 4;
+ add.s64 %rd1189, %rd1, %rd1188;
+ ld.local.u32 %r8112, [%rd1189];
+ ld.local.u32 %r8113, [%rd1189+-4];
+ and.b32 %r738, %r730, 31;
+ setp.eq.s32 %p499, %r738, 0;
+ @%p499 bra $L__BB0_574;
+
+ mov.u32 %r4170, 32;
+ sub.s32 %r4171, %r4170, %r738;
+ shr.u32 %r4172, %r8113, %r4171;
+ shl.b32 %r4173, %r8112, %r738;
+ add.s32 %r8112, %r4172, %r4173;
+ mul.wide.s32 %rd1190, %r735, 4;
+ add.s64 %rd1191, %rd1, %rd1190;
+ ld.local.u32 %r4174, [%rd1191];
+ shr.u32 %r4175, %r4174, %r4171;
+ shl.b32 %r4176, %r8113, %r738;
+ add.s32 %r8113, %r4175, %r4176;
+
+$L__BB0_574:
+ and.b32 %r4177, %r729, -2147483648;
+ shr.u32 %r4178, %r8113, 30;
+ shl.b32 %r4179, %r8112, 2;
+ or.b32 %r4180, %r4178, %r4179;
+ shr.u32 %r4181, %r4180, 31;
+ shr.u32 %r4182, %r8112, 30;
+ add.s32 %r4183, %r4181, %r4182;
+ neg.s32 %r4184, %r4183;
+ setp.eq.s32 %p500, %r4177, 0;
+ selp.b32 %r8114, %r4183, %r4184, %p500;
+ setp.ne.s32 %p501, %r4181, 0;
+ xor.b32 %r4185, %r4177, -2147483648;
+ selp.b32 %r4186, %r4185, %r4177, %p501;
+ selp.b32 %r4187, -1, 0, %p501;
+ xor.b32 %r4188, %r4180, %r4187;
+ shl.b32 %r4189, %r8113, 2;
+ xor.b32 %r4190, %r4189, %r4187;
+ cvt.u64.u32 %rd1192, %r4188;
+ cvt.u64.u32 %rd1193, %r4190;
+ bfi.b64 %rd1194, %rd1192, %rd1193, 32, 32;
+ cvt.rn.f64.s64 %fd73, %rd1194;
+ mul.f64 %fd74, %fd73, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3099, %fd74;
+ setp.eq.s32 %p502, %r4186, 0;
+ neg.f32 %f3100, %f3099;
+ selp.f32 %f5471, %f3099, %f3100, %p502;
+
+$L__BB0_576:
+ and.b32 %r745, %r8114, 1;
+ setp.eq.s32 %p503, %r745, 0;
+ selp.f32 %f661, %f5471, 0f3F800000, %p503;
+ mul.rn.f32 %f662, %f5471, %f5471;
+ mov.f32 %f5472, 0fB94D4153;
+ @%p503 bra $L__BB0_578;
+
+ mov.f32 %f3103, 0fBAB607ED;
+ mov.f32 %f3104, 0f37CBAC00;
+ fma.rn.f32 %f5472, %f3104, %f662, %f3103;
+
+$L__BB0_578:
+ selp.f32 %f3105, 0f3C0885E4, 0f3D2AAABB, %p503;
+ fma.rn.f32 %f3106, %f5472, %f662, %f3105;
+ selp.f32 %f3107, 0fBE2AAAA8, 0fBEFFFFFF, %p503;
+ fma.rn.f32 %f3108, %f3106, %f662, %f3107;
+ mov.f32 %f3109, 0f00000000;
+ fma.rn.f32 %f3110, %f662, %f661, %f3109;
+ fma.rn.f32 %f5281, %f3108, %f3110, %f661;
+ and.b32 %r4192, %r8114, 2;
+ setp.eq.s32 %p505, %r4192, 0;
+ @%p505 bra $L__BB0_580;
+
+ mov.f32 %f3112, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3112, %f3109;
+
+$L__BB0_580:
+ setp.lt.s32 %p10, %r11, %r727;
+ @%p495 bra $L__BB0_593;
+
+ mul.f32 %f3113, %f5406, 0f3F22F983;
+ cvt.rni.s32.f32 %r8118, %f3113;
+ cvt.rn.f32.s32 %f3114, %r8118;
+ mov.f32 %f3115, 0fBFC90FDA;
+ fma.rn.f32 %f3116, %f3114, %f3115, %f5406;
+ mov.f32 %f3117, 0fB3A22168;
+ fma.rn.f32 %f3118, %f3114, %f3117, %f3116;
+ mov.f32 %f3119, 0fA7C234C5;
+ fma.rn.f32 %f5475, %f3114, %f3119, %f3118;
+ abs.f32 %f670, %f5406;
+ setp.ltu.f32 %p507, %f670, 0f47CE4780;
+ @%p507 bra $L__BB0_589;
+
+ setp.eq.f32 %p508, %f670, 0f7F800000;
+ @%p508 bra $L__BB0_588;
+ bra.uni $L__BB0_583;
+
+$L__BB0_588:
+ mov.f32 %f3122, 0f00000000;
+ mul.rn.f32 %f5475, %f5406, %f3122;
+ mov.u32 %r8118, 0;
+ bra.uni $L__BB0_589;
+
+$L__BB0_583:
+ mov.b32 %r747, %f5406;
+ shr.u32 %r4194, %r747, 23;
+ and.b32 %r4195, %r4194, 255;
+ add.s32 %r748, %r4195, -128;
+ shl.b32 %r4196, %r747, 8;
+ or.b32 %r749, %r4196, -2147483648;
+ shr.u32 %r750, %r748, 5;
+ mov.u64 %rd2579, 0;
+ mov.u32 %r8115, 0;
+ mov.u64 %rd1198, __cudart_i2opi_f;
+ mov.u64 %rd2580, %rd2579;
+
+$L__BB0_584:
+ .pragma "nounroll";
+ shl.b64 %rd1197, %rd2579, 2;
+ add.s64 %rd1199, %rd1198, %rd1197;
+ ld.global.nc.u32 %r4197, [%rd1199];
+ mad.wide.u32 %rd1200, %r4197, %r749, %rd2580;
+ shr.u64 %rd2580, %rd1200, 32;
+ add.s64 %rd1201, %rd1, %rd1197;
+ st.local.u32 [%rd1201], %rd1200;
+ add.s32 %r8115, %r8115, 1;
+ cvt.s64.s32 %rd2579, %r8115;
+ setp.ne.s32 %p509, %r8115, 6;
+ @%p509 bra $L__BB0_584;
+
+ st.local.u32 [%rd4], %rd2580;
+ mov.u32 %r4198, 4;
+ sub.s32 %r753, %r4198, %r750;
+ mov.u32 %r4199, 6;
+ sub.s32 %r4200, %r4199, %r750;
+ mul.wide.s32 %rd1202, %r4200, 4;
+ add.s64 %rd1203, %rd1, %rd1202;
+ ld.local.u32 %r8116, [%rd1203];
+ ld.local.u32 %r8117, [%rd1203+-4];
+ and.b32 %r756, %r748, 31;
+ setp.eq.s32 %p510, %r756, 0;
+ @%p510 bra $L__BB0_587;
+
+ mov.u32 %r4201, 32;
+ sub.s32 %r4202, %r4201, %r756;
+ shr.u32 %r4203, %r8117, %r4202;
+ shl.b32 %r4204, %r8116, %r756;
+ add.s32 %r8116, %r4203, %r4204;
+ mul.wide.s32 %rd1204, %r753, 4;
+ add.s64 %rd1205, %rd1, %rd1204;
+ ld.local.u32 %r4205, [%rd1205];
+ shr.u32 %r4206, %r4205, %r4202;
+ shl.b32 %r4207, %r8117, %r756;
+ add.s32 %r8117, %r4206, %r4207;
+
+$L__BB0_587:
+ and.b32 %r4208, %r747, -2147483648;
+ shr.u32 %r4209, %r8117, 30;
+ shl.b32 %r4210, %r8116, 2;
+ or.b32 %r4211, %r4209, %r4210;
+ shr.u32 %r4212, %r4211, 31;
+ shr.u32 %r4213, %r8116, 30;
+ add.s32 %r4214, %r4212, %r4213;
+ neg.s32 %r4215, %r4214;
+ setp.eq.s32 %p511, %r4208, 0;
+ selp.b32 %r8118, %r4214, %r4215, %p511;
+ setp.ne.s32 %p512, %r4212, 0;
+ xor.b32 %r4216, %r4208, -2147483648;
+ selp.b32 %r4217, %r4216, %r4208, %p512;
+ selp.b32 %r4218, -1, 0, %p512;
+ xor.b32 %r4219, %r4211, %r4218;
+ shl.b32 %r4220, %r8117, 2;
+ xor.b32 %r4221, %r4220, %r4218;
+ cvt.u64.u32 %rd1206, %r4219;
+ cvt.u64.u32 %rd1207, %r4221;
+ bfi.b64 %rd1208, %rd1206, %rd1207, 32, 32;
+ cvt.rn.f64.s64 %fd75, %rd1208;
+ mul.f64 %fd76, %fd75, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3120, %fd76;
+ setp.eq.s32 %p513, %r4217, 0;
+ neg.f32 %f3121, %f3120;
+ selp.f32 %f5475, %f3120, %f3121, %p513;
+
+$L__BB0_589:
+ add.s32 %r763, %r8118, 1;
+ and.b32 %r764, %r763, 1;
+ setp.eq.s32 %p514, %r764, 0;
+ selp.f32 %f674, %f5475, 0f3F800000, %p514;
+ mul.rn.f32 %f675, %f5475, %f5475;
+ mov.f32 %f5476, 0fB94D4153;
+ @%p514 bra $L__BB0_591;
+
+ mov.f32 %f3124, 0fBAB607ED;
+ mov.f32 %f3125, 0f37CBAC00;
+ fma.rn.f32 %f5476, %f3125, %f675, %f3124;
+
+$L__BB0_591:
+ selp.f32 %f3126, 0f3C0885E4, 0f3D2AAABB, %p514;
+ fma.rn.f32 %f3127, %f5476, %f675, %f3126;
+ selp.f32 %f3128, 0fBE2AAAA8, 0fBEFFFFFF, %p514;
+ fma.rn.f32 %f3129, %f3127, %f675, %f3128;
+ mov.f32 %f3130, 0f00000000;
+ fma.rn.f32 %f3131, %f675, %f674, %f3130;
+ fma.rn.f32 %f5283, %f3129, %f3131, %f674;
+ and.b32 %r4223, %r763, 2;
+ setp.eq.s32 %p516, %r4223, 0;
+ @%p516 bra $L__BB0_593;
+
+ mov.f32 %f3133, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3133, %f3130;
+
+$L__BB0_593:
+ selp.f32 %f682, %f5283, %f5284, %p10;
+ selp.f32 %f683, %f5281, %f5282, %p10;
+ @%p495 bra $L__BB0_595;
+
+ add.f32 %f5597, %f683, %f682;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_595:
+ @%p423 bra $L__BB0_624;
+
+ shl.b32 %r4224, %r12, 5;
+ mov.u32 %r4225, -32;
+ sub.s32 %r765, %r4225, %r4224;
+ setp.ge.s32 %p520, %r11, %r765;
+ @%p520 bra $L__BB0_609;
+
+ mul.f32 %f3136, %f5413, 0f3F22F983;
+ cvt.rni.s32.f32 %r8122, %f3136;
+ cvt.rn.f32.s32 %f3137, %r8122;
+ mov.f32 %f3138, 0fBFC90FDA;
+ fma.rn.f32 %f3139, %f3137, %f3138, %f5413;
+ mov.f32 %f3140, 0fB3A22168;
+ fma.rn.f32 %f3141, %f3137, %f3140, %f3139;
+ mov.f32 %f3142, 0fA7C234C5;
+ fma.rn.f32 %f5484, %f3137, %f3142, %f3141;
+ abs.f32 %f691, %f5413;
+ setp.ltu.f32 %p521, %f691, 0f47CE4780;
+ @%p521 bra $L__BB0_605;
+
+ setp.eq.f32 %p522, %f691, 0f7F800000;
+ @%p522 bra $L__BB0_604;
+ bra.uni $L__BB0_599;
+
+$L__BB0_604:
+ mov.f32 %f3145, 0f00000000;
+ mul.rn.f32 %f5484, %f5413, %f3145;
+ mov.u32 %r8122, 0;
+ bra.uni $L__BB0_605;
+
+$L__BB0_599:
+ mov.b32 %r767, %f5413;
+ shr.u32 %r4227, %r767, 23;
+ and.b32 %r4228, %r4227, 255;
+ add.s32 %r768, %r4228, -128;
+ shl.b32 %r4229, %r767, 8;
+ or.b32 %r769, %r4229, -2147483648;
+ shr.u32 %r770, %r768, 5;
+ mov.u64 %rd2581, 0;
+ mov.u32 %r8119, 0;
+ mov.u64 %rd1212, __cudart_i2opi_f;
+ mov.u64 %rd2582, %rd2581;
+
+$L__BB0_600:
+ .pragma "nounroll";
+ shl.b64 %rd1211, %rd2581, 2;
+ add.s64 %rd1213, %rd1212, %rd1211;
+ ld.global.nc.u32 %r4230, [%rd1213];
+ mad.wide.u32 %rd1214, %r4230, %r769, %rd2582;
+ shr.u64 %rd2582, %rd1214, 32;
+ add.s64 %rd1215, %rd1, %rd1211;
+ st.local.u32 [%rd1215], %rd1214;
+ add.s32 %r8119, %r8119, 1;
+ cvt.s64.s32 %rd2581, %r8119;
+ setp.ne.s32 %p523, %r8119, 6;
+ @%p523 bra $L__BB0_600;
+
+ st.local.u32 [%rd4], %rd2582;
+ mov.u32 %r4231, 4;
+ sub.s32 %r773, %r4231, %r770;
+ mov.u32 %r4232, 6;
+ sub.s32 %r4233, %r4232, %r770;
+ mul.wide.s32 %rd1216, %r4233, 4;
+ add.s64 %rd1217, %rd1, %rd1216;
+ ld.local.u32 %r8120, [%rd1217];
+ ld.local.u32 %r8121, [%rd1217+-4];
+ and.b32 %r776, %r768, 31;
+ setp.eq.s32 %p524, %r776, 0;
+ @%p524 bra $L__BB0_603;
+
+ mov.u32 %r4234, 32;
+ sub.s32 %r4235, %r4234, %r776;
+ shr.u32 %r4236, %r8121, %r4235;
+ shl.b32 %r4237, %r8120, %r776;
+ add.s32 %r8120, %r4236, %r4237;
+ mul.wide.s32 %rd1218, %r773, 4;
+ add.s64 %rd1219, %rd1, %rd1218;
+ ld.local.u32 %r4238, [%rd1219];
+ shr.u32 %r4239, %r4238, %r4235;
+ shl.b32 %r4240, %r8121, %r776;
+ add.s32 %r8121, %r4239, %r4240;
+
+$L__BB0_603:
+ and.b32 %r4241, %r767, -2147483648;
+ shr.u32 %r4242, %r8121, 30;
+ shl.b32 %r4243, %r8120, 2;
+ or.b32 %r4244, %r4242, %r4243;
+ shr.u32 %r4245, %r4244, 31;
+ shr.u32 %r4246, %r8120, 30;
+ add.s32 %r4247, %r4245, %r4246;
+ neg.s32 %r4248, %r4247;
+ setp.eq.s32 %p525, %r4241, 0;
+ selp.b32 %r8122, %r4247, %r4248, %p525;
+ setp.ne.s32 %p526, %r4245, 0;
+ xor.b32 %r4249, %r4241, -2147483648;
+ selp.b32 %r4250, %r4249, %r4241, %p526;
+ selp.b32 %r4251, -1, 0, %p526;
+ xor.b32 %r4252, %r4244, %r4251;
+ shl.b32 %r4253, %r8121, 2;
+ xor.b32 %r4254, %r4253, %r4251;
+ cvt.u64.u32 %rd1220, %r4252;
+ cvt.u64.u32 %rd1221, %r4254;
+ bfi.b64 %rd1222, %rd1220, %rd1221, 32, 32;
+ cvt.rn.f64.s64 %fd77, %rd1222;
+ mul.f64 %fd78, %fd77, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3143, %fd78;
+ setp.eq.s32 %p527, %r4250, 0;
+ neg.f32 %f3144, %f3143;
+ selp.f32 %f5484, %f3143, %f3144, %p527;
+
+$L__BB0_605:
+ and.b32 %r783, %r8122, 1;
+ setp.eq.s32 %p528, %r783, 0;
+ selp.f32 %f695, %f5484, 0f3F800000, %p528;
+ mul.rn.f32 %f696, %f5484, %f5484;
+ mov.f32 %f5485, 0fB94D4153;
+ @%p528 bra $L__BB0_607;
+
+ mov.f32 %f3147, 0fBAB607ED;
+ mov.f32 %f3148, 0f37CBAC00;
+ fma.rn.f32 %f5485, %f3148, %f696, %f3147;
+
+$L__BB0_607:
+ selp.f32 %f3149, 0f3C0885E4, 0f3D2AAABB, %p528;
+ fma.rn.f32 %f3150, %f5485, %f696, %f3149;
+ selp.f32 %f3151, 0fBE2AAAA8, 0fBEFFFFFF, %p528;
+ fma.rn.f32 %f3152, %f3150, %f696, %f3151;
+ mov.f32 %f3153, 0f00000000;
+ fma.rn.f32 %f3154, %f696, %f695, %f3153;
+ fma.rn.f32 %f5281, %f3152, %f3154, %f695;
+ and.b32 %r4256, %r8122, 2;
+ setp.eq.s32 %p530, %r4256, 0;
+ @%p530 bra $L__BB0_609;
+
+ mov.f32 %f3156, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3156, %f3153;
+
+$L__BB0_609:
+ setp.lt.s32 %p11, %r11, %r765;
+ @%p520 bra $L__BB0_622;
+
+ mul.f32 %f3157, %f5405, 0f3F22F983;
+ cvt.rni.s32.f32 %r8126, %f3157;
+ cvt.rn.f32.s32 %f3158, %r8126;
+ mov.f32 %f3159, 0fBFC90FDA;
+ fma.rn.f32 %f3160, %f3158, %f3159, %f5405;
+ mov.f32 %f3161, 0fB3A22168;
+ fma.rn.f32 %f3162, %f3158, %f3161, %f3160;
+ mov.f32 %f3163, 0fA7C234C5;
+ fma.rn.f32 %f5488, %f3158, %f3163, %f3162;
+ abs.f32 %f704, %f5405;
+ setp.ltu.f32 %p532, %f704, 0f47CE4780;
+ @%p532 bra $L__BB0_618;
+
+ setp.eq.f32 %p533, %f704, 0f7F800000;
+ @%p533 bra $L__BB0_617;
+ bra.uni $L__BB0_612;
+
+$L__BB0_617:
+ mov.f32 %f3166, 0f00000000;
+ mul.rn.f32 %f5488, %f5405, %f3166;
+ mov.u32 %r8126, 0;
+ bra.uni $L__BB0_618;
+
+$L__BB0_612:
+ mov.b32 %r785, %f5405;
+ shr.u32 %r4258, %r785, 23;
+ and.b32 %r4259, %r4258, 255;
+ add.s32 %r786, %r4259, -128;
+ shl.b32 %r4260, %r785, 8;
+ or.b32 %r787, %r4260, -2147483648;
+ shr.u32 %r788, %r786, 5;
+ mov.u64 %rd2583, 0;
+ mov.u32 %r8123, 0;
+ mov.u64 %rd1226, __cudart_i2opi_f;
+ mov.u64 %rd2584, %rd2583;
+
+$L__BB0_613:
+ .pragma "nounroll";
+ shl.b64 %rd1225, %rd2583, 2;
+ add.s64 %rd1227, %rd1226, %rd1225;
+ ld.global.nc.u32 %r4261, [%rd1227];
+ mad.wide.u32 %rd1228, %r4261, %r787, %rd2584;
+ shr.u64 %rd2584, %rd1228, 32;
+ add.s64 %rd1229, %rd1, %rd1225;
+ st.local.u32 [%rd1229], %rd1228;
+ add.s32 %r8123, %r8123, 1;
+ cvt.s64.s32 %rd2583, %r8123;
+ setp.ne.s32 %p534, %r8123, 6;
+ @%p534 bra $L__BB0_613;
+
+ st.local.u32 [%rd4], %rd2584;
+ mov.u32 %r4262, 4;
+ sub.s32 %r791, %r4262, %r788;
+ mov.u32 %r4263, 6;
+ sub.s32 %r4264, %r4263, %r788;
+ mul.wide.s32 %rd1230, %r4264, 4;
+ add.s64 %rd1231, %rd1, %rd1230;
+ ld.local.u32 %r8124, [%rd1231];
+ ld.local.u32 %r8125, [%rd1231+-4];
+ and.b32 %r794, %r786, 31;
+ setp.eq.s32 %p535, %r794, 0;
+ @%p535 bra $L__BB0_616;
+
+ mov.u32 %r4265, 32;
+ sub.s32 %r4266, %r4265, %r794;
+ shr.u32 %r4267, %r8125, %r4266;
+ shl.b32 %r4268, %r8124, %r794;
+ add.s32 %r8124, %r4267, %r4268;
+ mul.wide.s32 %rd1232, %r791, 4;
+ add.s64 %rd1233, %rd1, %rd1232;
+ ld.local.u32 %r4269, [%rd1233];
+ shr.u32 %r4270, %r4269, %r4266;
+ shl.b32 %r4271, %r8125, %r794;
+ add.s32 %r8125, %r4270, %r4271;
+
+$L__BB0_616:
+ and.b32 %r4272, %r785, -2147483648;
+ shr.u32 %r4273, %r8125, 30;
+ shl.b32 %r4274, %r8124, 2;
+ or.b32 %r4275, %r4273, %r4274;
+ shr.u32 %r4276, %r4275, 31;
+ shr.u32 %r4277, %r8124, 30;
+ add.s32 %r4278, %r4276, %r4277;
+ neg.s32 %r4279, %r4278;
+ setp.eq.s32 %p536, %r4272, 0;
+ selp.b32 %r8126, %r4278, %r4279, %p536;
+ setp.ne.s32 %p537, %r4276, 0;
+ xor.b32 %r4280, %r4272, -2147483648;
+ selp.b32 %r4281, %r4280, %r4272, %p537;
+ selp.b32 %r4282, -1, 0, %p537;
+ xor.b32 %r4283, %r4275, %r4282;
+ shl.b32 %r4284, %r8125, 2;
+ xor.b32 %r4285, %r4284, %r4282;
+ cvt.u64.u32 %rd1234, %r4283;
+ cvt.u64.u32 %rd1235, %r4285;
+ bfi.b64 %rd1236, %rd1234, %rd1235, 32, 32;
+ cvt.rn.f64.s64 %fd79, %rd1236;
+ mul.f64 %fd80, %fd79, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3164, %fd80;
+ setp.eq.s32 %p538, %r4281, 0;
+ neg.f32 %f3165, %f3164;
+ selp.f32 %f5488, %f3164, %f3165, %p538;
+
+$L__BB0_618:
+ add.s32 %r801, %r8126, 1;
+ and.b32 %r802, %r801, 1;
+ setp.eq.s32 %p539, %r802, 0;
+ selp.f32 %f708, %f5488, 0f3F800000, %p539;
+ mul.rn.f32 %f709, %f5488, %f5488;
+ mov.f32 %f5489, 0fB94D4153;
+ @%p539 bra $L__BB0_620;
+
+ mov.f32 %f3168, 0fBAB607ED;
+ mov.f32 %f3169, 0f37CBAC00;
+ fma.rn.f32 %f5489, %f3169, %f709, %f3168;
+
+$L__BB0_620:
+ selp.f32 %f3170, 0f3C0885E4, 0f3D2AAABB, %p539;
+ fma.rn.f32 %f3171, %f5489, %f709, %f3170;
+ selp.f32 %f3172, 0fBE2AAAA8, 0fBEFFFFFF, %p539;
+ fma.rn.f32 %f3173, %f3171, %f709, %f3172;
+ mov.f32 %f3174, 0f00000000;
+ fma.rn.f32 %f3175, %f709, %f708, %f3174;
+ fma.rn.f32 %f5283, %f3173, %f3175, %f708;
+ and.b32 %r4287, %r801, 2;
+ setp.eq.s32 %p541, %r4287, 0;
+ @%p541 bra $L__BB0_622;
+
+ mov.f32 %f3177, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3177, %f3174;
+
+$L__BB0_622:
+ selp.f32 %f716, %f5283, %f5284, %p11;
+ selp.f32 %f717, %f5281, %f5282, %p11;
+ @%p520 bra $L__BB0_624;
+
+ add.f32 %f5596, %f717, %f716;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_624:
+ @%p426 bra $L__BB0_653;
+
+ shl.b32 %r4288, %r12, 5;
+ neg.s32 %r803, %r4288;
+ setp.ge.s32 %p545, %r11, %r803;
+ @%p545 bra $L__BB0_638;
+
+ mul.f32 %f3180, %f5412, 0f3F22F983;
+ cvt.rni.s32.f32 %r8130, %f3180;
+ cvt.rn.f32.s32 %f3181, %r8130;
+ mov.f32 %f3182, 0fBFC90FDA;
+ fma.rn.f32 %f3183, %f3181, %f3182, %f5412;
+ mov.f32 %f3184, 0fB3A22168;
+ fma.rn.f32 %f3185, %f3181, %f3184, %f3183;
+ mov.f32 %f3186, 0fA7C234C5;
+ fma.rn.f32 %f5497, %f3181, %f3186, %f3185;
+ abs.f32 %f725, %f5412;
+ setp.ltu.f32 %p546, %f725, 0f47CE4780;
+ @%p546 bra $L__BB0_634;
+
+ setp.eq.f32 %p547, %f725, 0f7F800000;
+ @%p547 bra $L__BB0_633;
+ bra.uni $L__BB0_628;
+
+$L__BB0_633:
+ mov.f32 %f3189, 0f00000000;
+ mul.rn.f32 %f5497, %f5412, %f3189;
+ mov.u32 %r8130, 0;
+ bra.uni $L__BB0_634;
+
+$L__BB0_628:
+ mov.b32 %r805, %f5412;
+ shr.u32 %r4290, %r805, 23;
+ and.b32 %r4291, %r4290, 255;
+ add.s32 %r806, %r4291, -128;
+ shl.b32 %r4292, %r805, 8;
+ or.b32 %r807, %r4292, -2147483648;
+ shr.u32 %r808, %r806, 5;
+ mov.u64 %rd2585, 0;
+ mov.u32 %r8127, 0;
+ mov.u64 %rd1240, __cudart_i2opi_f;
+ mov.u64 %rd2586, %rd2585;
+
+$L__BB0_629:
+ .pragma "nounroll";
+ shl.b64 %rd1239, %rd2585, 2;
+ add.s64 %rd1241, %rd1240, %rd1239;
+ ld.global.nc.u32 %r4293, [%rd1241];
+ mad.wide.u32 %rd1242, %r4293, %r807, %rd2586;
+ shr.u64 %rd2586, %rd1242, 32;
+ add.s64 %rd1243, %rd1, %rd1239;
+ st.local.u32 [%rd1243], %rd1242;
+ add.s32 %r8127, %r8127, 1;
+ cvt.s64.s32 %rd2585, %r8127;
+ setp.ne.s32 %p548, %r8127, 6;
+ @%p548 bra $L__BB0_629;
+
+ st.local.u32 [%rd4], %rd2586;
+ mov.u32 %r4294, 4;
+ sub.s32 %r811, %r4294, %r808;
+ mov.u32 %r4295, 6;
+ sub.s32 %r4296, %r4295, %r808;
+ mul.wide.s32 %rd1244, %r4296, 4;
+ add.s64 %rd1245, %rd1, %rd1244;
+ ld.local.u32 %r8128, [%rd1245];
+ ld.local.u32 %r8129, [%rd1245+-4];
+ and.b32 %r814, %r806, 31;
+ setp.eq.s32 %p549, %r814, 0;
+ @%p549 bra $L__BB0_632;
+
+ mov.u32 %r4297, 32;
+ sub.s32 %r4298, %r4297, %r814;
+ shr.u32 %r4299, %r8129, %r4298;
+ shl.b32 %r4300, %r8128, %r814;
+ add.s32 %r8128, %r4299, %r4300;
+ mul.wide.s32 %rd1246, %r811, 4;
+ add.s64 %rd1247, %rd1, %rd1246;
+ ld.local.u32 %r4301, [%rd1247];
+ shr.u32 %r4302, %r4301, %r4298;
+ shl.b32 %r4303, %r8129, %r814;
+ add.s32 %r8129, %r4302, %r4303;
+
+$L__BB0_632:
+ and.b32 %r4304, %r805, -2147483648;
+ shr.u32 %r4305, %r8129, 30;
+ shl.b32 %r4306, %r8128, 2;
+ or.b32 %r4307, %r4305, %r4306;
+ shr.u32 %r4308, %r4307, 31;
+ shr.u32 %r4309, %r8128, 30;
+ add.s32 %r4310, %r4308, %r4309;
+ neg.s32 %r4311, %r4310;
+ setp.eq.s32 %p550, %r4304, 0;
+ selp.b32 %r8130, %r4310, %r4311, %p550;
+ setp.ne.s32 %p551, %r4308, 0;
+ xor.b32 %r4312, %r4304, -2147483648;
+ selp.b32 %r4313, %r4312, %r4304, %p551;
+ selp.b32 %r4314, -1, 0, %p551;
+ xor.b32 %r4315, %r4307, %r4314;
+ shl.b32 %r4316, %r8129, 2;
+ xor.b32 %r4317, %r4316, %r4314;
+ cvt.u64.u32 %rd1248, %r4315;
+ cvt.u64.u32 %rd1249, %r4317;
+ bfi.b64 %rd1250, %rd1248, %rd1249, 32, 32;
+ cvt.rn.f64.s64 %fd81, %rd1250;
+ mul.f64 %fd82, %fd81, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3187, %fd82;
+ setp.eq.s32 %p552, %r4313, 0;
+ neg.f32 %f3188, %f3187;
+ selp.f32 %f5497, %f3187, %f3188, %p552;
+
+$L__BB0_634:
+ and.b32 %r821, %r8130, 1;
+ setp.eq.s32 %p553, %r821, 0;
+ selp.f32 %f729, %f5497, 0f3F800000, %p553;
+ mul.rn.f32 %f730, %f5497, %f5497;
+ mov.f32 %f5498, 0fB94D4153;
+ @%p553 bra $L__BB0_636;
+
+ mov.f32 %f3191, 0fBAB607ED;
+ mov.f32 %f3192, 0f37CBAC00;
+ fma.rn.f32 %f5498, %f3192, %f730, %f3191;
+
+$L__BB0_636:
+ selp.f32 %f3193, 0f3C0885E4, 0f3D2AAABB, %p553;
+ fma.rn.f32 %f3194, %f5498, %f730, %f3193;
+ selp.f32 %f3195, 0fBE2AAAA8, 0fBEFFFFFF, %p553;
+ fma.rn.f32 %f3196, %f3194, %f730, %f3195;
+ mov.f32 %f3197, 0f00000000;
+ fma.rn.f32 %f3198, %f730, %f729, %f3197;
+ fma.rn.f32 %f5281, %f3196, %f3198, %f729;
+ and.b32 %r4319, %r8130, 2;
+ setp.eq.s32 %p555, %r4319, 0;
+ @%p555 bra $L__BB0_638;
+
+ mov.f32 %f3200, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3200, %f3197;
+
+$L__BB0_638:
+ setp.lt.s32 %p12, %r11, %r803;
+ @%p545 bra $L__BB0_651;
+
+ mul.f32 %f3201, %f5404, 0f3F22F983;
+ cvt.rni.s32.f32 %r8134, %f3201;
+ cvt.rn.f32.s32 %f3202, %r8134;
+ mov.f32 %f3203, 0fBFC90FDA;
+ fma.rn.f32 %f3204, %f3202, %f3203, %f5404;
+ mov.f32 %f3205, 0fB3A22168;
+ fma.rn.f32 %f3206, %f3202, %f3205, %f3204;
+ mov.f32 %f3207, 0fA7C234C5;
+ fma.rn.f32 %f5501, %f3202, %f3207, %f3206;
+ abs.f32 %f738, %f5404;
+ setp.ltu.f32 %p557, %f738, 0f47CE4780;
+ @%p557 bra $L__BB0_647;
+
+ setp.eq.f32 %p558, %f738, 0f7F800000;
+ @%p558 bra $L__BB0_646;
+ bra.uni $L__BB0_641;
+
+$L__BB0_646:
+ mov.f32 %f3210, 0f00000000;
+ mul.rn.f32 %f5501, %f5404, %f3210;
+ mov.u32 %r8134, 0;
+ bra.uni $L__BB0_647;
+
+$L__BB0_641:
+ mov.b32 %r823, %f5404;
+ shr.u32 %r4321, %r823, 23;
+ and.b32 %r4322, %r4321, 255;
+ add.s32 %r824, %r4322, -128;
+ shl.b32 %r4323, %r823, 8;
+ or.b32 %r825, %r4323, -2147483648;
+ shr.u32 %r826, %r824, 5;
+ mov.u64 %rd2587, 0;
+ mov.u32 %r8131, 0;
+ mov.u64 %rd1254, __cudart_i2opi_f;
+ mov.u64 %rd2588, %rd2587;
+
+$L__BB0_642:
+ .pragma "nounroll";
+ shl.b64 %rd1253, %rd2587, 2;
+ add.s64 %rd1255, %rd1254, %rd1253;
+ ld.global.nc.u32 %r4324, [%rd1255];
+ mad.wide.u32 %rd1256, %r4324, %r825, %rd2588;
+ shr.u64 %rd2588, %rd1256, 32;
+ add.s64 %rd1257, %rd1, %rd1253;
+ st.local.u32 [%rd1257], %rd1256;
+ add.s32 %r8131, %r8131, 1;
+ cvt.s64.s32 %rd2587, %r8131;
+ setp.ne.s32 %p559, %r8131, 6;
+ @%p559 bra $L__BB0_642;
+
+ st.local.u32 [%rd4], %rd2588;
+ mov.u32 %r4325, 4;
+ sub.s32 %r829, %r4325, %r826;
+ mov.u32 %r4326, 6;
+ sub.s32 %r4327, %r4326, %r826;
+ mul.wide.s32 %rd1258, %r4327, 4;
+ add.s64 %rd1259, %rd1, %rd1258;
+ ld.local.u32 %r8132, [%rd1259];
+ ld.local.u32 %r8133, [%rd1259+-4];
+ and.b32 %r832, %r824, 31;
+ setp.eq.s32 %p560, %r832, 0;
+ @%p560 bra $L__BB0_645;
+
+ mov.u32 %r4328, 32;
+ sub.s32 %r4329, %r4328, %r832;
+ shr.u32 %r4330, %r8133, %r4329;
+ shl.b32 %r4331, %r8132, %r832;
+ add.s32 %r8132, %r4330, %r4331;
+ mul.wide.s32 %rd1260, %r829, 4;
+ add.s64 %rd1261, %rd1, %rd1260;
+ ld.local.u32 %r4332, [%rd1261];
+ shr.u32 %r4333, %r4332, %r4329;
+ shl.b32 %r4334, %r8133, %r832;
+ add.s32 %r8133, %r4333, %r4334;
+
+$L__BB0_645:
+ and.b32 %r4335, %r823, -2147483648;
+ shr.u32 %r4336, %r8133, 30;
+ shl.b32 %r4337, %r8132, 2;
+ or.b32 %r4338, %r4336, %r4337;
+ shr.u32 %r4339, %r4338, 31;
+ shr.u32 %r4340, %r8132, 30;
+ add.s32 %r4341, %r4339, %r4340;
+ neg.s32 %r4342, %r4341;
+ setp.eq.s32 %p561, %r4335, 0;
+ selp.b32 %r8134, %r4341, %r4342, %p561;
+ setp.ne.s32 %p562, %r4339, 0;
+ xor.b32 %r4343, %r4335, -2147483648;
+ selp.b32 %r4344, %r4343, %r4335, %p562;
+ selp.b32 %r4345, -1, 0, %p562;
+ xor.b32 %r4346, %r4338, %r4345;
+ shl.b32 %r4347, %r8133, 2;
+ xor.b32 %r4348, %r4347, %r4345;
+ cvt.u64.u32 %rd1262, %r4346;
+ cvt.u64.u32 %rd1263, %r4348;
+ bfi.b64 %rd1264, %rd1262, %rd1263, 32, 32;
+ cvt.rn.f64.s64 %fd83, %rd1264;
+ mul.f64 %fd84, %fd83, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3208, %fd84;
+ setp.eq.s32 %p563, %r4344, 0;
+ neg.f32 %f3209, %f3208;
+ selp.f32 %f5501, %f3208, %f3209, %p563;
+
+$L__BB0_647:
+ add.s32 %r839, %r8134, 1;
+ and.b32 %r840, %r839, 1;
+ setp.eq.s32 %p564, %r840, 0;
+ selp.f32 %f742, %f5501, 0f3F800000, %p564;
+ mul.rn.f32 %f743, %f5501, %f5501;
+ mov.f32 %f5502, 0fB94D4153;
+ @%p564 bra $L__BB0_649;
+
+ mov.f32 %f3212, 0fBAB607ED;
+ mov.f32 %f3213, 0f37CBAC00;
+ fma.rn.f32 %f5502, %f3213, %f743, %f3212;
+
+$L__BB0_649:
+ selp.f32 %f3214, 0f3C0885E4, 0f3D2AAABB, %p564;
+ fma.rn.f32 %f3215, %f5502, %f743, %f3214;
+ selp.f32 %f3216, 0fBE2AAAA8, 0fBEFFFFFF, %p564;
+ fma.rn.f32 %f3217, %f3215, %f743, %f3216;
+ mov.f32 %f3218, 0f00000000;
+ fma.rn.f32 %f3219, %f743, %f742, %f3218;
+ fma.rn.f32 %f5283, %f3217, %f3219, %f742;
+ and.b32 %r4350, %r839, 2;
+ setp.eq.s32 %p566, %r4350, 0;
+ @%p566 bra $L__BB0_651;
+
+ mov.f32 %f3221, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3221, %f3218;
+
+$L__BB0_651:
+ selp.f32 %f750, %f5283, %f5284, %p12;
+ selp.f32 %f751, %f5281, %f5282, %p12;
+ @%p545 bra $L__BB0_653;
+
+ add.f32 %f5595, %f751, %f750;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_653:
+ @%p426 bra $L__BB0_682;
+
+ shl.b32 %r4351, %r12, 5;
+ mov.u32 %r4352, -32;
+ sub.s32 %r841, %r4352, %r4351;
+ setp.ge.s32 %p570, %r11, %r841;
+ @%p570 bra $L__BB0_667;
+
+ mul.f32 %f3224, %f5411, 0f3F22F983;
+ cvt.rni.s32.f32 %r8138, %f3224;
+ cvt.rn.f32.s32 %f3225, %r8138;
+ mov.f32 %f3226, 0fBFC90FDA;
+ fma.rn.f32 %f3227, %f3225, %f3226, %f5411;
+ mov.f32 %f3228, 0fB3A22168;
+ fma.rn.f32 %f3229, %f3225, %f3228, %f3227;
+ mov.f32 %f3230, 0fA7C234C5;
+ fma.rn.f32 %f5510, %f3225, %f3230, %f3229;
+ abs.f32 %f759, %f5411;
+ setp.ltu.f32 %p571, %f759, 0f47CE4780;
+ @%p571 bra $L__BB0_663;
+
+ setp.eq.f32 %p572, %f759, 0f7F800000;
+ @%p572 bra $L__BB0_662;
+ bra.uni $L__BB0_657;
+
+$L__BB0_662:
+ mov.f32 %f3233, 0f00000000;
+ mul.rn.f32 %f5510, %f5411, %f3233;
+ mov.u32 %r8138, 0;
+ bra.uni $L__BB0_663;
+
+$L__BB0_657:
+ mov.b32 %r843, %f5411;
+ shr.u32 %r4354, %r843, 23;
+ and.b32 %r4355, %r4354, 255;
+ add.s32 %r844, %r4355, -128;
+ shl.b32 %r4356, %r843, 8;
+ or.b32 %r845, %r4356, -2147483648;
+ shr.u32 %r846, %r844, 5;
+ mov.u64 %rd2589, 0;
+ mov.u32 %r8135, 0;
+ mov.u64 %rd1268, __cudart_i2opi_f;
+ mov.u64 %rd2590, %rd2589;
+
+$L__BB0_658:
+ .pragma "nounroll";
+ shl.b64 %rd1267, %rd2589, 2;
+ add.s64 %rd1269, %rd1268, %rd1267;
+ ld.global.nc.u32 %r4357, [%rd1269];
+ mad.wide.u32 %rd1270, %r4357, %r845, %rd2590;
+ shr.u64 %rd2590, %rd1270, 32;
+ add.s64 %rd1271, %rd1, %rd1267;
+ st.local.u32 [%rd1271], %rd1270;
+ add.s32 %r8135, %r8135, 1;
+ cvt.s64.s32 %rd2589, %r8135;
+ setp.ne.s32 %p573, %r8135, 6;
+ @%p573 bra $L__BB0_658;
+
+ st.local.u32 [%rd4], %rd2590;
+ mov.u32 %r4358, 4;
+ sub.s32 %r849, %r4358, %r846;
+ mov.u32 %r4359, 6;
+ sub.s32 %r4360, %r4359, %r846;
+ mul.wide.s32 %rd1272, %r4360, 4;
+ add.s64 %rd1273, %rd1, %rd1272;
+ ld.local.u32 %r8136, [%rd1273];
+ ld.local.u32 %r8137, [%rd1273+-4];
+ and.b32 %r852, %r844, 31;
+ setp.eq.s32 %p574, %r852, 0;
+ @%p574 bra $L__BB0_661;
+
+ mov.u32 %r4361, 32;
+ sub.s32 %r4362, %r4361, %r852;
+ shr.u32 %r4363, %r8137, %r4362;
+ shl.b32 %r4364, %r8136, %r852;
+ add.s32 %r8136, %r4363, %r4364;
+ mul.wide.s32 %rd1274, %r849, 4;
+ add.s64 %rd1275, %rd1, %rd1274;
+ ld.local.u32 %r4365, [%rd1275];
+ shr.u32 %r4366, %r4365, %r4362;
+ shl.b32 %r4367, %r8137, %r852;
+ add.s32 %r8137, %r4366, %r4367;
+
+$L__BB0_661:
+ and.b32 %r4368, %r843, -2147483648;
+ shr.u32 %r4369, %r8137, 30;
+ shl.b32 %r4370, %r8136, 2;
+ or.b32 %r4371, %r4369, %r4370;
+ shr.u32 %r4372, %r4371, 31;
+ shr.u32 %r4373, %r8136, 30;
+ add.s32 %r4374, %r4372, %r4373;
+ neg.s32 %r4375, %r4374;
+ setp.eq.s32 %p575, %r4368, 0;
+ selp.b32 %r8138, %r4374, %r4375, %p575;
+ setp.ne.s32 %p576, %r4372, 0;
+ xor.b32 %r4376, %r4368, -2147483648;
+ selp.b32 %r4377, %r4376, %r4368, %p576;
+ selp.b32 %r4378, -1, 0, %p576;
+ xor.b32 %r4379, %r4371, %r4378;
+ shl.b32 %r4380, %r8137, 2;
+ xor.b32 %r4381, %r4380, %r4378;
+ cvt.u64.u32 %rd1276, %r4379;
+ cvt.u64.u32 %rd1277, %r4381;
+ bfi.b64 %rd1278, %rd1276, %rd1277, 32, 32;
+ cvt.rn.f64.s64 %fd85, %rd1278;
+ mul.f64 %fd86, %fd85, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3231, %fd86;
+ setp.eq.s32 %p577, %r4377, 0;
+ neg.f32 %f3232, %f3231;
+ selp.f32 %f5510, %f3231, %f3232, %p577;
+
+$L__BB0_663:
+ and.b32 %r859, %r8138, 1;
+ setp.eq.s32 %p578, %r859, 0;
+ selp.f32 %f763, %f5510, 0f3F800000, %p578;
+ mul.rn.f32 %f764, %f5510, %f5510;
+ mov.f32 %f5511, 0fB94D4153;
+ @%p578 bra $L__BB0_665;
+
+ mov.f32 %f3235, 0fBAB607ED;
+ mov.f32 %f3236, 0f37CBAC00;
+ fma.rn.f32 %f5511, %f3236, %f764, %f3235;
+
+$L__BB0_665:
+ selp.f32 %f3237, 0f3C0885E4, 0f3D2AAABB, %p578;
+ fma.rn.f32 %f3238, %f5511, %f764, %f3237;
+ selp.f32 %f3239, 0fBE2AAAA8, 0fBEFFFFFF, %p578;
+ fma.rn.f32 %f3240, %f3238, %f764, %f3239;
+ mov.f32 %f3241, 0f00000000;
+ fma.rn.f32 %f3242, %f764, %f763, %f3241;
+ fma.rn.f32 %f5281, %f3240, %f3242, %f763;
+ and.b32 %r4383, %r8138, 2;
+ setp.eq.s32 %p580, %r4383, 0;
+ @%p580 bra $L__BB0_667;
+
+ mov.f32 %f3244, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3244, %f3241;
+
+$L__BB0_667:
+ setp.lt.s32 %p13, %r11, %r841;
+ @%p570 bra $L__BB0_680;
+
+ mul.f32 %f3245, %f5403, 0f3F22F983;
+ cvt.rni.s32.f32 %r8142, %f3245;
+ cvt.rn.f32.s32 %f3246, %r8142;
+ mov.f32 %f3247, 0fBFC90FDA;
+ fma.rn.f32 %f3248, %f3246, %f3247, %f5403;
+ mov.f32 %f3249, 0fB3A22168;
+ fma.rn.f32 %f3250, %f3246, %f3249, %f3248;
+ mov.f32 %f3251, 0fA7C234C5;
+ fma.rn.f32 %f5514, %f3246, %f3251, %f3250;
+ abs.f32 %f772, %f5403;
+ setp.ltu.f32 %p582, %f772, 0f47CE4780;
+ @%p582 bra $L__BB0_676;
+
+ setp.eq.f32 %p583, %f772, 0f7F800000;
+ @%p583 bra $L__BB0_675;
+ bra.uni $L__BB0_670;
+
+$L__BB0_675:
+ mov.f32 %f3254, 0f00000000;
+ mul.rn.f32 %f5514, %f5403, %f3254;
+ mov.u32 %r8142, 0;
+ bra.uni $L__BB0_676;
+
+$L__BB0_670:
+ mov.b32 %r861, %f5403;
+ shr.u32 %r4385, %r861, 23;
+ and.b32 %r4386, %r4385, 255;
+ add.s32 %r862, %r4386, -128;
+ shl.b32 %r4387, %r861, 8;
+ or.b32 %r863, %r4387, -2147483648;
+ shr.u32 %r864, %r862, 5;
+ mov.u64 %rd2591, 0;
+ mov.u32 %r8139, 0;
+ mov.u64 %rd1282, __cudart_i2opi_f;
+ mov.u64 %rd2592, %rd2591;
+
+$L__BB0_671:
+ .pragma "nounroll";
+ shl.b64 %rd1281, %rd2591, 2;
+ add.s64 %rd1283, %rd1282, %rd1281;
+ ld.global.nc.u32 %r4388, [%rd1283];
+ mad.wide.u32 %rd1284, %r4388, %r863, %rd2592;
+ shr.u64 %rd2592, %rd1284, 32;
+ add.s64 %rd1285, %rd1, %rd1281;
+ st.local.u32 [%rd1285], %rd1284;
+ add.s32 %r8139, %r8139, 1;
+ cvt.s64.s32 %rd2591, %r8139;
+ setp.ne.s32 %p584, %r8139, 6;
+ @%p584 bra $L__BB0_671;
+
+ st.local.u32 [%rd4], %rd2592;
+ mov.u32 %r4389, 4;
+ sub.s32 %r867, %r4389, %r864;
+ mov.u32 %r4390, 6;
+ sub.s32 %r4391, %r4390, %r864;
+ mul.wide.s32 %rd1286, %r4391, 4;
+ add.s64 %rd1287, %rd1, %rd1286;
+ ld.local.u32 %r8140, [%rd1287];
+ ld.local.u32 %r8141, [%rd1287+-4];
+ and.b32 %r870, %r862, 31;
+ setp.eq.s32 %p585, %r870, 0;
+ @%p585 bra $L__BB0_674;
+
+ mov.u32 %r4392, 32;
+ sub.s32 %r4393, %r4392, %r870;
+ shr.u32 %r4394, %r8141, %r4393;
+ shl.b32 %r4395, %r8140, %r870;
+ add.s32 %r8140, %r4394, %r4395;
+ mul.wide.s32 %rd1288, %r867, 4;
+ add.s64 %rd1289, %rd1, %rd1288;
+ ld.local.u32 %r4396, [%rd1289];
+ shr.u32 %r4397, %r4396, %r4393;
+ shl.b32 %r4398, %r8141, %r870;
+ add.s32 %r8141, %r4397, %r4398;
+
+$L__BB0_674:
+ and.b32 %r4399, %r861, -2147483648;
+ shr.u32 %r4400, %r8141, 30;
+ shl.b32 %r4401, %r8140, 2;
+ or.b32 %r4402, %r4400, %r4401;
+ shr.u32 %r4403, %r4402, 31;
+ shr.u32 %r4404, %r8140, 30;
+ add.s32 %r4405, %r4403, %r4404;
+ neg.s32 %r4406, %r4405;
+ setp.eq.s32 %p586, %r4399, 0;
+ selp.b32 %r8142, %r4405, %r4406, %p586;
+ setp.ne.s32 %p587, %r4403, 0;
+ xor.b32 %r4407, %r4399, -2147483648;
+ selp.b32 %r4408, %r4407, %r4399, %p587;
+ selp.b32 %r4409, -1, 0, %p587;
+ xor.b32 %r4410, %r4402, %r4409;
+ shl.b32 %r4411, %r8141, 2;
+ xor.b32 %r4412, %r4411, %r4409;
+ cvt.u64.u32 %rd1290, %r4410;
+ cvt.u64.u32 %rd1291, %r4412;
+ bfi.b64 %rd1292, %rd1290, %rd1291, 32, 32;
+ cvt.rn.f64.s64 %fd87, %rd1292;
+ mul.f64 %fd88, %fd87, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3252, %fd88;
+ setp.eq.s32 %p588, %r4408, 0;
+ neg.f32 %f3253, %f3252;
+ selp.f32 %f5514, %f3252, %f3253, %p588;
+
+$L__BB0_676:
+ add.s32 %r877, %r8142, 1;
+ and.b32 %r878, %r877, 1;
+ setp.eq.s32 %p589, %r878, 0;
+ selp.f32 %f776, %f5514, 0f3F800000, %p589;
+ mul.rn.f32 %f777, %f5514, %f5514;
+ mov.f32 %f5515, 0fB94D4153;
+ @%p589 bra $L__BB0_678;
+
+ mov.f32 %f3256, 0fBAB607ED;
+ mov.f32 %f3257, 0f37CBAC00;
+ fma.rn.f32 %f5515, %f3257, %f777, %f3256;
+
+$L__BB0_678:
+ selp.f32 %f3258, 0f3C0885E4, 0f3D2AAABB, %p589;
+ fma.rn.f32 %f3259, %f5515, %f777, %f3258;
+ selp.f32 %f3260, 0fBE2AAAA8, 0fBEFFFFFF, %p589;
+ fma.rn.f32 %f3261, %f3259, %f777, %f3260;
+ mov.f32 %f3262, 0f00000000;
+ fma.rn.f32 %f3263, %f777, %f776, %f3262;
+ fma.rn.f32 %f5283, %f3261, %f3263, %f776;
+ and.b32 %r4414, %r877, 2;
+ setp.eq.s32 %p591, %r4414, 0;
+ @%p591 bra $L__BB0_680;
+
+ mov.f32 %f3265, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3265, %f3262;
+
+$L__BB0_680:
+ selp.f32 %f784, %f5283, %f5284, %p13;
+ selp.f32 %f785, %f5281, %f5282, %p13;
+ @%p570 bra $L__BB0_682;
+
+ add.f32 %f5594, %f785, %f784;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_682:
+ @%p429 bra $L__BB0_711;
+
+ shl.b32 %r4415, %r12, 5;
+ neg.s32 %r879, %r4415;
+ setp.ge.s32 %p595, %r11, %r879;
+ @%p595 bra $L__BB0_696;
+
+ mul.f32 %f3268, %f5410, 0f3F22F983;
+ cvt.rni.s32.f32 %r8146, %f3268;
+ cvt.rn.f32.s32 %f3269, %r8146;
+ mov.f32 %f3270, 0fBFC90FDA;
+ fma.rn.f32 %f3271, %f3269, %f3270, %f5410;
+ mov.f32 %f3272, 0fB3A22168;
+ fma.rn.f32 %f3273, %f3269, %f3272, %f3271;
+ mov.f32 %f3274, 0fA7C234C5;
+ fma.rn.f32 %f5523, %f3269, %f3274, %f3273;
+ abs.f32 %f793, %f5410;
+ setp.ltu.f32 %p596, %f793, 0f47CE4780;
+ @%p596 bra $L__BB0_692;
+
+ setp.eq.f32 %p597, %f793, 0f7F800000;
+ @%p597 bra $L__BB0_691;
+ bra.uni $L__BB0_686;
+
+$L__BB0_691:
+ mov.f32 %f3277, 0f00000000;
+ mul.rn.f32 %f5523, %f5410, %f3277;
+ mov.u32 %r8146, 0;
+ bra.uni $L__BB0_692;
+
+$L__BB0_686:
+ mov.b32 %r881, %f5410;
+ shr.u32 %r4417, %r881, 23;
+ and.b32 %r4418, %r4417, 255;
+ add.s32 %r882, %r4418, -128;
+ shl.b32 %r4419, %r881, 8;
+ or.b32 %r883, %r4419, -2147483648;
+ shr.u32 %r884, %r882, 5;
+ mov.u64 %rd2593, 0;
+ mov.u32 %r8143, 0;
+ mov.u64 %rd1296, __cudart_i2opi_f;
+ mov.u64 %rd2594, %rd2593;
+
+$L__BB0_687:
+ .pragma "nounroll";
+ shl.b64 %rd1295, %rd2593, 2;
+ add.s64 %rd1297, %rd1296, %rd1295;
+ ld.global.nc.u32 %r4420, [%rd1297];
+ mad.wide.u32 %rd1298, %r4420, %r883, %rd2594;
+ shr.u64 %rd2594, %rd1298, 32;
+ add.s64 %rd1299, %rd1, %rd1295;
+ st.local.u32 [%rd1299], %rd1298;
+ add.s32 %r8143, %r8143, 1;
+ cvt.s64.s32 %rd2593, %r8143;
+ setp.ne.s32 %p598, %r8143, 6;
+ @%p598 bra $L__BB0_687;
+
+ st.local.u32 [%rd4], %rd2594;
+ mov.u32 %r4421, 4;
+ sub.s32 %r887, %r4421, %r884;
+ mov.u32 %r4422, 6;
+ sub.s32 %r4423, %r4422, %r884;
+ mul.wide.s32 %rd1300, %r4423, 4;
+ add.s64 %rd1301, %rd1, %rd1300;
+ ld.local.u32 %r8144, [%rd1301];
+ ld.local.u32 %r8145, [%rd1301+-4];
+ and.b32 %r890, %r882, 31;
+ setp.eq.s32 %p599, %r890, 0;
+ @%p599 bra $L__BB0_690;
+
+ mov.u32 %r4424, 32;
+ sub.s32 %r4425, %r4424, %r890;
+ shr.u32 %r4426, %r8145, %r4425;
+ shl.b32 %r4427, %r8144, %r890;
+ add.s32 %r8144, %r4426, %r4427;
+ mul.wide.s32 %rd1302, %r887, 4;
+ add.s64 %rd1303, %rd1, %rd1302;
+ ld.local.u32 %r4428, [%rd1303];
+ shr.u32 %r4429, %r4428, %r4425;
+ shl.b32 %r4430, %r8145, %r890;
+ add.s32 %r8145, %r4429, %r4430;
+
+$L__BB0_690:
+ and.b32 %r4431, %r881, -2147483648;
+ shr.u32 %r4432, %r8145, 30;
+ shl.b32 %r4433, %r8144, 2;
+ or.b32 %r4434, %r4432, %r4433;
+ shr.u32 %r4435, %r4434, 31;
+ shr.u32 %r4436, %r8144, 30;
+ add.s32 %r4437, %r4435, %r4436;
+ neg.s32 %r4438, %r4437;
+ setp.eq.s32 %p600, %r4431, 0;
+ selp.b32 %r8146, %r4437, %r4438, %p600;
+ setp.ne.s32 %p601, %r4435, 0;
+ xor.b32 %r4439, %r4431, -2147483648;
+ selp.b32 %r4440, %r4439, %r4431, %p601;
+ selp.b32 %r4441, -1, 0, %p601;
+ xor.b32 %r4442, %r4434, %r4441;
+ shl.b32 %r4443, %r8145, 2;
+ xor.b32 %r4444, %r4443, %r4441;
+ cvt.u64.u32 %rd1304, %r4442;
+ cvt.u64.u32 %rd1305, %r4444;
+ bfi.b64 %rd1306, %rd1304, %rd1305, 32, 32;
+ cvt.rn.f64.s64 %fd89, %rd1306;
+ mul.f64 %fd90, %fd89, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3275, %fd90;
+ setp.eq.s32 %p602, %r4440, 0;
+ neg.f32 %f3276, %f3275;
+ selp.f32 %f5523, %f3275, %f3276, %p602;
+
+$L__BB0_692:
+ and.b32 %r897, %r8146, 1;
+ setp.eq.s32 %p603, %r897, 0;
+ selp.f32 %f797, %f5523, 0f3F800000, %p603;
+ mul.rn.f32 %f798, %f5523, %f5523;
+ mov.f32 %f5524, 0fB94D4153;
+ @%p603 bra $L__BB0_694;
+
+ mov.f32 %f3279, 0fBAB607ED;
+ mov.f32 %f3280, 0f37CBAC00;
+ fma.rn.f32 %f5524, %f3280, %f798, %f3279;
+
+$L__BB0_694:
+ selp.f32 %f3281, 0f3C0885E4, 0f3D2AAABB, %p603;
+ fma.rn.f32 %f3282, %f5524, %f798, %f3281;
+ selp.f32 %f3283, 0fBE2AAAA8, 0fBEFFFFFF, %p603;
+ fma.rn.f32 %f3284, %f3282, %f798, %f3283;
+ mov.f32 %f3285, 0f00000000;
+ fma.rn.f32 %f3286, %f798, %f797, %f3285;
+ fma.rn.f32 %f5281, %f3284, %f3286, %f797;
+ and.b32 %r4446, %r8146, 2;
+ setp.eq.s32 %p605, %r4446, 0;
+ @%p605 bra $L__BB0_696;
+
+ mov.f32 %f3288, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3288, %f3285;
+
+$L__BB0_696:
+ setp.lt.s32 %p14, %r11, %r879;
+ @%p595 bra $L__BB0_709;
+
+ mul.f32 %f3289, %f5402, 0f3F22F983;
+ cvt.rni.s32.f32 %r8150, %f3289;
+ cvt.rn.f32.s32 %f3290, %r8150;
+ mov.f32 %f3291, 0fBFC90FDA;
+ fma.rn.f32 %f3292, %f3290, %f3291, %f5402;
+ mov.f32 %f3293, 0fB3A22168;
+ fma.rn.f32 %f3294, %f3290, %f3293, %f3292;
+ mov.f32 %f3295, 0fA7C234C5;
+ fma.rn.f32 %f5527, %f3290, %f3295, %f3294;
+ abs.f32 %f806, %f5402;
+ setp.ltu.f32 %p607, %f806, 0f47CE4780;
+ @%p607 bra $L__BB0_705;
+
+ setp.eq.f32 %p608, %f806, 0f7F800000;
+ @%p608 bra $L__BB0_704;
+ bra.uni $L__BB0_699;
+
+$L__BB0_704:
+ mov.f32 %f3298, 0f00000000;
+ mul.rn.f32 %f5527, %f5402, %f3298;
+ mov.u32 %r8150, 0;
+ bra.uni $L__BB0_705;
+
+$L__BB0_699:
+ mov.b32 %r899, %f5402;
+ shr.u32 %r4448, %r899, 23;
+ and.b32 %r4449, %r4448, 255;
+ add.s32 %r900, %r4449, -128;
+ shl.b32 %r4450, %r899, 8;
+ or.b32 %r901, %r4450, -2147483648;
+ shr.u32 %r902, %r900, 5;
+ mov.u64 %rd2595, 0;
+ mov.u32 %r8147, 0;
+ mov.u64 %rd1310, __cudart_i2opi_f;
+ mov.u64 %rd2596, %rd2595;
+
+$L__BB0_700:
+ .pragma "nounroll";
+ shl.b64 %rd1309, %rd2595, 2;
+ add.s64 %rd1311, %rd1310, %rd1309;
+ ld.global.nc.u32 %r4451, [%rd1311];
+ mad.wide.u32 %rd1312, %r4451, %r901, %rd2596;
+ shr.u64 %rd2596, %rd1312, 32;
+ add.s64 %rd1313, %rd1, %rd1309;
+ st.local.u32 [%rd1313], %rd1312;
+ add.s32 %r8147, %r8147, 1;
+ cvt.s64.s32 %rd2595, %r8147;
+ setp.ne.s32 %p609, %r8147, 6;
+ @%p609 bra $L__BB0_700;
+
+ st.local.u32 [%rd4], %rd2596;
+ mov.u32 %r4452, 4;
+ sub.s32 %r905, %r4452, %r902;
+ mov.u32 %r4453, 6;
+ sub.s32 %r4454, %r4453, %r902;
+ mul.wide.s32 %rd1314, %r4454, 4;
+ add.s64 %rd1315, %rd1, %rd1314;
+ ld.local.u32 %r8148, [%rd1315];
+ ld.local.u32 %r8149, [%rd1315+-4];
+ and.b32 %r908, %r900, 31;
+ setp.eq.s32 %p610, %r908, 0;
+ @%p610 bra $L__BB0_703;
+
+ mov.u32 %r4455, 32;
+ sub.s32 %r4456, %r4455, %r908;
+ shr.u32 %r4457, %r8149, %r4456;
+ shl.b32 %r4458, %r8148, %r908;
+ add.s32 %r8148, %r4457, %r4458;
+ mul.wide.s32 %rd1316, %r905, 4;
+ add.s64 %rd1317, %rd1, %rd1316;
+ ld.local.u32 %r4459, [%rd1317];
+ shr.u32 %r4460, %r4459, %r4456;
+ shl.b32 %r4461, %r8149, %r908;
+ add.s32 %r8149, %r4460, %r4461;
+
+$L__BB0_703:
+ and.b32 %r4462, %r899, -2147483648;
+ shr.u32 %r4463, %r8149, 30;
+ shl.b32 %r4464, %r8148, 2;
+ or.b32 %r4465, %r4463, %r4464;
+ shr.u32 %r4466, %r4465, 31;
+ shr.u32 %r4467, %r8148, 30;
+ add.s32 %r4468, %r4466, %r4467;
+ neg.s32 %r4469, %r4468;
+ setp.eq.s32 %p611, %r4462, 0;
+ selp.b32 %r8150, %r4468, %r4469, %p611;
+ setp.ne.s32 %p612, %r4466, 0;
+ xor.b32 %r4470, %r4462, -2147483648;
+ selp.b32 %r4471, %r4470, %r4462, %p612;
+ selp.b32 %r4472, -1, 0, %p612;
+ xor.b32 %r4473, %r4465, %r4472;
+ shl.b32 %r4474, %r8149, 2;
+ xor.b32 %r4475, %r4474, %r4472;
+ cvt.u64.u32 %rd1318, %r4473;
+ cvt.u64.u32 %rd1319, %r4475;
+ bfi.b64 %rd1320, %rd1318, %rd1319, 32, 32;
+ cvt.rn.f64.s64 %fd91, %rd1320;
+ mul.f64 %fd92, %fd91, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3296, %fd92;
+ setp.eq.s32 %p613, %r4471, 0;
+ neg.f32 %f3297, %f3296;
+ selp.f32 %f5527, %f3296, %f3297, %p613;
+
+$L__BB0_705:
+ add.s32 %r915, %r8150, 1;
+ and.b32 %r916, %r915, 1;
+ setp.eq.s32 %p614, %r916, 0;
+ selp.f32 %f810, %f5527, 0f3F800000, %p614;
+ mul.rn.f32 %f811, %f5527, %f5527;
+ mov.f32 %f5528, 0fB94D4153;
+ @%p614 bra $L__BB0_707;
+
+ mov.f32 %f3300, 0fBAB607ED;
+ mov.f32 %f3301, 0f37CBAC00;
+ fma.rn.f32 %f5528, %f3301, %f811, %f3300;
+
+$L__BB0_707:
+ selp.f32 %f3302, 0f3C0885E4, 0f3D2AAABB, %p614;
+ fma.rn.f32 %f3303, %f5528, %f811, %f3302;
+ selp.f32 %f3304, 0fBE2AAAA8, 0fBEFFFFFF, %p614;
+ fma.rn.f32 %f3305, %f3303, %f811, %f3304;
+ mov.f32 %f3306, 0f00000000;
+ fma.rn.f32 %f3307, %f811, %f810, %f3306;
+ fma.rn.f32 %f5283, %f3305, %f3307, %f810;
+ and.b32 %r4477, %r915, 2;
+ setp.eq.s32 %p616, %r4477, 0;
+ @%p616 bra $L__BB0_709;
+
+ mov.f32 %f3309, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3309, %f3306;
+
+$L__BB0_709:
+ selp.f32 %f818, %f5283, %f5284, %p14;
+ selp.f32 %f819, %f5281, %f5282, %p14;
+ @%p595 bra $L__BB0_711;
+
+ add.f32 %f5593, %f819, %f818;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_711:
+ @%p429 bra $L__BB0_933;
+
+ shl.b32 %r4478, %r12, 5;
+ mov.u32 %r4479, -32;
+ sub.s32 %r917, %r4479, %r4478;
+ setp.ge.s32 %p620, %r11, %r917;
+ @%p620 bra $L__BB0_725;
+
+ mul.f32 %f3312, %f5409, 0f3F22F983;
+ cvt.rni.s32.f32 %r8154, %f3312;
+ cvt.rn.f32.s32 %f3313, %r8154;
+ mov.f32 %f3314, 0fBFC90FDA;
+ fma.rn.f32 %f3315, %f3313, %f3314, %f5409;
+ mov.f32 %f3316, 0fB3A22168;
+ fma.rn.f32 %f3317, %f3313, %f3316, %f3315;
+ mov.f32 %f3318, 0fA7C234C5;
+ fma.rn.f32 %f5536, %f3313, %f3318, %f3317;
+ abs.f32 %f827, %f5409;
+ setp.ltu.f32 %p621, %f827, 0f47CE4780;
+ @%p621 bra $L__BB0_721;
+
+ setp.eq.f32 %p622, %f827, 0f7F800000;
+ @%p622 bra $L__BB0_720;
+ bra.uni $L__BB0_715;
+
+$L__BB0_720:
+ mov.f32 %f3321, 0f00000000;
+ mul.rn.f32 %f5536, %f5409, %f3321;
+ mov.u32 %r8154, 0;
+ bra.uni $L__BB0_721;
+
+$L__BB0_715:
+ mov.b32 %r919, %f5409;
+ shr.u32 %r4481, %r919, 23;
+ and.b32 %r4482, %r4481, 255;
+ add.s32 %r920, %r4482, -128;
+ shl.b32 %r4483, %r919, 8;
+ or.b32 %r921, %r4483, -2147483648;
+ shr.u32 %r922, %r920, 5;
+ mov.u64 %rd2597, 0;
+ mov.u32 %r8151, 0;
+ mov.u64 %rd1324, __cudart_i2opi_f;
+ mov.u64 %rd2598, %rd2597;
+
+$L__BB0_716:
+ .pragma "nounroll";
+ shl.b64 %rd1323, %rd2597, 2;
+ add.s64 %rd1325, %rd1324, %rd1323;
+ ld.global.nc.u32 %r4484, [%rd1325];
+ mad.wide.u32 %rd1326, %r4484, %r921, %rd2598;
+ shr.u64 %rd2598, %rd1326, 32;
+ add.s64 %rd1327, %rd1, %rd1323;
+ st.local.u32 [%rd1327], %rd1326;
+ add.s32 %r8151, %r8151, 1;
+ cvt.s64.s32 %rd2597, %r8151;
+ setp.ne.s32 %p623, %r8151, 6;
+ @%p623 bra $L__BB0_716;
+
+ st.local.u32 [%rd4], %rd2598;
+ mov.u32 %r4485, 4;
+ sub.s32 %r925, %r4485, %r922;
+ mov.u32 %r4486, 6;
+ sub.s32 %r4487, %r4486, %r922;
+ mul.wide.s32 %rd1328, %r4487, 4;
+ add.s64 %rd1329, %rd1, %rd1328;
+ ld.local.u32 %r8152, [%rd1329];
+ ld.local.u32 %r8153, [%rd1329+-4];
+ and.b32 %r928, %r920, 31;
+ setp.eq.s32 %p624, %r928, 0;
+ @%p624 bra $L__BB0_719;
+
+ mov.u32 %r4488, 32;
+ sub.s32 %r4489, %r4488, %r928;
+ shr.u32 %r4490, %r8153, %r4489;
+ shl.b32 %r4491, %r8152, %r928;
+ add.s32 %r8152, %r4490, %r4491;
+ mul.wide.s32 %rd1330, %r925, 4;
+ add.s64 %rd1331, %rd1, %rd1330;
+ ld.local.u32 %r4492, [%rd1331];
+ shr.u32 %r4493, %r4492, %r4489;
+ shl.b32 %r4494, %r8153, %r928;
+ add.s32 %r8153, %r4493, %r4494;
+
+$L__BB0_719:
+ and.b32 %r4495, %r919, -2147483648;
+ shr.u32 %r4496, %r8153, 30;
+ shl.b32 %r4497, %r8152, 2;
+ or.b32 %r4498, %r4496, %r4497;
+ shr.u32 %r4499, %r4498, 31;
+ shr.u32 %r4500, %r8152, 30;
+ add.s32 %r4501, %r4499, %r4500;
+ neg.s32 %r4502, %r4501;
+ setp.eq.s32 %p625, %r4495, 0;
+ selp.b32 %r8154, %r4501, %r4502, %p625;
+ setp.ne.s32 %p626, %r4499, 0;
+ xor.b32 %r4503, %r4495, -2147483648;
+ selp.b32 %r4504, %r4503, %r4495, %p626;
+ selp.b32 %r4505, -1, 0, %p626;
+ xor.b32 %r4506, %r4498, %r4505;
+ shl.b32 %r4507, %r8153, 2;
+ xor.b32 %r4508, %r4507, %r4505;
+ cvt.u64.u32 %rd1332, %r4506;
+ cvt.u64.u32 %rd1333, %r4508;
+ bfi.b64 %rd1334, %rd1332, %rd1333, 32, 32;
+ cvt.rn.f64.s64 %fd93, %rd1334;
+ mul.f64 %fd94, %fd93, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3319, %fd94;
+ setp.eq.s32 %p627, %r4504, 0;
+ neg.f32 %f3320, %f3319;
+ selp.f32 %f5536, %f3319, %f3320, %p627;
+
+$L__BB0_721:
+ and.b32 %r935, %r8154, 1;
+ setp.eq.s32 %p628, %r935, 0;
+ selp.f32 %f831, %f5536, 0f3F800000, %p628;
+ mul.rn.f32 %f832, %f5536, %f5536;
+ mov.f32 %f5537, 0fB94D4153;
+ @%p628 bra $L__BB0_723;
+
+ mov.f32 %f3323, 0fBAB607ED;
+ mov.f32 %f3324, 0f37CBAC00;
+ fma.rn.f32 %f5537, %f3324, %f832, %f3323;
+
+$L__BB0_723:
+ selp.f32 %f3325, 0f3C0885E4, 0f3D2AAABB, %p628;
+ fma.rn.f32 %f3326, %f5537, %f832, %f3325;
+ selp.f32 %f3327, 0fBE2AAAA8, 0fBEFFFFFF, %p628;
+ fma.rn.f32 %f3328, %f3326, %f832, %f3327;
+ mov.f32 %f3329, 0f00000000;
+ fma.rn.f32 %f3330, %f832, %f831, %f3329;
+ fma.rn.f32 %f5281, %f3328, %f3330, %f831;
+ and.b32 %r4510, %r8154, 2;
+ setp.eq.s32 %p630, %r4510, 0;
+ @%p630 bra $L__BB0_725;
+
+ mov.f32 %f3332, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3332, %f3329;
+
+$L__BB0_725:
+ setp.lt.s32 %p15, %r11, %r917;
+ @%p620 bra $L__BB0_738;
+
+ mul.f32 %f3333, %f5401, 0f3F22F983;
+ cvt.rni.s32.f32 %r8158, %f3333;
+ cvt.rn.f32.s32 %f3334, %r8158;
+ mov.f32 %f3335, 0fBFC90FDA;
+ fma.rn.f32 %f3336, %f3334, %f3335, %f5401;
+ mov.f32 %f3337, 0fB3A22168;
+ fma.rn.f32 %f3338, %f3334, %f3337, %f3336;
+ mov.f32 %f3339, 0fA7C234C5;
+ fma.rn.f32 %f5540, %f3334, %f3339, %f3338;
+ abs.f32 %f840, %f5401;
+ setp.ltu.f32 %p632, %f840, 0f47CE4780;
+ @%p632 bra $L__BB0_734;
+
+ setp.eq.f32 %p633, %f840, 0f7F800000;
+ @%p633 bra $L__BB0_733;
+ bra.uni $L__BB0_728;
+
+$L__BB0_733:
+ mov.f32 %f3342, 0f00000000;
+ mul.rn.f32 %f5540, %f5401, %f3342;
+ mov.u32 %r8158, 0;
+ bra.uni $L__BB0_734;
+
+$L__BB0_728:
+ mov.b32 %r937, %f5401;
+ shr.u32 %r4512, %r937, 23;
+ and.b32 %r4513, %r4512, 255;
+ add.s32 %r938, %r4513, -128;
+ shl.b32 %r4514, %r937, 8;
+ or.b32 %r939, %r4514, -2147483648;
+ shr.u32 %r940, %r938, 5;
+ mov.u64 %rd2599, 0;
+ mov.u32 %r8155, 0;
+ mov.u64 %rd1338, __cudart_i2opi_f;
+ mov.u64 %rd2600, %rd2599;
+
+$L__BB0_729:
+ .pragma "nounroll";
+ shl.b64 %rd1337, %rd2599, 2;
+ add.s64 %rd1339, %rd1338, %rd1337;
+ ld.global.nc.u32 %r4515, [%rd1339];
+ mad.wide.u32 %rd1340, %r4515, %r939, %rd2600;
+ shr.u64 %rd2600, %rd1340, 32;
+ add.s64 %rd1341, %rd1, %rd1337;
+ st.local.u32 [%rd1341], %rd1340;
+ add.s32 %r8155, %r8155, 1;
+ cvt.s64.s32 %rd2599, %r8155;
+ setp.ne.s32 %p634, %r8155, 6;
+ @%p634 bra $L__BB0_729;
+
+ st.local.u32 [%rd4], %rd2600;
+ mov.u32 %r4516, 4;
+ sub.s32 %r943, %r4516, %r940;
+ mov.u32 %r4517, 6;
+ sub.s32 %r4518, %r4517, %r940;
+ mul.wide.s32 %rd1342, %r4518, 4;
+ add.s64 %rd1343, %rd1, %rd1342;
+ ld.local.u32 %r8156, [%rd1343];
+ ld.local.u32 %r8157, [%rd1343+-4];
+ and.b32 %r946, %r938, 31;
+ setp.eq.s32 %p635, %r946, 0;
+ @%p635 bra $L__BB0_732;
+
+ mov.u32 %r4519, 32;
+ sub.s32 %r4520, %r4519, %r946;
+ shr.u32 %r4521, %r8157, %r4520;
+ shl.b32 %r4522, %r8156, %r946;
+ add.s32 %r8156, %r4521, %r4522;
+ mul.wide.s32 %rd1344, %r943, 4;
+ add.s64 %rd1345, %rd1, %rd1344;
+ ld.local.u32 %r4523, [%rd1345];
+ shr.u32 %r4524, %r4523, %r4520;
+ shl.b32 %r4525, %r8157, %r946;
+ add.s32 %r8157, %r4524, %r4525;
+
+$L__BB0_732:
+ and.b32 %r4526, %r937, -2147483648;
+ shr.u32 %r4527, %r8157, 30;
+ shl.b32 %r4528, %r8156, 2;
+ or.b32 %r4529, %r4527, %r4528;
+ shr.u32 %r4530, %r4529, 31;
+ shr.u32 %r4531, %r8156, 30;
+ add.s32 %r4532, %r4530, %r4531;
+ neg.s32 %r4533, %r4532;
+ setp.eq.s32 %p636, %r4526, 0;
+ selp.b32 %r8158, %r4532, %r4533, %p636;
+ setp.ne.s32 %p637, %r4530, 0;
+ xor.b32 %r4534, %r4526, -2147483648;
+ selp.b32 %r4535, %r4534, %r4526, %p637;
+ selp.b32 %r4536, -1, 0, %p637;
+ xor.b32 %r4537, %r4529, %r4536;
+ shl.b32 %r4538, %r8157, 2;
+ xor.b32 %r4539, %r4538, %r4536;
+ cvt.u64.u32 %rd1346, %r4537;
+ cvt.u64.u32 %rd1347, %r4539;
+ bfi.b64 %rd1348, %rd1346, %rd1347, 32, 32;
+ cvt.rn.f64.s64 %fd95, %rd1348;
+ mul.f64 %fd96, %fd95, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3340, %fd96;
+ setp.eq.s32 %p638, %r4535, 0;
+ neg.f32 %f3341, %f3340;
+ selp.f32 %f5540, %f3340, %f3341, %p638;
+
+$L__BB0_734:
+ add.s32 %r953, %r8158, 1;
+ and.b32 %r954, %r953, 1;
+ setp.eq.s32 %p639, %r954, 0;
+ selp.f32 %f844, %f5540, 0f3F800000, %p639;
+ mul.rn.f32 %f845, %f5540, %f5540;
+ mov.f32 %f5541, 0fB94D4153;
+ @%p639 bra $L__BB0_736;
+
+ mov.f32 %f3344, 0fBAB607ED;
+ mov.f32 %f3345, 0f37CBAC00;
+ fma.rn.f32 %f5541, %f3345, %f845, %f3344;
+
+$L__BB0_736:
+ selp.f32 %f3346, 0f3C0885E4, 0f3D2AAABB, %p639;
+ fma.rn.f32 %f3347, %f5541, %f845, %f3346;
+ selp.f32 %f3348, 0fBE2AAAA8, 0fBEFFFFFF, %p639;
+ fma.rn.f32 %f3349, %f3347, %f845, %f3348;
+ mov.f32 %f3350, 0f00000000;
+ fma.rn.f32 %f3351, %f845, %f844, %f3350;
+ fma.rn.f32 %f5283, %f3349, %f3351, %f844;
+ and.b32 %r4541, %r953, 2;
+ setp.eq.s32 %p641, %r4541, 0;
+ @%p641 bra $L__BB0_738;
+
+ mov.f32 %f3353, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3353, %f3350;
+
+$L__BB0_738:
+ selp.f32 %f852, %f5283, %f5284, %p15;
+ selp.f32 %f853, %f5281, %f5282, %p15;
+ @%p620 bra $L__BB0_933;
+
+ add.f32 %f5592, %f853, %f852;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_933:
+ setp.lt.s32 %p804, %r12, 1;
+ and.pred %p806, %p33, %p804;
+ @%p806 bra $L__BB0_1206;
+ bra.uni $L__BB0_934;
+
+$L__BB0_1206:
+ mov.u32 %r7786, %ctaid.x;
+ shl.b32 %r5805, %r12, 5;
+ add.s32 %r5806, %r5805, %r1;
+ mul.hi.s32 %r5807, %r5806, -1840700269;
+ add.s32 %r5808, %r5807, %r5806;
+ shr.u32 %r5809, %r5808, 31;
+ shr.s32 %r5810, %r5808, 2;
+ add.s32 %r5811, %r5810, %r5809;
+ mul.lo.s32 %r5812, %r5811, %r2589;
+ add.s32 %r5813, %r2586, %r14;
+ mad.lo.s32 %r5814, %r2587, 3, %r5813;
+ add.s32 %r5815, %r5814, %r5812;
+ mul.lo.s32 %r5816, %r5811, 7;
+ sub.s32 %r5817, %r5806, %r5816;
+ mul.lo.s32 %r5818, %r5817, %r2590;
+ add.s32 %r5819, %r5815, %r5818;
+ mul.wide.s32 %rd1858, %r5819, 4;
+ add.s64 %rd1859, %rd3, %rd1858;
+ ld.global.f32 %f1406, [%rd1859];
+ add.s32 %r5820, %r5806, 32;
+ mul.hi.s32 %r5821, %r5820, -1840700269;
+ add.s32 %r5822, %r5821, %r5820;
+ shr.u32 %r5823, %r5822, 31;
+ shr.s32 %r5824, %r5822, 2;
+ add.s32 %r5825, %r5824, %r5823;
+ mul.lo.s32 %r5826, %r5825, %r2589;
+ add.s32 %r5827, %r5814, %r5826;
+ mul.lo.s32 %r5828, %r5825, 7;
+ sub.s32 %r5829, %r5820, %r5828;
+ mul.lo.s32 %r5830, %r5829, %r2590;
+ add.s32 %r5831, %r5827, %r5830;
+ mul.wide.s32 %rd1860, %r5831, 4;
+ add.s64 %rd1861, %rd3, %rd1860;
+ ld.global.f32 %f1407, [%rd1861];
+ add.s32 %r5832, %r5814, %r2587;
+ add.s32 %r5833, %r5832, %r5812;
+ add.s32 %r5834, %r5833, %r5818;
+ mul.wide.s32 %rd1862, %r5834, 4;
+ add.s64 %rd1863, %rd3, %rd1862;
+ ld.global.f32 %f1408, [%rd1863];
+ add.s32 %r5835, %r5832, %r5826;
+ add.s32 %r5836, %r5835, %r5830;
+ mul.wide.s32 %rd1864, %r5836, 4;
+ add.s64 %rd1865, %rd3, %rd1864;
+ ld.global.f32 %f1409, [%rd1865];
+ add.s32 %r5837, %r5813, %r2586;
+ add.s32 %r5838, %r5837, %r5812;
+ add.s32 %r5839, %r5838, %r5818;
+ mul.wide.s32 %rd1866, %r5839, 4;
+ add.s64 %rd1867, %rd3, %rd1866;
+ ld.global.f32 %f1410, [%rd1867];
+ add.s32 %r5840, %r5837, %r5826;
+ add.s32 %r5841, %r5840, %r5830;
+ mul.wide.s32 %rd1868, %r5841, 4;
+ add.s64 %rd1869, %rd3, %rd1868;
+ ld.global.f32 %f1411, [%rd1869];
+ add.s32 %r5842, %r5837, %r2587;
+ add.s32 %r5843, %r5842, %r5812;
+ add.s32 %r5844, %r5843, %r5818;
+ mul.wide.s32 %rd1870, %r5844, 4;
+ add.s64 %rd1871, %rd3, %rd1870;
+ ld.global.f32 %f1412, [%rd1871];
+ add.s32 %r5845, %r5842, %r5826;
+ add.s32 %r5846, %r5845, %r5830;
+ mul.wide.s32 %rd1872, %r5846, 4;
+ add.s64 %rd1873, %rd3, %rd1872;
+ ld.global.f32 %f1413, [%rd1873];
+ mul.hi.s32 %r5848, %r5806, 954437177;
+ shr.u32 %r5849, %r5848, 31;
+ shr.s32 %r5850, %r5848, 1;
+ add.s32 %r5851, %r5850, %r5849;
+ mul.lo.s32 %r5852, %r5851, %r2579;
+ shl.b32 %r5853, %r2576, 1;
+ mad.lo.s32 %r5854, %r2578, %r7786, %r5853;
+ shl.b32 %r5855, %r2577, 1;
+ add.s32 %r5856, %r5854, %r5855;
+ add.s32 %r5857, %r5856, %r5852;
+ mul.lo.s32 %r5858, %r5851, 9;
+ sub.s32 %r5859, %r5806, %r5858;
+ mul.lo.s32 %r5860, %r5859, %r2580;
+ add.s32 %r5861, %r5857, %r5860;
+ mul.wide.s32 %rd1874, %r5861, 4;
+ add.s64 %rd1875, %rd2, %rd1874;
+ ld.global.f32 %f1414, [%rd1875];
+ mul.hi.s32 %r5862, %r5820, 954437177;
+ shr.u32 %r5863, %r5862, 31;
+ shr.s32 %r5864, %r5862, 1;
+ add.s32 %r5865, %r5864, %r5863;
+ mul.lo.s32 %r5866, %r5865, %r2579;
+ add.s32 %r5867, %r5856, %r5866;
+ mul.lo.s32 %r5868, %r5865, 9;
+ sub.s32 %r5869, %r5820, %r5868;
+ mul.lo.s32 %r5870, %r5869, %r2580;
+ add.s32 %r5871, %r5867, %r5870;
+ mul.wide.s32 %rd1876, %r5871, 4;
+ add.s64 %rd1877, %rd2, %rd1876;
+ ld.global.f32 %f1415, [%rd1877];
+ add.s32 %r5872, %r5854, %r2576;
+ add.s32 %r5873, %r5872, %r5852;
+ add.s32 %r5874, %r5873, %r5860;
+ mul.wide.s32 %rd1878, %r5874, 4;
+ add.s64 %rd1879, %rd2, %rd1878;
+ ld.global.f32 %f1416, [%rd1879];
+ add.s32 %r5875, %r5872, %r5866;
+ add.s32 %r5876, %r5875, %r5870;
+ mul.wide.s32 %rd1880, %r5876, 4;
+ add.s64 %rd1881, %rd2, %rd1880;
+ ld.global.f32 %f1417, [%rd1881];
+ mul.wide.s32 %rd1882, %r2577, 4;
+ add.s64 %rd1883, %rd1879, %rd1882;
+ ld.global.f32 %f1418, [%rd1883];
+ add.s64 %rd1884, %rd1881, %rd1882;
+ ld.global.f32 %f1419, [%rd1884];
+ add.s64 %rd1885, %rd1883, %rd1882;
+ ld.global.f32 %f1420, [%rd1885];
+ add.s64 %rd1886, %rd1884, %rd1882;
+ ld.global.f32 %f1421, [%rd1886];
+ mul.f32 %f4043, %f1414, 0f3F22F983;
+ cvt.rni.s32.f32 %r8290, %f4043;
+ cvt.rn.f32.s32 %f4044, %r8290;
+ mov.f32 %f4045, 0fBFC90FDA;
+ fma.rn.f32 %f4046, %f4044, %f4045, %f1414;
+ mov.f32 %f4047, 0fB3A22168;
+ fma.rn.f32 %f4048, %f4044, %f4047, %f4046;
+ mov.f32 %f4049, 0fA7C234C5;
+ fma.rn.f32 %f5743, %f4044, %f4049, %f4048;
+ abs.f32 %f1423, %f1414;
+ setp.ltu.f32 %p1031, %f1423, 0f47CE4780;
+ @%p1031 bra $L__BB0_1214;
+
+ setp.eq.f32 %p1032, %f1423, 0f7F800000;
+ @%p1032 bra $L__BB0_1213;
+ bra.uni $L__BB0_1208;
+
+$L__BB0_1213:
+ mov.f32 %f4052, 0f00000000;
+ mul.rn.f32 %f5743, %f1414, %f4052;
+ mov.u32 %r8290, 0;
+ bra.uni $L__BB0_1214;
+
+$L__BB0_934:
+ add.s32 %r1251, %r12, 8;
+ setp.gt.s32 %p807, %r1251, 14;
+ add.s32 %r5108, %r2586, %r14;
+ mad.lo.s32 %r1252, %r2587, 3, %r5108;
+ @%p807 bra $L__BB0_939;
+
+ shl.b32 %r1253, %r12, 5;
+ neg.s32 %r5109, %r1253;
+ setp.ge.s32 %p808, %r11, %r5109;
+ @%p808 bra $L__BB0_937;
+
+ add.s32 %r5110, %r1253, %r1;
+ mul.hi.s32 %r5111, %r5110, -1840700269;
+ add.s32 %r5112, %r5111, %r5110;
+ shr.u32 %r5113, %r5112, 31;
+ shr.s32 %r5114, %r5112, 2;
+ add.s32 %r5115, %r5114, %r5113;
+ mad.lo.s32 %r5116, %r5115, %r2589, %r1252;
+ mul.lo.s32 %r5117, %r5115, 7;
+ sub.s32 %r5118, %r5110, %r5117;
+ mad.lo.s32 %r5119, %r5118, %r2590, %r5116;
+ mul.wide.s32 %rd1602, %r5119, 4;
+ add.s64 %rd1603, %rd3, %rd1602;
+ ld.global.f32 %f5607, [%rd1603];
+
+$L__BB0_937:
+ mov.u32 %r5120, -32;
+ sub.s32 %r5121, %r5120, %r1253;
+ setp.ge.s32 %p809, %r11, %r5121;
+ @%p809 bra $L__BB0_939;
+
+ add.s32 %r5122, %r1253, %r1;
+ add.s32 %r5123, %r5122, 32;
+ mul.hi.s32 %r5124, %r5123, -1840700269;
+ add.s32 %r5125, %r5124, %r5123;
+ shr.u32 %r5126, %r5125, 31;
+ shr.s32 %r5127, %r5125, 2;
+ add.s32 %r5128, %r5127, %r5126;
+ mad.lo.s32 %r5129, %r5128, %r2589, %r1252;
+ mul.lo.s32 %r5130, %r5128, 7;
+ sub.s32 %r5131, %r5123, %r5130;
+ mad.lo.s32 %r5132, %r5131, %r2590, %r5129;
+ mul.wide.s32 %rd1604, %r5132, 4;
+ add.s64 %rd1605, %rd3, %rd1604;
+ ld.global.f32 %f5606, [%rd1605];
+
+$L__BB0_939:
+ add.s32 %r1254, %r12, 9;
+ setp.gt.s32 %p810, %r1254, 14;
+ add.s32 %r1255, %r1252, %r2587;
+ @%p810 bra $L__BB0_944;
+
+ shl.b32 %r1256, %r12, 5;
+ neg.s32 %r5133, %r1256;
+ setp.ge.s32 %p811, %r11, %r5133;
+ @%p811 bra $L__BB0_942;
+
+ add.s32 %r5134, %r1256, %r1;
+ mul.hi.s32 %r5135, %r5134, -1840700269;
+ add.s32 %r5136, %r5135, %r5134;
+ shr.u32 %r5137, %r5136, 31;
+ shr.s32 %r5138, %r5136, 2;
+ add.s32 %r5139, %r5138, %r5137;
+ mad.lo.s32 %r5140, %r5139, %r2589, %r1255;
+ mul.lo.s32 %r5141, %r5139, 7;
+ sub.s32 %r5142, %r5134, %r5141;
+ mad.lo.s32 %r5143, %r5142, %r2590, %r5140;
+ mul.wide.s32 %rd1606, %r5143, 4;
+ add.s64 %rd1607, %rd3, %rd1606;
+ ld.global.f32 %f5406, [%rd1607];
+
+$L__BB0_942:
+ mov.u32 %r5144, -32;
+ sub.s32 %r5145, %r5144, %r1256;
+ setp.ge.s32 %p812, %r11, %r5145;
+ @%p812 bra $L__BB0_944;
+
+ add.s32 %r5146, %r1256, %r1;
+ add.s32 %r5147, %r5146, 32;
+ mul.hi.s32 %r5148, %r5147, -1840700269;
+ add.s32 %r5149, %r5148, %r5147;
+ shr.u32 %r5150, %r5149, 31;
+ shr.s32 %r5151, %r5149, 2;
+ add.s32 %r5152, %r5151, %r5150;
+ mad.lo.s32 %r5153, %r5152, %r2589, %r1255;
+ mul.lo.s32 %r5154, %r5152, 7;
+ sub.s32 %r5155, %r5147, %r5154;
+ mad.lo.s32 %r5156, %r5155, %r2590, %r5153;
+ mul.wide.s32 %rd1608, %r5156, 4;
+ add.s64 %rd1609, %rd3, %rd1608;
+ ld.global.f32 %f5405, [%rd1609];
+
+$L__BB0_944:
+ add.s32 %r1257, %r12, 10;
+ setp.gt.s32 %p813, %r1257, 14;
+ shl.b32 %r5157, %r2586, 1;
+ add.s32 %r1258, %r5157, %r14;
+ @%p813 bra $L__BB0_949;
+
+ shl.b32 %r1259, %r12, 5;
+ neg.s32 %r5158, %r1259;
+ setp.ge.s32 %p814, %r11, %r5158;
+ @%p814 bra $L__BB0_947;
+
+ add.s32 %r5159, %r1259, %r1;
+ mul.hi.s32 %r5160, %r5159, -1840700269;
+ add.s32 %r5161, %r5160, %r5159;
+ shr.u32 %r5162, %r5161, 31;
+ shr.s32 %r5163, %r5161, 2;
+ add.s32 %r5164, %r5163, %r5162;
+ mad.lo.s32 %r5165, %r5164, %r2589, %r1258;
+ mul.lo.s32 %r5166, %r5164, 7;
+ sub.s32 %r5167, %r5159, %r5166;
+ mad.lo.s32 %r5168, %r5167, %r2590, %r5165;
+ mul.wide.s32 %rd1610, %r5168, 4;
+ add.s64 %rd1611, %rd3, %rd1610;
+ ld.global.f32 %f5404, [%rd1611];
+
+$L__BB0_947:
+ mov.u32 %r5169, -32;
+ sub.s32 %r5170, %r5169, %r1259;
+ setp.ge.s32 %p815, %r11, %r5170;
+ @%p815 bra $L__BB0_949;
+
+ add.s32 %r5171, %r1259, %r1;
+ add.s32 %r5172, %r5171, 32;
+ mul.hi.s32 %r5173, %r5172, -1840700269;
+ add.s32 %r5174, %r5173, %r5172;
+ shr.u32 %r5175, %r5174, 31;
+ shr.s32 %r5176, %r5174, 2;
+ add.s32 %r5177, %r5176, %r5175;
+ mad.lo.s32 %r5178, %r5177, %r2589, %r1258;
+ mul.lo.s32 %r5179, %r5177, 7;
+ sub.s32 %r5180, %r5172, %r5179;
+ mad.lo.s32 %r5181, %r5180, %r2590, %r5178;
+ mul.wide.s32 %rd1612, %r5181, 4;
+ add.s64 %rd1613, %rd3, %rd1612;
+ ld.global.f32 %f5403, [%rd1613];
+
+$L__BB0_949:
+ add.s32 %r1260, %r12, 11;
+ setp.gt.s32 %p816, %r1260, 14;
+ add.s32 %r1261, %r1258, %r2587;
+ @%p816 bra $L__BB0_954;
+
+ shl.b32 %r1262, %r12, 5;
+ neg.s32 %r5182, %r1262;
+ setp.ge.s32 %p817, %r11, %r5182;
+ @%p817 bra $L__BB0_952;
+
+ add.s32 %r5183, %r1262, %r1;
+ mul.hi.s32 %r5184, %r5183, -1840700269;
+ add.s32 %r5185, %r5184, %r5183;
+ shr.u32 %r5186, %r5185, 31;
+ shr.s32 %r5187, %r5185, 2;
+ add.s32 %r5188, %r5187, %r5186;
+ mad.lo.s32 %r5189, %r5188, %r2589, %r1261;
+ mul.lo.s32 %r5190, %r5188, 7;
+ sub.s32 %r5191, %r5183, %r5190;
+ mad.lo.s32 %r5192, %r5191, %r2590, %r5189;
+ mul.wide.s32 %rd1614, %r5192, 4;
+ add.s64 %rd1615, %rd3, %rd1614;
+ ld.global.f32 %f5402, [%rd1615];
+
+$L__BB0_952:
+ mov.u32 %r5193, -32;
+ sub.s32 %r5194, %r5193, %r1262;
+ setp.ge.s32 %p818, %r11, %r5194;
+ @%p818 bra $L__BB0_954;
+
+ add.s32 %r5195, %r1262, %r1;
+ add.s32 %r5196, %r5195, 32;
+ mul.hi.s32 %r5197, %r5196, -1840700269;
+ add.s32 %r5198, %r5197, %r5196;
+ shr.u32 %r5199, %r5198, 31;
+ shr.s32 %r5200, %r5198, 2;
+ add.s32 %r5201, %r5200, %r5199;
+ mad.lo.s32 %r5202, %r5201, %r2589, %r1261;
+ mul.lo.s32 %r5203, %r5201, 7;
+ sub.s32 %r5204, %r5196, %r5203;
+ mad.lo.s32 %r5205, %r5204, %r2590, %r5202;
+ mul.wide.s32 %rd1616, %r5205, 4;
+ add.s64 %rd1617, %rd3, %rd1616;
+ ld.global.f32 %f5401, [%rd1617];
+
+$L__BB0_954:
+ mov.u32 %r7782, %ctaid.x;
+ shl.b32 %r5207, %r2576, 1;
+ mad.lo.s32 %r1263, %r2578, %r7782, %r5207;
+ shl.b32 %r5208, %r2577, 1;
+ add.s32 %r1264, %r1263, %r5208;
+ @%p807 bra $L__BB0_959;
+
+ shl.b32 %r1265, %r12, 5;
+ neg.s32 %r5209, %r1265;
+ setp.ge.s32 %p820, %r11, %r5209;
+ @%p820 bra $L__BB0_957;
+
+ add.s32 %r5210, %r1265, %r1;
+ mul.hi.s32 %r5211, %r5210, 954437177;
+ shr.u32 %r5212, %r5211, 31;
+ shr.s32 %r5213, %r5211, 1;
+ add.s32 %r5214, %r5213, %r5212;
+ mad.lo.s32 %r5215, %r5214, %r2579, %r1264;
+ mul.lo.s32 %r5216, %r5214, 9;
+ sub.s32 %r5217, %r5210, %r5216;
+ mad.lo.s32 %r5218, %r5217, %r2580, %r5215;
+ mul.wide.s32 %rd1618, %r5218, 4;
+ add.s64 %rd1619, %rd2, %rd1618;
+ ld.global.f32 %f5416, [%rd1619];
+
+$L__BB0_957:
+ mov.u32 %r5219, -32;
+ sub.s32 %r5220, %r5219, %r1265;
+ setp.ge.s32 %p821, %r11, %r5220;
+ @%p821 bra $L__BB0_959;
+
+ add.s32 %r5221, %r1265, %r1;
+ add.s32 %r5222, %r5221, 32;
+ mul.hi.s32 %r5223, %r5222, 954437177;
+ shr.u32 %r5224, %r5223, 31;
+ shr.s32 %r5225, %r5223, 1;
+ add.s32 %r5226, %r5225, %r5224;
+ mad.lo.s32 %r5227, %r5226, %r2579, %r1264;
+ mul.lo.s32 %r5228, %r5226, 9;
+ sub.s32 %r5229, %r5222, %r5228;
+ mad.lo.s32 %r5230, %r5229, %r2580, %r5227;
+ mul.wide.s32 %rd1620, %r5230, 4;
+ add.s64 %rd1621, %rd2, %rd1620;
+ ld.global.f32 %f5415, [%rd1621];
+
+$L__BB0_959:
+ add.s32 %r1266, %r1263, %r2576;
+ @%p810 bra $L__BB0_964;
+
+ shl.b32 %r1267, %r12, 5;
+ neg.s32 %r5231, %r1267;
+ setp.ge.s32 %p823, %r11, %r5231;
+ @%p823 bra $L__BB0_962;
+
+ add.s32 %r5232, %r1267, %r1;
+ mul.hi.s32 %r5233, %r5232, 954437177;
+ shr.u32 %r5234, %r5233, 31;
+ shr.s32 %r5235, %r5233, 1;
+ add.s32 %r5236, %r5235, %r5234;
+ mad.lo.s32 %r5237, %r5236, %r2579, %r1266;
+ mul.lo.s32 %r5238, %r5236, 9;
+ sub.s32 %r5239, %r5232, %r5238;
+ mad.lo.s32 %r5240, %r5239, %r2580, %r5237;
+ mul.wide.s32 %rd1622, %r5240, 4;
+ add.s64 %rd1623, %rd2, %rd1622;
+ ld.global.f32 %f5414, [%rd1623];
+
+$L__BB0_962:
+ mov.u32 %r5241, -32;
+ sub.s32 %r5242, %r5241, %r1267;
+ setp.ge.s32 %p824, %r11, %r5242;
+ @%p824 bra $L__BB0_964;
+
+ add.s32 %r5243, %r1267, %r1;
+ add.s32 %r5244, %r5243, 32;
+ mul.hi.s32 %r5245, %r5244, 954437177;
+ shr.u32 %r5246, %r5245, 31;
+ shr.s32 %r5247, %r5245, 1;
+ add.s32 %r5248, %r5247, %r5246;
+ mad.lo.s32 %r5249, %r5248, %r2579, %r1266;
+ mul.lo.s32 %r5250, %r5248, 9;
+ sub.s32 %r5251, %r5244, %r5250;
+ mad.lo.s32 %r5252, %r5251, %r2580, %r5249;
+ mul.wide.s32 %rd1624, %r5252, 4;
+ add.s64 %rd1625, %rd2, %rd1624;
+ ld.global.f32 %f5413, [%rd1625];
+
+$L__BB0_964:
+ add.s32 %r1268, %r1266, %r2577;
+ @%p813 bra $L__BB0_969;
+
+ shl.b32 %r1269, %r12, 5;
+ neg.s32 %r5253, %r1269;
+ setp.ge.s32 %p826, %r11, %r5253;
+ @%p826 bra $L__BB0_967;
+
+ add.s32 %r5254, %r1269, %r1;
+ mul.hi.s32 %r5255, %r5254, 954437177;
+ shr.u32 %r5256, %r5255, 31;
+ shr.s32 %r5257, %r5255, 1;
+ add.s32 %r5258, %r5257, %r5256;
+ mad.lo.s32 %r5259, %r5258, %r2579, %r1268;
+ mul.lo.s32 %r5260, %r5258, 9;
+ sub.s32 %r5261, %r5254, %r5260;
+ mad.lo.s32 %r5262, %r5261, %r2580, %r5259;
+ mul.wide.s32 %rd1626, %r5262, 4;
+ add.s64 %rd1627, %rd2, %rd1626;
+ ld.global.f32 %f5412, [%rd1627];
+
+$L__BB0_967:
+ mov.u32 %r5263, -32;
+ sub.s32 %r5264, %r5263, %r1269;
+ setp.ge.s32 %p827, %r11, %r5264;
+ @%p827 bra $L__BB0_969;
+
+ add.s32 %r5265, %r1269, %r1;
+ add.s32 %r5266, %r5265, 32;
+ mul.hi.s32 %r5267, %r5266, 954437177;
+ shr.u32 %r5268, %r5267, 31;
+ shr.s32 %r5269, %r5267, 1;
+ add.s32 %r5270, %r5269, %r5268;
+ mad.lo.s32 %r5271, %r5270, %r2579, %r1268;
+ mul.lo.s32 %r5272, %r5270, 9;
+ sub.s32 %r5273, %r5266, %r5272;
+ mad.lo.s32 %r5274, %r5273, %r2580, %r5271;
+ mul.wide.s32 %rd1628, %r5274, 4;
+ add.s64 %rd1629, %rd2, %rd1628;
+ ld.global.f32 %f5411, [%rd1629];
+
+$L__BB0_969:
+ add.s32 %r1270, %r1268, %r2577;
+ @%p816 bra $L__BB0_974;
+
+ shl.b32 %r1271, %r12, 5;
+ neg.s32 %r5275, %r1271;
+ setp.ge.s32 %p829, %r11, %r5275;
+ @%p829 bra $L__BB0_972;
+
+ add.s32 %r5276, %r1271, %r1;
+ mul.hi.s32 %r5277, %r5276, 954437177;
+ shr.u32 %r5278, %r5277, 31;
+ shr.s32 %r5279, %r5277, 1;
+ add.s32 %r5280, %r5279, %r5278;
+ mad.lo.s32 %r5281, %r5280, %r2579, %r1270;
+ mul.lo.s32 %r5282, %r5280, 9;
+ sub.s32 %r5283, %r5276, %r5282;
+ mad.lo.s32 %r5284, %r5283, %r2580, %r5281;
+ mul.wide.s32 %rd1630, %r5284, 4;
+ add.s64 %rd1631, %rd2, %rd1630;
+ ld.global.f32 %f5410, [%rd1631];
+
+$L__BB0_972:
+ mov.u32 %r5285, -32;
+ sub.s32 %r5286, %r5285, %r1271;
+ setp.ge.s32 %p830, %r11, %r5286;
+ @%p830 bra $L__BB0_974;
+
+ add.s32 %r5287, %r1271, %r1;
+ add.s32 %r5288, %r5287, 32;
+ mul.hi.s32 %r5289, %r5288, 954437177;
+ shr.u32 %r5290, %r5289, 31;
+ shr.s32 %r5291, %r5289, 1;
+ add.s32 %r5292, %r5291, %r5290;
+ mad.lo.s32 %r5293, %r5292, %r2579, %r1270;
+ mul.lo.s32 %r5294, %r5292, 9;
+ sub.s32 %r5295, %r5288, %r5294;
+ mad.lo.s32 %r5296, %r5295, %r2580, %r5293;
+ mul.wide.s32 %rd1632, %r5296, 4;
+ add.s64 %rd1633, %rd2, %rd1632;
+ ld.global.f32 %f5409, [%rd1633];
+
+$L__BB0_974:
+ @%p807 bra $L__BB0_1003;
+
+ shl.b32 %r5297, %r12, 5;
+ neg.s32 %r1272, %r5297;
+ setp.ge.s32 %p832, %r11, %r1272;
+ @%p832 bra $L__BB0_988;
+
+ mul.f32 %f3692, %f5416, 0f3F22F983;
+ cvt.rni.s32.f32 %r8226, %f3692;
+ cvt.rn.f32.s32 %f3693, %r8226;
+ mov.f32 %f3694, 0fBFC90FDA;
+ fma.rn.f32 %f3695, %f3693, %f3694, %f5416;
+ mov.f32 %f3696, 0fB3A22168;
+ fma.rn.f32 %f3697, %f3693, %f3696, %f3695;
+ mov.f32 %f3698, 0fA7C234C5;
+ fma.rn.f32 %f5644, %f3693, %f3698, %f3697;
+ abs.f32 %f1140, %f5416;
+ setp.ltu.f32 %p833, %f1140, 0f47CE4780;
+ @%p833 bra $L__BB0_984;
+
+ setp.eq.f32 %p834, %f1140, 0f7F800000;
+ @%p834 bra $L__BB0_983;
+ bra.uni $L__BB0_978;
+
+$L__BB0_983:
+ mov.f32 %f3701, 0f00000000;
+ mul.rn.f32 %f5644, %f5416, %f3701;
mov.u32 %r8226, 0;
- mov.u64 %rd648, __cudart_i2opi_f;
- mov.u64 %rd2499, %rd2498;
-
-$L__BB0_57:
+ bra.uni $L__BB0_984;
+
+$L__BB0_1208:
+ mov.b32 %r1577, %f1414;
+ shr.u32 %r5878, %r1577, 23;
+ and.b32 %r5879, %r5878, 255;
+ add.s32 %r1578, %r5879, -128;
+ shl.b32 %r5880, %r1577, 8;
+ or.b32 %r1579, %r5880, -2147483648;
+ shr.u32 %r1580, %r1578, 5;
+ mov.u64 %rd2665, 0;
+ mov.u32 %r8287, 0;
+ mov.u64 %rd1890, __cudart_i2opi_f;
+ mov.u64 %rd2666, %rd2665;
+
+$L__BB0_1209:
.pragma "nounroll";
- shl.b64 %rd647, %rd2498, 2;
- add.s64 %rd649, %rd648, %rd647;
- ld.global.nc.u32 %r2832, [%rd649];
- mad.wide.u32 %rd650, %r2832, %r47, %rd2499;
- shr.u64 %rd2499, %rd650, 32;
- add.s64 %rd651, %rd1, %rd647;
- st.local.u32 [%rd651], %rd650;
- add.s32 %r8226, %r8226, 1;
- cvt.s64.s32 %rd2498, %r8226;
- setp.ne.s32 %p70, %r8226, 6;
- @%p70 bra $L__BB0_57;
-
- mov.b32 %r8066, %f5348;
- shr.u32 %r2833, %r8066, 23;
- and.b32 %r2834, %r2833, 255;
- add.s32 %r2835, %r2834, -128;
- shr.u32 %r2836, %r2835, 5;
- st.local.u32 [%rd5], %rd2499;
- and.b32 %r52, %r2835, 31;
- mov.u32 %r2838, 6;
- sub.s32 %r2839, %r2838, %r2836;
- mul.wide.s32 %rd652, %r2839, 4;
- add.s64 %rd653, %rd1, %rd652;
- ld.local.u32 %r8227, [%rd653];
- ld.local.u32 %r8228, [%rd653+-4];
- setp.eq.s32 %p71, %r52, 0;
- @%p71 bra $L__BB0_60;
-
- mov.b32 %r8074, %f5348;
- shr.u32 %r8073, %r8074, 23;
- and.b32 %r8072, %r8073, 255;
- add.s32 %r8071, %r8072, -128;
- shr.u32 %r8070, %r8071, 5;
- mov.u32 %r8069, 4;
- sub.s32 %r8068, %r8069, %r8070;
- mov.u32 %r2840, 32;
- sub.s32 %r2841, %r2840, %r52;
- shr.u32 %r2842, %r8228, %r2841;
- shl.b32 %r2843, %r8227, %r52;
- add.s32 %r8227, %r2842, %r2843;
- mul.wide.s32 %rd654, %r8068, 4;
- add.s64 %rd655, %rd1, %rd654;
- ld.local.u32 %r2844, [%rd655];
- shr.u32 %r2845, %r2844, %r2841;
- shl.b32 %r2846, %r8228, %r52;
- add.s32 %r8228, %r2845, %r2846;
-
-$L__BB0_60:
- mov.b32 %r8067, %f5348;
- and.b32 %r2847, %r8067, -2147483648;
- shr.u32 %r2848, %r8228, 30;
- shl.b32 %r2849, %r8227, 2;
- or.b32 %r2850, %r2848, %r2849;
- shr.u32 %r2851, %r2850, 31;
- shr.u32 %r2852, %r8227, 30;
- add.s32 %r2853, %r2851, %r2852;
- neg.s32 %r2854, %r2853;
- setp.eq.s32 %p72, %r2847, 0;
- selp.b32 %r8229, %r2853, %r2854, %p72;
- setp.ne.s32 %p73, %r2851, 0;
- xor.b32 %r2855, %r2847, -2147483648;
- selp.b32 %r2856, %r2855, %r2847, %p73;
- selp.b32 %r2857, -1, 0, %p73;
- xor.b32 %r2858, %r2850, %r2857;
- shl.b32 %r2859, %r8228, 2;
- xor.b32 %r2860, %r2859, %r2857;
- cvt.u64.u32 %rd656, %r2858;
- cvt.u64.u32 %rd657, %r2860;
- bfi.b64 %rd658, %rd656, %rd657, 32, 32;
- cvt.rn.f64.s64 %fd1, %rd658;
- mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2266, %fd2;
- setp.eq.s32 %p74, %r2856, 0;
- neg.f32 %f2267, %f2266;
- selp.f32 %f5180, %f2266, %f2267, %p74;
-
-$L__BB0_62:
- and.b32 %r61, %r8229, 1;
- setp.eq.s32 %p75, %r61, 0;
- mul.rn.f32 %f39, %f5180, %f5180;
- mov.f32 %f5181, 0fB94D4153;
- @%p75 bra $L__BB0_64;
-
- mov.f32 %f2270, 0fBAB607ED;
- mov.f32 %f2271, 0f37CBAC00;
- fma.rn.f32 %f5181, %f2271, %f39, %f2270;
-
-$L__BB0_64:
- and.b32 %r8075, %r8229, 1;
- setp.eq.s32 %p1805, %r8075, 0;
- selp.f32 %f5161, %f5180, 0f3F800000, %p1805;
- selp.f32 %f2272, 0f3C0885E4, 0f3D2AAABB, %p1805;
- fma.rn.f32 %f2273, %f5181, %f39, %f2272;
- selp.f32 %f2274, 0fBE2AAAA8, 0fBEFFFFFF, %p1805;
- fma.rn.f32 %f2275, %f2273, %f39, %f2274;
- mov.f32 %f2276, 0f00000000;
- fma.rn.f32 %f2277, %f39, %f5161, %f2276;
- fma.rn.f32 %f5214, %f2275, %f2277, %f5161;
- and.b32 %r2862, %r8229, 2;
- setp.eq.s32 %p77, %r2862, 0;
- @%p77 bra $L__BB0_66;
-
- mov.f32 %f2279, 0fBF800000;
- fma.rn.f32 %f5214, %f5214, %f2279, %f2276;
-
-$L__BB0_66:
- shl.b32 %r8065, %r12, 5;
- neg.s32 %r8064, %r8065;
- setp.ge.s32 %p1804, %r14, %r8064;
- @%p1804 bra $L__BB0_79;
-
- mul.f32 %f2281, %f5531, 0f3F22F983;
- cvt.rni.s32.f32 %r8233, %f2281;
- cvt.rn.f32.s32 %f2282, %r8233;
- mov.f32 %f2283, 0fBFC90FDA;
- fma.rn.f32 %f2284, %f2282, %f2283, %f5531;
- mov.f32 %f2285, 0fB3A22168;
- fma.rn.f32 %f2286, %f2282, %f2285, %f2284;
- mov.f32 %f2287, 0fA7C234C5;
- fma.rn.f32 %f5184, %f2282, %f2287, %f2286;
- abs.f32 %f47, %f5531;
- setp.ltu.f32 %p79, %f47, 0f47CE4780;
- @%p79 bra $L__BB0_75;
-
- setp.eq.f32 %p80, %f47, 0f7F800000;
- @%p80 bra $L__BB0_74;
- bra.uni $L__BB0_69;
-
-$L__BB0_74:
- mov.f32 %f2290, 0f00000000;
- mul.rn.f32 %f5184, %f5531, %f2290;
- mov.u32 %r8233, 0;
- bra.uni $L__BB0_75;
-
-$L__BB0_69:
- mov.b32 %r63, %f5531;
- shr.u32 %r2866, %r63, 23;
- and.b32 %r2867, %r2866, 255;
- shl.b32 %r2868, %r63, 8;
- or.b32 %r65, %r2868, -2147483648;
- mov.u64 %rd2500, 0;
- mov.u32 %r8230, 0;
- mov.u64 %rd662, __cudart_i2opi_f;
- mov.u64 %rd2501, %rd2500;
-
-$L__BB0_70:
+ shl.b64 %rd1889, %rd2665, 2;
+ add.s64 %rd1891, %rd1890, %rd1889;
+ ld.global.nc.u32 %r5881, [%rd1891];
+ mad.wide.u32 %rd1892, %r5881, %r1579, %rd2666;
+ shr.u64 %rd2666, %rd1892, 32;
+ add.s64 %rd1893, %rd1, %rd1889;
+ st.local.u32 [%rd1893], %rd1892;
+ add.s32 %r8287, %r8287, 1;
+ cvt.s64.s32 %rd2665, %r8287;
+ setp.ne.s32 %p1033, %r8287, 6;
+ @%p1033 bra $L__BB0_1209;
+
+ st.local.u32 [%rd4], %rd2666;
+ mov.u32 %r5882, 4;
+ sub.s32 %r1583, %r5882, %r1580;
+ mov.u32 %r5883, 6;
+ sub.s32 %r5884, %r5883, %r1580;
+ mul.wide.s32 %rd1894, %r5884, 4;
+ add.s64 %rd1895, %rd1, %rd1894;
+ ld.local.u32 %r8288, [%rd1895];
+ ld.local.u32 %r8289, [%rd1895+-4];
+ and.b32 %r1586, %r1578, 31;
+ setp.eq.s32 %p1034, %r1586, 0;
+ @%p1034 bra $L__BB0_1212;
+
+ mov.u32 %r5885, 32;
+ sub.s32 %r5886, %r5885, %r1586;
+ shr.u32 %r5887, %r8289, %r5886;
+ shl.b32 %r5888, %r8288, %r1586;
+ add.s32 %r8288, %r5887, %r5888;
+ mul.wide.s32 %rd1896, %r1583, 4;
+ add.s64 %rd1897, %rd1, %rd1896;
+ ld.local.u32 %r5889, [%rd1897];
+ shr.u32 %r5890, %r5889, %r5886;
+ shl.b32 %r5891, %r8289, %r1586;
+ add.s32 %r8289, %r5890, %r5891;
+
+$L__BB0_1212:
+ and.b32 %r5892, %r1577, -2147483648;
+ shr.u32 %r5893, %r8289, 30;
+ shl.b32 %r5894, %r8288, 2;
+ or.b32 %r5895, %r5893, %r5894;
+ shr.u32 %r5896, %r5895, 31;
+ shr.u32 %r5897, %r8288, 30;
+ add.s32 %r5898, %r5896, %r5897;
+ neg.s32 %r5899, %r5898;
+ setp.eq.s32 %p1035, %r5892, 0;
+ selp.b32 %r8290, %r5898, %r5899, %p1035;
+ setp.ne.s32 %p1036, %r5896, 0;
+ xor.b32 %r5900, %r5892, -2147483648;
+ selp.b32 %r5901, %r5900, %r5892, %p1036;
+ selp.b32 %r5902, -1, 0, %p1036;
+ xor.b32 %r5903, %r5895, %r5902;
+ shl.b32 %r5904, %r8289, 2;
+ xor.b32 %r5905, %r5904, %r5902;
+ cvt.u64.u32 %rd1898, %r5903;
+ cvt.u64.u32 %rd1899, %r5905;
+ bfi.b64 %rd1900, %rd1898, %rd1899, 32, 32;
+ cvt.rn.f64.s64 %fd161, %rd1900;
+ mul.f64 %fd162, %fd161, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4050, %fd162;
+ setp.eq.s32 %p1037, %r5901, 0;
+ neg.f32 %f4051, %f4050;
+ selp.f32 %f5743, %f4050, %f4051, %p1037;
+
+$L__BB0_1214:
+ and.b32 %r1593, %r8290, 1;
+ setp.eq.s32 %p1038, %r1593, 0;
+ selp.f32 %f1427, %f5743, 0f3F800000, %p1038;
+ mul.rn.f32 %f1428, %f5743, %f5743;
+ mov.f32 %f5744, 0fB94D4153;
+ @%p1038 bra $L__BB0_1216;
+
+ mov.f32 %f4054, 0fBAB607ED;
+ mov.f32 %f4055, 0f37CBAC00;
+ fma.rn.f32 %f5744, %f4055, %f1428, %f4054;
+
+$L__BB0_1216:
+ selp.f32 %f4056, 0f3C0885E4, 0f3D2AAABB, %p1038;
+ fma.rn.f32 %f4057, %f5744, %f1428, %f4056;
+ selp.f32 %f4058, 0fBE2AAAA8, 0fBEFFFFFF, %p1038;
+ fma.rn.f32 %f4059, %f4057, %f1428, %f4058;
+ mov.f32 %f4060, 0f00000000;
+ fma.rn.f32 %f4061, %f1428, %f1427, %f4060;
+ fma.rn.f32 %f5745, %f4059, %f4061, %f1427;
+ and.b32 %r5907, %r8290, 2;
+ setp.eq.s32 %p1040, %r5907, 0;
+ @%p1040 bra $L__BB0_1218;
+
+ mov.f32 %f4063, 0fBF800000;
+ fma.rn.f32 %f5745, %f5745, %f4063, %f4060;
+
+$L__BB0_1218:
+ mul.f32 %f4064, %f1406, 0f3F22F983;
+ cvt.rni.s32.f32 %r8294, %f4064;
+ cvt.rn.f32.s32 %f4065, %r8294;
+ mov.f32 %f4066, 0fBFC90FDA;
+ fma.rn.f32 %f4067, %f4065, %f4066, %f1406;
+ mov.f32 %f4068, 0fB3A22168;
+ fma.rn.f32 %f4069, %f4065, %f4068, %f4067;
+ mov.f32 %f4070, 0fA7C234C5;
+ fma.rn.f32 %f5746, %f4065, %f4070, %f4069;
+ abs.f32 %f1435, %f1406;
+ setp.ltu.f32 %p1041, %f1435, 0f47CE4780;
+ @%p1041 bra $L__BB0_1226;
+
+ setp.eq.f32 %p1042, %f1435, 0f7F800000;
+ @%p1042 bra $L__BB0_1225;
+ bra.uni $L__BB0_1220;
+
+$L__BB0_1225:
+ mov.f32 %f4073, 0f00000000;
+ mul.rn.f32 %f5746, %f1406, %f4073;
+ mov.u32 %r8294, 0;
+ bra.uni $L__BB0_1226;
+
+$L__BB0_1220:
+ mov.b32 %r1595, %f1406;
+ shr.u32 %r5909, %r1595, 23;
+ and.b32 %r5910, %r5909, 255;
+ add.s32 %r1596, %r5910, -128;
+ shl.b32 %r5911, %r1595, 8;
+ or.b32 %r1597, %r5911, -2147483648;
+ shr.u32 %r1598, %r1596, 5;
+ mov.u64 %rd2667, 0;
+ mov.u32 %r8291, 0;
+ mov.u64 %rd1904, __cudart_i2opi_f;
+ mov.u64 %rd2668, %rd2667;
+
+$L__BB0_1221:
.pragma "nounroll";
- shl.b64 %rd661, %rd2500, 2;
- add.s64 %rd663, %rd662, %rd661;
- ld.global.nc.u32 %r2869, [%rd663];
- mad.wide.u32 %rd664, %r2869, %r65, %rd2501;
- shr.u64 %rd2501, %rd664, 32;
- add.s64 %rd665, %rd1, %rd661;
- st.local.u32 [%rd665], %rd664;
- add.s32 %r8230, %r8230, 1;
- cvt.s64.s32 %rd2500, %r8230;
- setp.ne.s32 %p81, %r8230, 6;
- @%p81 bra $L__BB0_70;
-
- add.s32 %r8045, %r2867, -128;
- mov.b32 %r8044, %f5531;
- shr.u32 %r8043, %r8044, 23;
- and.b32 %r8042, %r8043, 255;
- add.s32 %r8041, %r8042, -128;
- shr.u32 %r8040, %r8041, 5;
- st.local.u32 [%rd5], %rd2501;
- mov.u32 %r2871, 6;
- sub.s32 %r2872, %r2871, %r8040;
- mul.wide.s32 %rd666, %r2872, 4;
- add.s64 %rd667, %rd1, %rd666;
- ld.local.u32 %r8231, [%rd667];
- ld.local.u32 %r8232, [%rd667+-4];
- and.b32 %r72, %r8041, 31;
- setp.eq.s32 %p82, %r72, 0;
- @%p82 bra $L__BB0_73;
-
- mov.b32 %r8082, %f5531;
- shr.u32 %r8081, %r8082, 23;
- and.b32 %r8080, %r8081, 255;
- add.s32 %r8079, %r8080, -128;
- shr.u32 %r8078, %r8079, 5;
- mov.u32 %r8077, 4;
- sub.s32 %r8076, %r8077, %r8078;
- mov.u32 %r2873, 32;
- sub.s32 %r2874, %r2873, %r72;
- shr.u32 %r2875, %r8232, %r2874;
- shl.b32 %r2876, %r8231, %r72;
- add.s32 %r8231, %r2875, %r2876;
- mul.wide.s32 %rd668, %r8076, 4;
- add.s64 %rd669, %rd1, %rd668;
- ld.local.u32 %r2877, [%rd669];
- shr.u32 %r2878, %r2877, %r2874;
- shl.b32 %r2879, %r8232, %r72;
- add.s32 %r8232, %r2878, %r2879;
-
-$L__BB0_73:
- mov.b32 %r8046, %f5531;
- and.b32 %r2880, %r8046, -2147483648;
- shr.u32 %r2881, %r8232, 30;
- shl.b32 %r2882, %r8231, 2;
- or.b32 %r2883, %r2881, %r2882;
- shr.u32 %r2884, %r2883, 31;
- shr.u32 %r2885, %r8231, 30;
- add.s32 %r2886, %r2884, %r2885;
- neg.s32 %r2887, %r2886;
- setp.eq.s32 %p83, %r2880, 0;
- selp.b32 %r8233, %r2886, %r2887, %p83;
- setp.ne.s32 %p84, %r2884, 0;
- xor.b32 %r2888, %r2880, -2147483648;
- selp.b32 %r2889, %r2888, %r2880, %p84;
- selp.b32 %r2890, -1, 0, %p84;
- xor.b32 %r2891, %r2883, %r2890;
- shl.b32 %r2892, %r8232, 2;
- xor.b32 %r2893, %r2892, %r2890;
- cvt.u64.u32 %rd670, %r2891;
- cvt.u64.u32 %rd671, %r2893;
- bfi.b64 %rd672, %rd670, %rd671, 32, 32;
- cvt.rn.f64.s64 %fd3, %rd672;
- mul.f64 %fd4, %fd3, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2288, %fd4;
- setp.eq.s32 %p85, %r2889, 0;
- neg.f32 %f2289, %f2288;
- selp.f32 %f5184, %f2288, %f2289, %p85;
-
-$L__BB0_75:
- add.s32 %r79, %r8233, 1;
- and.b32 %r80, %r79, 1;
- setp.eq.s32 %p86, %r80, 0;
- selp.f32 %f51, %f5184, 0f3F800000, %p86;
- mul.rn.f32 %f52, %f5184, %f5184;
- mov.f32 %f5185, 0fB94D4153;
- @%p86 bra $L__BB0_77;
-
- mov.f32 %f2292, 0fBAB607ED;
- mov.f32 %f2293, 0f37CBAC00;
- fma.rn.f32 %f5185, %f2293, %f52, %f2292;
-
-$L__BB0_77:
- add.s32 %r8085, %r8233, 1;
- add.s32 %r8084, %r8233, 1;
- and.b32 %r8083, %r8084, 1;
- setp.eq.s32 %p1806, %r8083, 0;
- selp.f32 %f2294, 0f3C0885E4, 0f3D2AAABB, %p1806;
- fma.rn.f32 %f2295, %f5185, %f52, %f2294;
- selp.f32 %f2296, 0fBE2AAAA8, 0fBEFFFFFF, %p1806;
- fma.rn.f32 %f2297, %f2295, %f52, %f2296;
- mov.f32 %f2298, 0f00000000;
- fma.rn.f32 %f2299, %f52, %f51, %f2298;
- fma.rn.f32 %f5216, %f2297, %f2299, %f51;
- and.b32 %r2895, %r8084, 2;
- setp.eq.s32 %p88, %r2895, 0;
- @%p88 bra $L__BB0_79;
-
- mov.f32 %f2301, 0fBF800000;
- fma.rn.f32 %f5216, %f5216, %f2301, %f2298;
-
-$L__BB0_79:
- add.f32 %f5332, %f5214, %f5216;
-
-$L__BB0_80:
- neg.s32 %r8048, %r12;
- add.s32 %r8047, %r13, -15;
- setp.ge.s32 %p1800, %r8047, %r8048;
- mov.f32 %f5213, %f5214;
- mov.f32 %f5215, %f5216;
- @%p1800 bra $L__BB0_109;
-
- shl.b32 %r2898, %r12, 5;
- mov.u32 %r2899, -32;
- sub.s32 %r81, %r2899, %r2898;
- setp.ge.s32 %p90, %r14, %r81;
- mov.f32 %f5213, %f5214;
- @%p90 bra $L__BB0_94;
-
- mul.f32 %f2303, %f5347, 0f3F22F983;
- cvt.rni.s32.f32 %r8237, %f2303;
- cvt.rn.f32.s32 %f2304, %r8237;
- mov.f32 %f2305, 0fBFC90FDA;
- fma.rn.f32 %f2306, %f2304, %f2305, %f5347;
- mov.f32 %f2307, 0fB3A22168;
- fma.rn.f32 %f2308, %f2304, %f2307, %f2306;
- mov.f32 %f2309, 0fA7C234C5;
- fma.rn.f32 %f5191, %f2304, %f2309, %f2308;
- abs.f32 %f64, %f5347;
- setp.ltu.f32 %p91, %f64, 0f47CE4780;
- @%p91 bra $L__BB0_90;
-
- setp.eq.f32 %p92, %f64, 0f7F800000;
- @%p92 bra $L__BB0_89;
- bra.uni $L__BB0_84;
-
-$L__BB0_89:
- mov.f32 %f2312, 0f00000000;
- mul.rn.f32 %f5191, %f5347, %f2312;
- mov.u32 %r8237, 0;
- bra.uni $L__BB0_90;
-
-$L__BB0_84:
- mov.b32 %r83, %f5347;
- shr.u32 %r2901, %r83, 23;
- and.b32 %r2902, %r2901, 255;
- shl.b32 %r2903, %r83, 8;
- or.b32 %r85, %r2903, -2147483648;
- mov.u64 %rd2502, 0;
- mov.u32 %r8234, 0;
- mov.u64 %rd676, __cudart_i2opi_f;
- mov.u64 %rd2503, %rd2502;
-
-$L__BB0_85:
+ shl.b64 %rd1903, %rd2667, 2;
+ add.s64 %rd1905, %rd1904, %rd1903;
+ ld.global.nc.u32 %r5912, [%rd1905];
+ mad.wide.u32 %rd1906, %r5912, %r1597, %rd2668;
+ shr.u64 %rd2668, %rd1906, 32;
+ add.s64 %rd1907, %rd1, %rd1903;
+ st.local.u32 [%rd1907], %rd1906;
+ add.s32 %r8291, %r8291, 1;
+ cvt.s64.s32 %rd2667, %r8291;
+ setp.ne.s32 %p1043, %r8291, 6;
+ @%p1043 bra $L__BB0_1221;
+
+ st.local.u32 [%rd4], %rd2668;
+ mov.u32 %r5913, 4;
+ sub.s32 %r1601, %r5913, %r1598;
+ mov.u32 %r5914, 6;
+ sub.s32 %r5915, %r5914, %r1598;
+ mul.wide.s32 %rd1908, %r5915, 4;
+ add.s64 %rd1909, %rd1, %rd1908;
+ ld.local.u32 %r8292, [%rd1909];
+ ld.local.u32 %r8293, [%rd1909+-4];
+ and.b32 %r1604, %r1596, 31;
+ setp.eq.s32 %p1044, %r1604, 0;
+ @%p1044 bra $L__BB0_1224;
+
+ mov.u32 %r5916, 32;
+ sub.s32 %r5917, %r5916, %r1604;
+ shr.u32 %r5918, %r8293, %r5917;
+ shl.b32 %r5919, %r8292, %r1604;
+ add.s32 %r8292, %r5918, %r5919;
+ mul.wide.s32 %rd1910, %r1601, 4;
+ add.s64 %rd1911, %rd1, %rd1910;
+ ld.local.u32 %r5920, [%rd1911];
+ shr.u32 %r5921, %r5920, %r5917;
+ shl.b32 %r5922, %r8293, %r1604;
+ add.s32 %r8293, %r5921, %r5922;
+
+$L__BB0_1224:
+ and.b32 %r5923, %r1595, -2147483648;
+ shr.u32 %r5924, %r8293, 30;
+ shl.b32 %r5925, %r8292, 2;
+ or.b32 %r5926, %r5924, %r5925;
+ shr.u32 %r5927, %r5926, 31;
+ shr.u32 %r5928, %r8292, 30;
+ add.s32 %r5929, %r5927, %r5928;
+ neg.s32 %r5930, %r5929;
+ setp.eq.s32 %p1045, %r5923, 0;
+ selp.b32 %r8294, %r5929, %r5930, %p1045;
+ setp.ne.s32 %p1046, %r5927, 0;
+ xor.b32 %r5931, %r5923, -2147483648;
+ selp.b32 %r5932, %r5931, %r5923, %p1046;
+ selp.b32 %r5933, -1, 0, %p1046;
+ xor.b32 %r5934, %r5926, %r5933;
+ shl.b32 %r5935, %r8293, 2;
+ xor.b32 %r5936, %r5935, %r5933;
+ cvt.u64.u32 %rd1912, %r5934;
+ cvt.u64.u32 %rd1913, %r5936;
+ bfi.b64 %rd1914, %rd1912, %rd1913, 32, 32;
+ cvt.rn.f64.s64 %fd163, %rd1914;
+ mul.f64 %fd164, %fd163, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4071, %fd164;
+ setp.eq.s32 %p1047, %r5932, 0;
+ neg.f32 %f4072, %f4071;
+ selp.f32 %f5746, %f4071, %f4072, %p1047;
+
+$L__BB0_1226:
+ add.s32 %r1611, %r8294, 1;
+ and.b32 %r1612, %r1611, 1;
+ setp.eq.s32 %p1048, %r1612, 0;
+ selp.f32 %f1439, %f5746, 0f3F800000, %p1048;
+ mul.rn.f32 %f1440, %f5746, %f5746;
+ mov.f32 %f5747, 0fB94D4153;
+ @%p1048 bra $L__BB0_1228;
+
+ mov.f32 %f4075, 0fBAB607ED;
+ mov.f32 %f4076, 0f37CBAC00;
+ fma.rn.f32 %f5747, %f4076, %f1440, %f4075;
+
+$L__BB0_1228:
+ selp.f32 %f4077, 0f3C0885E4, 0f3D2AAABB, %p1048;
+ fma.rn.f32 %f4078, %f5747, %f1440, %f4077;
+ selp.f32 %f4079, 0fBE2AAAA8, 0fBEFFFFFF, %p1048;
+ fma.rn.f32 %f4080, %f4078, %f1440, %f4079;
+ mov.f32 %f4081, 0f00000000;
+ fma.rn.f32 %f4082, %f1440, %f1439, %f4081;
+ fma.rn.f32 %f5748, %f4080, %f4082, %f1439;
+ and.b32 %r5938, %r1611, 2;
+ setp.eq.s32 %p1050, %r5938, 0;
+ @%p1050 bra $L__BB0_1230;
+
+ mov.f32 %f4084, 0fBF800000;
+ fma.rn.f32 %f5748, %f5748, %f4084, %f4081;
+
+$L__BB0_1230:
+ add.f32 %f5798, %f5745, %f5748;
+ mul.f32 %f4085, %f1415, 0f3F22F983;
+ cvt.rni.s32.f32 %r8298, %f4085;
+ cvt.rn.f32.s32 %f4086, %r8298;
+ mov.f32 %f4087, 0fBFC90FDA;
+ fma.rn.f32 %f4088, %f4086, %f4087, %f1415;
+ mov.f32 %f4089, 0fB3A22168;
+ fma.rn.f32 %f4090, %f4086, %f4089, %f4088;
+ mov.f32 %f4091, 0fA7C234C5;
+ fma.rn.f32 %f5749, %f4086, %f4091, %f4090;
+ abs.f32 %f1448, %f1415;
+ setp.ltu.f32 %p1051, %f1448, 0f47CE4780;
+ @%p1051 bra $L__BB0_1238;
+
+ setp.eq.f32 %p1052, %f1448, 0f7F800000;
+ @%p1052 bra $L__BB0_1237;
+ bra.uni $L__BB0_1232;
+
+$L__BB0_1237:
+ mov.f32 %f4094, 0f00000000;
+ mul.rn.f32 %f5749, %f1415, %f4094;
+ mov.u32 %r8298, 0;
+ bra.uni $L__BB0_1238;
+
+$L__BB0_1232:
+ mov.b32 %r1614, %f1415;
+ shr.u32 %r5940, %r1614, 23;
+ and.b32 %r5941, %r5940, 255;
+ add.s32 %r1615, %r5941, -128;
+ shl.b32 %r5942, %r1614, 8;
+ or.b32 %r1616, %r5942, -2147483648;
+ shr.u32 %r1617, %r1615, 5;
+ mov.u64 %rd2669, 0;
+ mov.u32 %r8295, 0;
+ mov.u64 %rd1918, __cudart_i2opi_f;
+ mov.u64 %rd2670, %rd2669;
+
+$L__BB0_1233:
.pragma "nounroll";
- shl.b64 %rd675, %rd2502, 2;
- add.s64 %rd677, %rd676, %rd675;
- ld.global.nc.u32 %r2904, [%rd677];
- mad.wide.u32 %rd678, %r2904, %r85, %rd2503;
- shr.u64 %rd2503, %rd678, 32;
- add.s64 %rd679, %rd1, %rd675;
- st.local.u32 [%rd679], %rd678;
- add.s32 %r8234, %r8234, 1;
- cvt.s64.s32 %rd2502, %r8234;
- setp.ne.s32 %p93, %r8234, 6;
- @%p93 bra $L__BB0_85;
-
- add.s32 %r8094, %r2902, -128;
- mov.b32 %r8093, %f5347;
- shr.u32 %r8092, %r8093, 23;
- and.b32 %r8091, %r8092, 255;
- add.s32 %r8090, %r8091, -128;
- shr.u32 %r8089, %r8090, 5;
- st.local.u32 [%rd5], %rd2503;
- mov.u32 %r2906, 6;
- sub.s32 %r2907, %r2906, %r8089;
- mul.wide.s32 %rd680, %r2907, 4;
- add.s64 %rd681, %rd1, %rd680;
- ld.local.u32 %r8235, [%rd681];
- ld.local.u32 %r8236, [%rd681+-4];
- and.b32 %r92, %r8090, 31;
- setp.eq.s32 %p94, %r92, 0;
- @%p94 bra $L__BB0_88;
-
- mov.b32 %r8102, %f5347;
- shr.u32 %r8101, %r8102, 23;
- and.b32 %r8100, %r8101, 255;
- add.s32 %r8099, %r8100, -128;
- shr.u32 %r8098, %r8099, 5;
- mov.u32 %r8097, 4;
- sub.s32 %r8096, %r8097, %r8098;
- mov.u32 %r2908, 32;
- sub.s32 %r2909, %r2908, %r92;
- shr.u32 %r2910, %r8236, %r2909;
- shl.b32 %r2911, %r8235, %r92;
- add.s32 %r8235, %r2910, %r2911;
- mul.wide.s32 %rd682, %r8096, 4;
- add.s64 %rd683, %rd1, %rd682;
- ld.local.u32 %r2912, [%rd683];
- shr.u32 %r2913, %r2912, %r2909;
- shl.b32 %r2914, %r8236, %r92;
- add.s32 %r8236, %r2913, %r2914;
-
-$L__BB0_88:
- mov.b32 %r8095, %f5347;
- and.b32 %r2915, %r8095, -2147483648;
- shr.u32 %r2916, %r8236, 30;
- shl.b32 %r2917, %r8235, 2;
- or.b32 %r2918, %r2916, %r2917;
- shr.u32 %r2919, %r2918, 31;
- shr.u32 %r2920, %r8235, 30;
- add.s32 %r2921, %r2919, %r2920;
- neg.s32 %r2922, %r2921;
- setp.eq.s32 %p95, %r2915, 0;
- selp.b32 %r8237, %r2921, %r2922, %p95;
- setp.ne.s32 %p96, %r2919, 0;
- xor.b32 %r2923, %r2915, -2147483648;
- selp.b32 %r2924, %r2923, %r2915, %p96;
- selp.b32 %r2925, -1, 0, %p96;
- xor.b32 %r2926, %r2918, %r2925;
- shl.b32 %r2927, %r8236, 2;
- xor.b32 %r2928, %r2927, %r2925;
- cvt.u64.u32 %rd684, %r2926;
- cvt.u64.u32 %rd685, %r2928;
- bfi.b64 %rd686, %rd684, %rd685, 32, 32;
- cvt.rn.f64.s64 %fd5, %rd686;
- mul.f64 %fd6, %fd5, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2310, %fd6;
- setp.eq.s32 %p97, %r2924, 0;
- neg.f32 %f2311, %f2310;
- selp.f32 %f5191, %f2310, %f2311, %p97;
-
-$L__BB0_90:
- and.b32 %r99, %r8237, 1;
- setp.eq.s32 %p98, %r99, 0;
- mul.rn.f32 %f69, %f5191, %f5191;
- mov.f32 %f5192, 0fB94D4153;
- @%p98 bra $L__BB0_92;
-
- mov.f32 %f2314, 0fBAB607ED;
- mov.f32 %f2315, 0f37CBAC00;
- fma.rn.f32 %f5192, %f2315, %f69, %f2314;
-
-$L__BB0_92:
- and.b32 %r8103, %r8237, 1;
- setp.eq.s32 %p1809, %r8103, 0;
- selp.f32 %f5162, %f5191, 0f3F800000, %p1809;
- selp.f32 %f2316, 0f3C0885E4, 0f3D2AAABB, %p1809;
- fma.rn.f32 %f2317, %f5192, %f69, %f2316;
- selp.f32 %f2318, 0fBE2AAAA8, 0fBEFFFFFF, %p1809;
- fma.rn.f32 %f2319, %f2317, %f69, %f2318;
- mov.f32 %f2320, 0f00000000;
- fma.rn.f32 %f2321, %f69, %f5162, %f2320;
- fma.rn.f32 %f5213, %f2319, %f2321, %f5162;
- and.b32 %r2930, %r8237, 2;
- setp.eq.s32 %p100, %r2930, 0;
- @%p100 bra $L__BB0_94;
-
- mov.f32 %f2323, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f2323, %f2320;
-
-$L__BB0_94:
- shl.b32 %r8088, %r12, 5;
- mov.u32 %r8087, -32;
- sub.s32 %r8086, %r8087, %r8088;
- setp.ge.s32 %p1807, %r14, %r8086;
- mov.f32 %f5215, %f5216;
- @%p1807 bra $L__BB0_107;
-
- mul.f32 %f2324, %f5339, 0f3F22F983;
- cvt.rni.s32.f32 %r8241, %f2324;
- cvt.rn.f32.s32 %f2325, %r8241;
- mov.f32 %f2326, 0fBFC90FDA;
- fma.rn.f32 %f2327, %f2325, %f2326, %f5339;
- mov.f32 %f2328, 0fB3A22168;
- fma.rn.f32 %f2329, %f2325, %f2328, %f2327;
- mov.f32 %f2330, 0fA7C234C5;
- fma.rn.f32 %f5195, %f2325, %f2330, %f2329;
- abs.f32 %f77, %f5339;
- setp.ltu.f32 %p102, %f77, 0f47CE4780;
- @%p102 bra $L__BB0_103;
-
- setp.eq.f32 %p103, %f77, 0f7F800000;
- @%p103 bra $L__BB0_102;
- bra.uni $L__BB0_97;
-
-$L__BB0_102:
- mov.f32 %f2333, 0f00000000;
- mul.rn.f32 %f5195, %f5339, %f2333;
- mov.u32 %r8241, 0;
- bra.uni $L__BB0_103;
-
-$L__BB0_97:
- mov.b32 %r101, %f5339;
- shr.u32 %r2932, %r101, 23;
- and.b32 %r2933, %r2932, 255;
- shl.b32 %r2934, %r101, 8;
- or.b32 %r103, %r2934, -2147483648;
- mov.u64 %rd2504, 0;
- mov.u32 %r8238, 0;
- mov.u64 %rd690, __cudart_i2opi_f;
- mov.u64 %rd2505, %rd2504;
-
-$L__BB0_98:
+ shl.b64 %rd1917, %rd2669, 2;
+ add.s64 %rd1919, %rd1918, %rd1917;
+ ld.global.nc.u32 %r5943, [%rd1919];
+ mad.wide.u32 %rd1920, %r5943, %r1616, %rd2670;
+ shr.u64 %rd2670, %rd1920, 32;
+ add.s64 %rd1921, %rd1, %rd1917;
+ st.local.u32 [%rd1921], %rd1920;
+ add.s32 %r8295, %r8295, 1;
+ cvt.s64.s32 %rd2669, %r8295;
+ setp.ne.s32 %p1053, %r8295, 6;
+ @%p1053 bra $L__BB0_1233;
+
+ st.local.u32 [%rd4], %rd2670;
+ mov.u32 %r5944, 4;
+ sub.s32 %r1620, %r5944, %r1617;
+ mov.u32 %r5945, 6;
+ sub.s32 %r5946, %r5945, %r1617;
+ mul.wide.s32 %rd1922, %r5946, 4;
+ add.s64 %rd1923, %rd1, %rd1922;
+ ld.local.u32 %r8296, [%rd1923];
+ ld.local.u32 %r8297, [%rd1923+-4];
+ and.b32 %r1623, %r1615, 31;
+ setp.eq.s32 %p1054, %r1623, 0;
+ @%p1054 bra $L__BB0_1236;
+
+ mov.u32 %r5947, 32;
+ sub.s32 %r5948, %r5947, %r1623;
+ shr.u32 %r5949, %r8297, %r5948;
+ shl.b32 %r5950, %r8296, %r1623;
+ add.s32 %r8296, %r5949, %r5950;
+ mul.wide.s32 %rd1924, %r1620, 4;
+ add.s64 %rd1925, %rd1, %rd1924;
+ ld.local.u32 %r5951, [%rd1925];
+ shr.u32 %r5952, %r5951, %r5948;
+ shl.b32 %r5953, %r8297, %r1623;
+ add.s32 %r8297, %r5952, %r5953;
+
+$L__BB0_1236:
+ and.b32 %r5954, %r1614, -2147483648;
+ shr.u32 %r5955, %r8297, 30;
+ shl.b32 %r5956, %r8296, 2;
+ or.b32 %r5957, %r5955, %r5956;
+ shr.u32 %r5958, %r5957, 31;
+ shr.u32 %r5959, %r8296, 30;
+ add.s32 %r5960, %r5958, %r5959;
+ neg.s32 %r5961, %r5960;
+ setp.eq.s32 %p1055, %r5954, 0;
+ selp.b32 %r8298, %r5960, %r5961, %p1055;
+ setp.ne.s32 %p1056, %r5958, 0;
+ xor.b32 %r5962, %r5954, -2147483648;
+ selp.b32 %r5963, %r5962, %r5954, %p1056;
+ selp.b32 %r5964, -1, 0, %p1056;
+ xor.b32 %r5965, %r5957, %r5964;
+ shl.b32 %r5966, %r8297, 2;
+ xor.b32 %r5967, %r5966, %r5964;
+ cvt.u64.u32 %rd1926, %r5965;
+ cvt.u64.u32 %rd1927, %r5967;
+ bfi.b64 %rd1928, %rd1926, %rd1927, 32, 32;
+ cvt.rn.f64.s64 %fd165, %rd1928;
+ mul.f64 %fd166, %fd165, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4092, %fd166;
+ setp.eq.s32 %p1057, %r5963, 0;
+ neg.f32 %f4093, %f4092;
+ selp.f32 %f5749, %f4092, %f4093, %p1057;
+
+$L__BB0_1238:
+ and.b32 %r1630, %r8298, 1;
+ setp.eq.s32 %p1058, %r1630, 0;
+ selp.f32 %f1452, %f5749, 0f3F800000, %p1058;
+ mul.rn.f32 %f1453, %f5749, %f5749;
+ mov.f32 %f5750, 0fB94D4153;
+ @%p1058 bra $L__BB0_1240;
+
+ mov.f32 %f4096, 0fBAB607ED;
+ mov.f32 %f4097, 0f37CBAC00;
+ fma.rn.f32 %f5750, %f4097, %f1453, %f4096;
+
+$L__BB0_1240:
+ selp.f32 %f4098, 0f3C0885E4, 0f3D2AAABB, %p1058;
+ fma.rn.f32 %f4099, %f5750, %f1453, %f4098;
+ selp.f32 %f4100, 0fBE2AAAA8, 0fBEFFFFFF, %p1058;
+ fma.rn.f32 %f4101, %f4099, %f1453, %f4100;
+ mov.f32 %f4102, 0f00000000;
+ fma.rn.f32 %f4103, %f1453, %f1452, %f4102;
+ fma.rn.f32 %f5751, %f4101, %f4103, %f1452;
+ and.b32 %r5969, %r8298, 2;
+ setp.eq.s32 %p1060, %r5969, 0;
+ @%p1060 bra $L__BB0_1242;
+
+ mov.f32 %f4105, 0fBF800000;
+ fma.rn.f32 %f5751, %f5751, %f4105, %f4102;
+
+$L__BB0_1242:
+ mul.f32 %f4106, %f1407, 0f3F22F983;
+ cvt.rni.s32.f32 %r8302, %f4106;
+ cvt.rn.f32.s32 %f4107, %r8302;
+ mov.f32 %f4108, 0fBFC90FDA;
+ fma.rn.f32 %f4109, %f4107, %f4108, %f1407;
+ mov.f32 %f4110, 0fB3A22168;
+ fma.rn.f32 %f4111, %f4107, %f4110, %f4109;
+ mov.f32 %f4112, 0fA7C234C5;
+ fma.rn.f32 %f5752, %f4107, %f4112, %f4111;
+ abs.f32 %f1460, %f1407;
+ setp.ltu.f32 %p1061, %f1460, 0f47CE4780;
+ @%p1061 bra $L__BB0_1250;
+
+ setp.eq.f32 %p1062, %f1460, 0f7F800000;
+ @%p1062 bra $L__BB0_1249;
+ bra.uni $L__BB0_1244;
+
+$L__BB0_1249:
+ mov.f32 %f4115, 0f00000000;
+ mul.rn.f32 %f5752, %f1407, %f4115;
+ mov.u32 %r8302, 0;
+ bra.uni $L__BB0_1250;
+
+$L__BB0_1244:
+ mov.b32 %r1632, %f1407;
+ shr.u32 %r5971, %r1632, 23;
+ and.b32 %r5972, %r5971, 255;
+ add.s32 %r1633, %r5972, -128;
+ shl.b32 %r5973, %r1632, 8;
+ or.b32 %r1634, %r5973, -2147483648;
+ shr.u32 %r1635, %r1633, 5;
+ mov.u64 %rd2671, 0;
+ mov.u32 %r8299, 0;
+ mov.u64 %rd1932, __cudart_i2opi_f;
+ mov.u64 %rd2672, %rd2671;
+
+$L__BB0_1245:
.pragma "nounroll";
- shl.b64 %rd689, %rd2504, 2;
- add.s64 %rd691, %rd690, %rd689;
- ld.global.nc.u32 %r2935, [%rd691];
- mad.wide.u32 %rd692, %r2935, %r103, %rd2505;
- shr.u64 %rd2505, %rd692, 32;
- add.s64 %rd693, %rd1, %rd689;
- st.local.u32 [%rd693], %rd692;
- add.s32 %r8238, %r8238, 1;
- cvt.s64.s32 %rd2504, %r8238;
- setp.ne.s32 %p104, %r8238, 6;
- @%p104 bra $L__BB0_98;
-
- add.s32 %r8112, %r2933, -128;
- mov.b32 %r8111, %f5339;
- shr.u32 %r8110, %r8111, 23;
- and.b32 %r8109, %r8110, 255;
- add.s32 %r8108, %r8109, -128;
- shr.u32 %r8107, %r8108, 5;
- st.local.u32 [%rd5], %rd2505;
- mov.u32 %r2937, 6;
- sub.s32 %r2938, %r2937, %r8107;
- mul.wide.s32 %rd694, %r2938, 4;
- add.s64 %rd695, %rd1, %rd694;
- ld.local.u32 %r8239, [%rd695];
- ld.local.u32 %r8240, [%rd695+-4];
- and.b32 %r110, %r8108, 31;
- setp.eq.s32 %p105, %r110, 0;
- @%p105 bra $L__BB0_101;
-
- mov.b32 %r8120, %f5339;
- shr.u32 %r8119, %r8120, 23;
- and.b32 %r8118, %r8119, 255;
- add.s32 %r8117, %r8118, -128;
- shr.u32 %r8116, %r8117, 5;
- mov.u32 %r8115, 4;
- sub.s32 %r8114, %r8115, %r8116;
- mov.u32 %r2939, 32;
- sub.s32 %r2940, %r2939, %r110;
- shr.u32 %r2941, %r8240, %r2940;
- shl.b32 %r2942, %r8239, %r110;
- add.s32 %r8239, %r2941, %r2942;
- mul.wide.s32 %rd696, %r8114, 4;
- add.s64 %rd697, %rd1, %rd696;
- ld.local.u32 %r2943, [%rd697];
- shr.u32 %r2944, %r2943, %r2940;
- shl.b32 %r2945, %r8240, %r110;
- add.s32 %r8240, %r2944, %r2945;
-
-$L__BB0_101:
- mov.b32 %r8113, %f5339;
- and.b32 %r2946, %r8113, -2147483648;
- shr.u32 %r2947, %r8240, 30;
- shl.b32 %r2948, %r8239, 2;
- or.b32 %r2949, %r2947, %r2948;
- shr.u32 %r2950, %r2949, 31;
- shr.u32 %r2951, %r8239, 30;
- add.s32 %r2952, %r2950, %r2951;
- neg.s32 %r2953, %r2952;
- setp.eq.s32 %p106, %r2946, 0;
- selp.b32 %r8241, %r2952, %r2953, %p106;
- setp.ne.s32 %p107, %r2950, 0;
- xor.b32 %r2954, %r2946, -2147483648;
- selp.b32 %r2955, %r2954, %r2946, %p107;
- selp.b32 %r2956, -1, 0, %p107;
- xor.b32 %r2957, %r2949, %r2956;
- shl.b32 %r2958, %r8240, 2;
- xor.b32 %r2959, %r2958, %r2956;
- cvt.u64.u32 %rd698, %r2957;
- cvt.u64.u32 %rd699, %r2959;
- bfi.b64 %rd700, %rd698, %rd699, 32, 32;
- cvt.rn.f64.s64 %fd7, %rd700;
- mul.f64 %fd8, %fd7, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2331, %fd8;
- setp.eq.s32 %p108, %r2955, 0;
- neg.f32 %f2332, %f2331;
- selp.f32 %f5195, %f2331, %f2332, %p108;
-
-$L__BB0_103:
- add.s32 %r117, %r8241, 1;
- and.b32 %r118, %r117, 1;
- setp.eq.s32 %p109, %r118, 0;
- selp.f32 %f81, %f5195, 0f3F800000, %p109;
- mul.rn.f32 %f82, %f5195, %f5195;
- mov.f32 %f5196, 0fB94D4153;
- @%p109 bra $L__BB0_105;
-
- mov.f32 %f2335, 0fBAB607ED;
- mov.f32 %f2336, 0f37CBAC00;
- fma.rn.f32 %f5196, %f2336, %f82, %f2335;
-
-$L__BB0_105:
- add.s32 %r8123, %r8241, 1;
- add.s32 %r8122, %r8241, 1;
- and.b32 %r8121, %r8122, 1;
- setp.eq.s32 %p1811, %r8121, 0;
- selp.f32 %f2337, 0f3C0885E4, 0f3D2AAABB, %p1811;
- fma.rn.f32 %f2338, %f5196, %f82, %f2337;
- selp.f32 %f2339, 0fBE2AAAA8, 0fBEFFFFFF, %p1811;
- fma.rn.f32 %f2340, %f2338, %f82, %f2339;
- mov.f32 %f2341, 0f00000000;
- fma.rn.f32 %f2342, %f82, %f81, %f2341;
- fma.rn.f32 %f5215, %f2340, %f2342, %f81;
- and.b32 %r2961, %r8122, 2;
- setp.eq.s32 %p111, %r2961, 0;
- @%p111 bra $L__BB0_107;
-
- mov.f32 %f2344, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f2344, %f2341;
-
-$L__BB0_107:
- shl.b32 %r8106, %r12, 5;
- mov.u32 %r8105, -32;
- sub.s32 %r8104, %r8105, %r8106;
- setp.lt.s32 %p1810, %r14, %r8104;
- setp.ge.s32 %p1808, %r14, %r81;
- selp.f32 %f89, %f5215, %f5216, %p1810;
- selp.f32 %f90, %f5213, %f5214, %p1810;
- @%p1808 bra $L__BB0_109;
-
- add.f32 %f5331, %f90, %f89;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_109:
- not.b32 %r8184, %r12;
- add.s32 %r8183, %r13, -15;
- setp.ge.s32 %p1817, %r8183, %r8184;
- @%p1817 bra $L__BB0_138;
-
- shl.b32 %r2964, %r12, 5;
- neg.s32 %r119, %r2964;
- setp.ge.s32 %p115, %r14, %r119;
- @%p115 bra $L__BB0_123;
-
- mul.f32 %f2347, %f5346, 0f3F22F983;
- cvt.rni.s32.f32 %r8245, %f2347;
- cvt.rn.f32.s32 %f2348, %r8245;
- mov.f32 %f2349, 0fBFC90FDA;
- fma.rn.f32 %f2350, %f2348, %f2349, %f5346;
- mov.f32 %f2351, 0fB3A22168;
- fma.rn.f32 %f2352, %f2348, %f2351, %f2350;
- mov.f32 %f2353, 0fA7C234C5;
- fma.rn.f32 %f5204, %f2348, %f2353, %f2352;
- abs.f32 %f98, %f5346;
- setp.ltu.f32 %p116, %f98, 0f47CE4780;
- @%p116 bra $L__BB0_119;
-
- setp.eq.f32 %p117, %f98, 0f7F800000;
- @%p117 bra $L__BB0_118;
- bra.uni $L__BB0_113;
-
-$L__BB0_118:
- mov.f32 %f2356, 0f00000000;
- mul.rn.f32 %f5204, %f5346, %f2356;
- mov.u32 %r8245, 0;
- bra.uni $L__BB0_119;
-
-$L__BB0_113:
- mov.b32 %r121, %f5346;
- shr.u32 %r2966, %r121, 23;
- and.b32 %r2967, %r2966, 255;
- shl.b32 %r2968, %r121, 8;
- or.b32 %r123, %r2968, -2147483648;
- mov.u64 %rd2506, 0;
- mov.u32 %r8242, 0;
- mov.u64 %rd704, __cudart_i2opi_f;
- mov.u64 %rd2507, %rd2506;
-
-$L__BB0_114:
+ shl.b64 %rd1931, %rd2671, 2;
+ add.s64 %rd1933, %rd1932, %rd1931;
+ ld.global.nc.u32 %r5974, [%rd1933];
+ mad.wide.u32 %rd1934, %r5974, %r1634, %rd2672;
+ shr.u64 %rd2672, %rd1934, 32;
+ add.s64 %rd1935, %rd1, %rd1931;
+ st.local.u32 [%rd1935], %rd1934;
+ add.s32 %r8299, %r8299, 1;
+ cvt.s64.s32 %rd2671, %r8299;
+ setp.ne.s32 %p1063, %r8299, 6;
+ @%p1063 bra $L__BB0_1245;
+
+ st.local.u32 [%rd4], %rd2672;
+ mov.u32 %r5975, 4;
+ sub.s32 %r1638, %r5975, %r1635;
+ mov.u32 %r5976, 6;
+ sub.s32 %r5977, %r5976, %r1635;
+ mul.wide.s32 %rd1936, %r5977, 4;
+ add.s64 %rd1937, %rd1, %rd1936;
+ ld.local.u32 %r8300, [%rd1937];
+ ld.local.u32 %r8301, [%rd1937+-4];
+ and.b32 %r1641, %r1633, 31;
+ setp.eq.s32 %p1064, %r1641, 0;
+ @%p1064 bra $L__BB0_1248;
+
+ mov.u32 %r5978, 32;
+ sub.s32 %r5979, %r5978, %r1641;
+ shr.u32 %r5980, %r8301, %r5979;
+ shl.b32 %r5981, %r8300, %r1641;
+ add.s32 %r8300, %r5980, %r5981;
+ mul.wide.s32 %rd1938, %r1638, 4;
+ add.s64 %rd1939, %rd1, %rd1938;
+ ld.local.u32 %r5982, [%rd1939];
+ shr.u32 %r5983, %r5982, %r5979;
+ shl.b32 %r5984, %r8301, %r1641;
+ add.s32 %r8301, %r5983, %r5984;
+
+$L__BB0_1248:
+ and.b32 %r5985, %r1632, -2147483648;
+ shr.u32 %r5986, %r8301, 30;
+ shl.b32 %r5987, %r8300, 2;
+ or.b32 %r5988, %r5986, %r5987;
+ shr.u32 %r5989, %r5988, 31;
+ shr.u32 %r5990, %r8300, 30;
+ add.s32 %r5991, %r5989, %r5990;
+ neg.s32 %r5992, %r5991;
+ setp.eq.s32 %p1065, %r5985, 0;
+ selp.b32 %r8302, %r5991, %r5992, %p1065;
+ setp.ne.s32 %p1066, %r5989, 0;
+ xor.b32 %r5993, %r5985, -2147483648;
+ selp.b32 %r5994, %r5993, %r5985, %p1066;
+ selp.b32 %r5995, -1, 0, %p1066;
+ xor.b32 %r5996, %r5988, %r5995;
+ shl.b32 %r5997, %r8301, 2;
+ xor.b32 %r5998, %r5997, %r5995;
+ cvt.u64.u32 %rd1940, %r5996;
+ cvt.u64.u32 %rd1941, %r5998;
+ bfi.b64 %rd1942, %rd1940, %rd1941, 32, 32;
+ cvt.rn.f64.s64 %fd167, %rd1942;
+ mul.f64 %fd168, %fd167, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4113, %fd168;
+ setp.eq.s32 %p1067, %r5994, 0;
+ neg.f32 %f4114, %f4113;
+ selp.f32 %f5752, %f4113, %f4114, %p1067;
+
+$L__BB0_1250:
+ add.s32 %r1648, %r8302, 1;
+ and.b32 %r1649, %r1648, 1;
+ setp.eq.s32 %p1068, %r1649, 0;
+ selp.f32 %f1464, %f5752, 0f3F800000, %p1068;
+ mul.rn.f32 %f1465, %f5752, %f5752;
+ mov.f32 %f5753, 0fB94D4153;
+ @%p1068 bra $L__BB0_1252;
+
+ mov.f32 %f4117, 0fBAB607ED;
+ mov.f32 %f4118, 0f37CBAC00;
+ fma.rn.f32 %f5753, %f4118, %f1465, %f4117;
+
+$L__BB0_1252:
+ selp.f32 %f4119, 0f3C0885E4, 0f3D2AAABB, %p1068;
+ fma.rn.f32 %f4120, %f5753, %f1465, %f4119;
+ selp.f32 %f4121, 0fBE2AAAA8, 0fBEFFFFFF, %p1068;
+ fma.rn.f32 %f4122, %f4120, %f1465, %f4121;
+ mov.f32 %f4123, 0f00000000;
+ fma.rn.f32 %f4124, %f1465, %f1464, %f4123;
+ fma.rn.f32 %f5754, %f4122, %f4124, %f1464;
+ and.b32 %r6000, %r1648, 2;
+ setp.eq.s32 %p1070, %r6000, 0;
+ @%p1070 bra $L__BB0_1254;
+
+ mov.f32 %f4126, 0fBF800000;
+ fma.rn.f32 %f5754, %f5754, %f4126, %f4123;
+
+$L__BB0_1254:
+ add.f32 %f5797, %f5751, %f5754;
+ mul.f32 %f4127, %f1416, 0f3F22F983;
+ cvt.rni.s32.f32 %r8306, %f4127;
+ cvt.rn.f32.s32 %f4128, %r8306;
+ mov.f32 %f4129, 0fBFC90FDA;
+ fma.rn.f32 %f4130, %f4128, %f4129, %f1416;
+ mov.f32 %f4131, 0fB3A22168;
+ fma.rn.f32 %f4132, %f4128, %f4131, %f4130;
+ mov.f32 %f4133, 0fA7C234C5;
+ fma.rn.f32 %f5755, %f4128, %f4133, %f4132;
+ abs.f32 %f1473, %f1416;
+ setp.ltu.f32 %p1071, %f1473, 0f47CE4780;
+ @%p1071 bra $L__BB0_1262;
+
+ setp.eq.f32 %p1072, %f1473, 0f7F800000;
+ @%p1072 bra $L__BB0_1261;
+ bra.uni $L__BB0_1256;
+
+$L__BB0_1261:
+ mov.f32 %f4136, 0f00000000;
+ mul.rn.f32 %f5755, %f1416, %f4136;
+ mov.u32 %r8306, 0;
+ bra.uni $L__BB0_1262;
+
+$L__BB0_1256:
+ mov.b32 %r1651, %f1416;
+ shr.u32 %r6002, %r1651, 23;
+ and.b32 %r6003, %r6002, 255;
+ add.s32 %r1652, %r6003, -128;
+ shl.b32 %r6004, %r1651, 8;
+ or.b32 %r1653, %r6004, -2147483648;
+ shr.u32 %r1654, %r1652, 5;
+ mov.u64 %rd2673, 0;
+ mov.u32 %r8303, 0;
+ mov.u64 %rd1946, __cudart_i2opi_f;
+ mov.u64 %rd2674, %rd2673;
+
+$L__BB0_1257:
.pragma "nounroll";
- shl.b64 %rd703, %rd2506, 2;
- add.s64 %rd705, %rd704, %rd703;
- ld.global.nc.u32 %r2969, [%rd705];
- mad.wide.u32 %rd706, %r2969, %r123, %rd2507;
- shr.u64 %rd2507, %rd706, 32;
- add.s64 %rd707, %rd1, %rd703;
- st.local.u32 [%rd707], %rd706;
- add.s32 %r8242, %r8242, 1;
- cvt.s64.s32 %rd2506, %r8242;
- setp.ne.s32 %p118, %r8242, 6;
- @%p118 bra $L__BB0_114;
-
- add.s32 %r8129, %r2967, -128;
- mov.b32 %r8128, %f5346;
- shr.u32 %r8127, %r8128, 23;
- and.b32 %r8126, %r8127, 255;
- add.s32 %r8125, %r8126, -128;
- shr.u32 %r8124, %r8125, 5;
- st.local.u32 [%rd5], %rd2507;
- mov.u32 %r2971, 6;
- sub.s32 %r2972, %r2971, %r8124;
- mul.wide.s32 %rd708, %r2972, 4;
- add.s64 %rd709, %rd1, %rd708;
- ld.local.u32 %r8243, [%rd709];
- ld.local.u32 %r8244, [%rd709+-4];
- and.b32 %r130, %r8125, 31;
- setp.eq.s32 %p119, %r130, 0;
- @%p119 bra $L__BB0_117;
-
- mov.b32 %r8141, %f5346;
- shr.u32 %r8140, %r8141, 23;
- and.b32 %r8139, %r8140, 255;
- add.s32 %r8138, %r8139, -128;
- shr.u32 %r8137, %r8138, 5;
- mov.u32 %r8136, 4;
- sub.s32 %r8135, %r8136, %r8137;
- mov.u32 %r2973, 32;
- sub.s32 %r2974, %r2973, %r130;
- shr.u32 %r2975, %r8244, %r2974;
- shl.b32 %r2976, %r8243, %r130;
- add.s32 %r8243, %r2975, %r2976;
- mul.wide.s32 %rd710, %r8135, 4;
- add.s64 %rd711, %rd1, %rd710;
- ld.local.u32 %r2977, [%rd711];
- shr.u32 %r2978, %r2977, %r2974;
- shl.b32 %r2979, %r8244, %r130;
- add.s32 %r8244, %r2978, %r2979;
-
-$L__BB0_117:
- mov.b32 %r8130, %f5346;
- and.b32 %r2980, %r8130, -2147483648;
- shr.u32 %r2981, %r8244, 30;
- shl.b32 %r2982, %r8243, 2;
- or.b32 %r2983, %r2981, %r2982;
- shr.u32 %r2984, %r2983, 31;
- shr.u32 %r2985, %r8243, 30;
- add.s32 %r2986, %r2984, %r2985;
- neg.s32 %r2987, %r2986;
- setp.eq.s32 %p120, %r2980, 0;
- selp.b32 %r8245, %r2986, %r2987, %p120;
- setp.ne.s32 %p121, %r2984, 0;
- xor.b32 %r2988, %r2980, -2147483648;
- selp.b32 %r2989, %r2988, %r2980, %p121;
- selp.b32 %r2990, -1, 0, %p121;
- xor.b32 %r2991, %r2983, %r2990;
- shl.b32 %r2992, %r8244, 2;
- xor.b32 %r2993, %r2992, %r2990;
- cvt.u64.u32 %rd712, %r2991;
- cvt.u64.u32 %rd713, %r2993;
- bfi.b64 %rd714, %rd712, %rd713, 32, 32;
- cvt.rn.f64.s64 %fd9, %rd714;
- mul.f64 %fd10, %fd9, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2354, %fd10;
- setp.eq.s32 %p122, %r2989, 0;
- neg.f32 %f2355, %f2354;
- selp.f32 %f5204, %f2354, %f2355, %p122;
-
-$L__BB0_119:
- and.b32 %r137, %r8245, 1;
- setp.eq.s32 %p123, %r137, 0;
- mul.rn.f32 %f103, %f5204, %f5204;
- mov.f32 %f5205, 0fB94D4153;
- @%p123 bra $L__BB0_121;
-
- mov.f32 %f2358, 0fBAB607ED;
- mov.f32 %f2359, 0f37CBAC00;
- fma.rn.f32 %f5205, %f2359, %f103, %f2358;
-
-$L__BB0_121:
- selp.f32 %f5163, %f5204, 0f3F800000, %p123;
- selp.f32 %f2360, 0f3C0885E4, 0f3D2AAABB, %p123;
- fma.rn.f32 %f2361, %f5205, %f103, %f2360;
- selp.f32 %f2362, 0fBE2AAAA8, 0fBEFFFFFF, %p123;
- fma.rn.f32 %f2363, %f2361, %f103, %f2362;
- mov.f32 %f2364, 0f00000000;
- fma.rn.f32 %f2365, %f103, %f5163, %f2364;
- fma.rn.f32 %f5213, %f2363, %f2365, %f5163;
- and.b32 %r2995, %r8245, 2;
- setp.eq.s32 %p125, %r2995, 0;
- @%p125 bra $L__BB0_123;
-
- mov.f32 %f2367, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f2367, %f2364;
-
-$L__BB0_123:
- shl.b32 %r8198, %r12, 5;
- neg.s32 %r8197, %r8198;
- setp.ge.s32 %p1822, %r14, %r8197;
- @%p1822 bra $L__BB0_136;
-
- mul.f32 %f2368, %f5338, 0f3F22F983;
- cvt.rni.s32.f32 %r8249, %f2368;
- cvt.rn.f32.s32 %f2369, %r8249;
- mov.f32 %f2370, 0fBFC90FDA;
- fma.rn.f32 %f2371, %f2369, %f2370, %f5338;
- mov.f32 %f2372, 0fB3A22168;
- fma.rn.f32 %f2373, %f2369, %f2372, %f2371;
- mov.f32 %f2374, 0fA7C234C5;
- fma.rn.f32 %f5208, %f2369, %f2374, %f2373;
- abs.f32 %f111, %f5338;
- setp.ltu.f32 %p127, %f111, 0f47CE4780;
- @%p127 bra $L__BB0_132;
-
- setp.eq.f32 %p128, %f111, 0f7F800000;
- @%p128 bra $L__BB0_131;
- bra.uni $L__BB0_126;
-
-$L__BB0_131:
- mov.f32 %f2377, 0f00000000;
- mul.rn.f32 %f5208, %f5338, %f2377;
- mov.u32 %r8249, 0;
- bra.uni $L__BB0_132;
-
-$L__BB0_126:
- mov.b32 %r139, %f5338;
- shr.u32 %r2997, %r139, 23;
- and.b32 %r2998, %r2997, 255;
- shl.b32 %r2999, %r139, 8;
- or.b32 %r141, %r2999, -2147483648;
- mov.u64 %rd2508, 0;
- mov.u32 %r8246, 0;
- mov.u64 %rd718, __cudart_i2opi_f;
- mov.u64 %rd2509, %rd2508;
-
-$L__BB0_127:
+ shl.b64 %rd1945, %rd2673, 2;
+ add.s64 %rd1947, %rd1946, %rd1945;
+ ld.global.nc.u32 %r6005, [%rd1947];
+ mad.wide.u32 %rd1948, %r6005, %r1653, %rd2674;
+ shr.u64 %rd2674, %rd1948, 32;
+ add.s64 %rd1949, %rd1, %rd1945;
+ st.local.u32 [%rd1949], %rd1948;
+ add.s32 %r8303, %r8303, 1;
+ cvt.s64.s32 %rd2673, %r8303;
+ setp.ne.s32 %p1073, %r8303, 6;
+ @%p1073 bra $L__BB0_1257;
+
+ st.local.u32 [%rd4], %rd2674;
+ mov.u32 %r6006, 4;
+ sub.s32 %r1657, %r6006, %r1654;
+ mov.u32 %r6007, 6;
+ sub.s32 %r6008, %r6007, %r1654;
+ mul.wide.s32 %rd1950, %r6008, 4;
+ add.s64 %rd1951, %rd1, %rd1950;
+ ld.local.u32 %r8304, [%rd1951];
+ ld.local.u32 %r8305, [%rd1951+-4];
+ and.b32 %r1660, %r1652, 31;
+ setp.eq.s32 %p1074, %r1660, 0;
+ @%p1074 bra $L__BB0_1260;
+
+ mov.u32 %r6009, 32;
+ sub.s32 %r6010, %r6009, %r1660;
+ shr.u32 %r6011, %r8305, %r6010;
+ shl.b32 %r6012, %r8304, %r1660;
+ add.s32 %r8304, %r6011, %r6012;
+ mul.wide.s32 %rd1952, %r1657, 4;
+ add.s64 %rd1953, %rd1, %rd1952;
+ ld.local.u32 %r6013, [%rd1953];
+ shr.u32 %r6014, %r6013, %r6010;
+ shl.b32 %r6015, %r8305, %r1660;
+ add.s32 %r8305, %r6014, %r6015;
+
+$L__BB0_1260:
+ and.b32 %r6016, %r1651, -2147483648;
+ shr.u32 %r6017, %r8305, 30;
+ shl.b32 %r6018, %r8304, 2;
+ or.b32 %r6019, %r6017, %r6018;
+ shr.u32 %r6020, %r6019, 31;
+ shr.u32 %r6021, %r8304, 30;
+ add.s32 %r6022, %r6020, %r6021;
+ neg.s32 %r6023, %r6022;
+ setp.eq.s32 %p1075, %r6016, 0;
+ selp.b32 %r8306, %r6022, %r6023, %p1075;
+ setp.ne.s32 %p1076, %r6020, 0;
+ xor.b32 %r6024, %r6016, -2147483648;
+ selp.b32 %r6025, %r6024, %r6016, %p1076;
+ selp.b32 %r6026, -1, 0, %p1076;
+ xor.b32 %r6027, %r6019, %r6026;
+ shl.b32 %r6028, %r8305, 2;
+ xor.b32 %r6029, %r6028, %r6026;
+ cvt.u64.u32 %rd1954, %r6027;
+ cvt.u64.u32 %rd1955, %r6029;
+ bfi.b64 %rd1956, %rd1954, %rd1955, 32, 32;
+ cvt.rn.f64.s64 %fd169, %rd1956;
+ mul.f64 %fd170, %fd169, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4134, %fd170;
+ setp.eq.s32 %p1077, %r6025, 0;
+ neg.f32 %f4135, %f4134;
+ selp.f32 %f5755, %f4134, %f4135, %p1077;
+
+$L__BB0_1262:
+ and.b32 %r1667, %r8306, 1;
+ setp.eq.s32 %p1078, %r1667, 0;
+ selp.f32 %f1477, %f5755, 0f3F800000, %p1078;
+ mul.rn.f32 %f1478, %f5755, %f5755;
+ mov.f32 %f5756, 0fB94D4153;
+ @%p1078 bra $L__BB0_1264;
+
+ mov.f32 %f4138, 0fBAB607ED;
+ mov.f32 %f4139, 0f37CBAC00;
+ fma.rn.f32 %f5756, %f4139, %f1478, %f4138;
+
+$L__BB0_1264:
+ selp.f32 %f4140, 0f3C0885E4, 0f3D2AAABB, %p1078;
+ fma.rn.f32 %f4141, %f5756, %f1478, %f4140;
+ selp.f32 %f4142, 0fBE2AAAA8, 0fBEFFFFFF, %p1078;
+ fma.rn.f32 %f4143, %f4141, %f1478, %f4142;
+ mov.f32 %f4144, 0f00000000;
+ fma.rn.f32 %f4145, %f1478, %f1477, %f4144;
+ fma.rn.f32 %f5757, %f4143, %f4145, %f1477;
+ and.b32 %r6031, %r8306, 2;
+ setp.eq.s32 %p1080, %r6031, 0;
+ @%p1080 bra $L__BB0_1266;
+
+ mov.f32 %f4147, 0fBF800000;
+ fma.rn.f32 %f5757, %f5757, %f4147, %f4144;
+
+$L__BB0_1266:
+ mul.f32 %f4148, %f1408, 0f3F22F983;
+ cvt.rni.s32.f32 %r8310, %f4148;
+ cvt.rn.f32.s32 %f4149, %r8310;
+ mov.f32 %f4150, 0fBFC90FDA;
+ fma.rn.f32 %f4151, %f4149, %f4150, %f1408;
+ mov.f32 %f4152, 0fB3A22168;
+ fma.rn.f32 %f4153, %f4149, %f4152, %f4151;
+ mov.f32 %f4154, 0fA7C234C5;
+ fma.rn.f32 %f5758, %f4149, %f4154, %f4153;
+ abs.f32 %f1485, %f1408;
+ setp.ltu.f32 %p1081, %f1485, 0f47CE4780;
+ @%p1081 bra $L__BB0_1274;
+
+ setp.eq.f32 %p1082, %f1485, 0f7F800000;
+ @%p1082 bra $L__BB0_1273;
+ bra.uni $L__BB0_1268;
+
+$L__BB0_1273:
+ mov.f32 %f4157, 0f00000000;
+ mul.rn.f32 %f5758, %f1408, %f4157;
+ mov.u32 %r8310, 0;
+ bra.uni $L__BB0_1274;
+
+$L__BB0_1268:
+ mov.b32 %r1669, %f1408;
+ shr.u32 %r6033, %r1669, 23;
+ and.b32 %r6034, %r6033, 255;
+ add.s32 %r1670, %r6034, -128;
+ shl.b32 %r6035, %r1669, 8;
+ or.b32 %r1671, %r6035, -2147483648;
+ shr.u32 %r1672, %r1670, 5;
+ mov.u64 %rd2675, 0;
+ mov.u32 %r8307, 0;
+ mov.u64 %rd1960, __cudart_i2opi_f;
+ mov.u64 %rd2676, %rd2675;
+
+$L__BB0_1269:
.pragma "nounroll";
- shl.b64 %rd717, %rd2508, 2;
- add.s64 %rd719, %rd718, %rd717;
- ld.global.nc.u32 %r3000, [%rd719];
- mad.wide.u32 %rd720, %r3000, %r141, %rd2509;
- shr.u64 %rd2509, %rd720, 32;
- add.s64 %rd721, %rd1, %rd717;
- st.local.u32 [%rd721], %rd720;
- add.s32 %r8246, %r8246, 1;
- cvt.s64.s32 %rd2508, %r8246;
- setp.ne.s32 %p129, %r8246, 6;
- @%p129 bra $L__BB0_127;
-
- add.s32 %r8147, %r2998, -128;
- mov.b32 %r8146, %f5338;
- shr.u32 %r8145, %r8146, 23;
- and.b32 %r8144, %r8145, 255;
- add.s32 %r8143, %r8144, -128;
- shr.u32 %r8142, %r8143, 5;
- st.local.u32 [%rd5], %rd2509;
- mov.u32 %r3002, 6;
- sub.s32 %r3003, %r3002, %r8142;
- mul.wide.s32 %rd722, %r3003, 4;
- add.s64 %rd723, %rd1, %rd722;
- ld.local.u32 %r8247, [%rd723];
- ld.local.u32 %r8248, [%rd723+-4];
- and.b32 %r148, %r8143, 31;
- setp.eq.s32 %p130, %r148, 0;
- @%p130 bra $L__BB0_130;
-
- mov.b32 %r8159, %f5338;
- shr.u32 %r8158, %r8159, 23;
- and.b32 %r8157, %r8158, 255;
- add.s32 %r8156, %r8157, -128;
- shr.u32 %r8155, %r8156, 5;
- mov.u32 %r8154, 4;
- sub.s32 %r8153, %r8154, %r8155;
- mov.u32 %r3004, 32;
- sub.s32 %r3005, %r3004, %r148;
- shr.u32 %r3006, %r8248, %r3005;
- shl.b32 %r3007, %r8247, %r148;
- add.s32 %r8247, %r3006, %r3007;
- mul.wide.s32 %rd724, %r8153, 4;
- add.s64 %rd725, %rd1, %rd724;
- ld.local.u32 %r3008, [%rd725];
- shr.u32 %r3009, %r3008, %r3005;
- shl.b32 %r3010, %r8248, %r148;
- add.s32 %r8248, %r3009, %r3010;
-
-$L__BB0_130:
- mov.b32 %r8148, %f5338;
- and.b32 %r3011, %r8148, -2147483648;
- shr.u32 %r3012, %r8248, 30;
- shl.b32 %r3013, %r8247, 2;
- or.b32 %r3014, %r3012, %r3013;
- shr.u32 %r3015, %r3014, 31;
- shr.u32 %r3016, %r8247, 30;
- add.s32 %r3017, %r3015, %r3016;
- neg.s32 %r3018, %r3017;
- setp.eq.s32 %p131, %r3011, 0;
- selp.b32 %r8249, %r3017, %r3018, %p131;
- setp.ne.s32 %p132, %r3015, 0;
- xor.b32 %r3019, %r3011, -2147483648;
- selp.b32 %r3020, %r3019, %r3011, %p132;
- selp.b32 %r3021, -1, 0, %p132;
- xor.b32 %r3022, %r3014, %r3021;
- shl.b32 %r3023, %r8248, 2;
- xor.b32 %r3024, %r3023, %r3021;
- cvt.u64.u32 %rd726, %r3022;
- cvt.u64.u32 %rd727, %r3024;
- bfi.b64 %rd728, %rd726, %rd727, 32, 32;
- cvt.rn.f64.s64 %fd11, %rd728;
- mul.f64 %fd12, %fd11, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2375, %fd12;
- setp.eq.s32 %p133, %r3020, 0;
- neg.f32 %f2376, %f2375;
- selp.f32 %f5208, %f2375, %f2376, %p133;
-
-$L__BB0_132:
- add.s32 %r155, %r8249, 1;
- and.b32 %r156, %r155, 1;
- setp.eq.s32 %p134, %r156, 0;
- selp.f32 %f115, %f5208, 0f3F800000, %p134;
- mul.rn.f32 %f116, %f5208, %f5208;
- mov.f32 %f5209, 0fB94D4153;
- @%p134 bra $L__BB0_134;
-
- mov.f32 %f2379, 0fBAB607ED;
- mov.f32 %f2380, 0f37CBAC00;
- fma.rn.f32 %f5209, %f2380, %f116, %f2379;
-
-$L__BB0_134:
- selp.f32 %f2381, 0f3C0885E4, 0f3D2AAABB, %p134;
- fma.rn.f32 %f2382, %f5209, %f116, %f2381;
- selp.f32 %f2383, 0fBE2AAAA8, 0fBEFFFFFF, %p134;
- fma.rn.f32 %f2384, %f2382, %f116, %f2383;
- mov.f32 %f2385, 0f00000000;
- fma.rn.f32 %f2386, %f116, %f115, %f2385;
- fma.rn.f32 %f5215, %f2384, %f2386, %f115;
- and.b32 %r3026, %r155, 2;
- setp.eq.s32 %p136, %r3026, 0;
- @%p136 bra $L__BB0_136;
-
- mov.f32 %f2388, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f2388, %f2385;
-
-$L__BB0_136:
- shl.b32 %r8152, %r12, 5;
- neg.s32 %r8151, %r8152;
- setp.lt.s32 %p1814, %r14, %r8151;
- shl.b32 %r8150, %r12, 5;
- neg.s32 %r8149, %r8150;
- setp.ge.s32 %p1813, %r14, %r8149;
- selp.f32 %f123, %f5215, %f5216, %p1814;
- selp.f32 %f124, %f5213, %f5214, %p1814;
- @%p1813 bra $L__BB0_138;
-
- add.f32 %f5330, %f124, %f123;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_138:
- not.b32 %r8134, %r12;
- add.s32 %r8133, %r13, -15;
- setp.ge.s32 %p1812, %r8133, %r8134;
- @%p1812 bra $L__BB0_167;
-
- shl.b32 %r3029, %r12, 5;
- mov.u32 %r3030, -32;
- sub.s32 %r157, %r3030, %r3029;
- setp.ge.s32 %p140, %r14, %r157;
- @%p140 bra $L__BB0_152;
-
- mul.f32 %f2391, %f5345, 0f3F22F983;
- cvt.rni.s32.f32 %r8253, %f2391;
- cvt.rn.f32.s32 %f2392, %r8253;
- mov.f32 %f2393, 0fBFC90FDA;
- fma.rn.f32 %f2394, %f2392, %f2393, %f5345;
- mov.f32 %f2395, 0fB3A22168;
- fma.rn.f32 %f2396, %f2392, %f2395, %f2394;
- mov.f32 %f2397, 0fA7C234C5;
- fma.rn.f32 %f5217, %f2392, %f2397, %f2396;
- abs.f32 %f132, %f5345;
- setp.ltu.f32 %p141, %f132, 0f47CE4780;
- @%p141 bra $L__BB0_148;
-
- setp.eq.f32 %p142, %f132, 0f7F800000;
- @%p142 bra $L__BB0_147;
- bra.uni $L__BB0_142;
-
-$L__BB0_147:
- mov.f32 %f2400, 0f00000000;
- mul.rn.f32 %f5217, %f5345, %f2400;
- mov.u32 %r8253, 0;
- bra.uni $L__BB0_148;
-
-$L__BB0_142:
- mov.b32 %r159, %f5345;
- shr.u32 %r3032, %r159, 23;
- and.b32 %r3033, %r3032, 255;
- shl.b32 %r3034, %r159, 8;
- or.b32 %r161, %r3034, -2147483648;
- mov.u64 %rd2510, 0;
- mov.u32 %r8250, 0;
- mov.u64 %rd732, __cudart_i2opi_f;
- mov.u64 %rd2511, %rd2510;
-
-$L__BB0_143:
+ shl.b64 %rd1959, %rd2675, 2;
+ add.s64 %rd1961, %rd1960, %rd1959;
+ ld.global.nc.u32 %r6036, [%rd1961];
+ mad.wide.u32 %rd1962, %r6036, %r1671, %rd2676;
+ shr.u64 %rd2676, %rd1962, 32;
+ add.s64 %rd1963, %rd1, %rd1959;
+ st.local.u32 [%rd1963], %rd1962;
+ add.s32 %r8307, %r8307, 1;
+ cvt.s64.s32 %rd2675, %r8307;
+ setp.ne.s32 %p1083, %r8307, 6;
+ @%p1083 bra $L__BB0_1269;
+
+ st.local.u32 [%rd4], %rd2676;
+ mov.u32 %r6037, 4;
+ sub.s32 %r1675, %r6037, %r1672;
+ mov.u32 %r6038, 6;
+ sub.s32 %r6039, %r6038, %r1672;
+ mul.wide.s32 %rd1964, %r6039, 4;
+ add.s64 %rd1965, %rd1, %rd1964;
+ ld.local.u32 %r8308, [%rd1965];
+ ld.local.u32 %r8309, [%rd1965+-4];
+ and.b32 %r1678, %r1670, 31;
+ setp.eq.s32 %p1084, %r1678, 0;
+ @%p1084 bra $L__BB0_1272;
+
+ mov.u32 %r6040, 32;
+ sub.s32 %r6041, %r6040, %r1678;
+ shr.u32 %r6042, %r8309, %r6041;
+ shl.b32 %r6043, %r8308, %r1678;
+ add.s32 %r8308, %r6042, %r6043;
+ mul.wide.s32 %rd1966, %r1675, 4;
+ add.s64 %rd1967, %rd1, %rd1966;
+ ld.local.u32 %r6044, [%rd1967];
+ shr.u32 %r6045, %r6044, %r6041;
+ shl.b32 %r6046, %r8309, %r1678;
+ add.s32 %r8309, %r6045, %r6046;
+
+$L__BB0_1272:
+ and.b32 %r6047, %r1669, -2147483648;
+ shr.u32 %r6048, %r8309, 30;
+ shl.b32 %r6049, %r8308, 2;
+ or.b32 %r6050, %r6048, %r6049;
+ shr.u32 %r6051, %r6050, 31;
+ shr.u32 %r6052, %r8308, 30;
+ add.s32 %r6053, %r6051, %r6052;
+ neg.s32 %r6054, %r6053;
+ setp.eq.s32 %p1085, %r6047, 0;
+ selp.b32 %r8310, %r6053, %r6054, %p1085;
+ setp.ne.s32 %p1086, %r6051, 0;
+ xor.b32 %r6055, %r6047, -2147483648;
+ selp.b32 %r6056, %r6055, %r6047, %p1086;
+ selp.b32 %r6057, -1, 0, %p1086;
+ xor.b32 %r6058, %r6050, %r6057;
+ shl.b32 %r6059, %r8309, 2;
+ xor.b32 %r6060, %r6059, %r6057;
+ cvt.u64.u32 %rd1968, %r6058;
+ cvt.u64.u32 %rd1969, %r6060;
+ bfi.b64 %rd1970, %rd1968, %rd1969, 32, 32;
+ cvt.rn.f64.s64 %fd171, %rd1970;
+ mul.f64 %fd172, %fd171, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4155, %fd172;
+ setp.eq.s32 %p1087, %r6056, 0;
+ neg.f32 %f4156, %f4155;
+ selp.f32 %f5758, %f4155, %f4156, %p1087;
+
+$L__BB0_1274:
+ add.s32 %r1685, %r8310, 1;
+ and.b32 %r1686, %r1685, 1;
+ setp.eq.s32 %p1088, %r1686, 0;
+ selp.f32 %f1489, %f5758, 0f3F800000, %p1088;
+ mul.rn.f32 %f1490, %f5758, %f5758;
+ mov.f32 %f5759, 0fB94D4153;
+ @%p1088 bra $L__BB0_1276;
+
+ mov.f32 %f4159, 0fBAB607ED;
+ mov.f32 %f4160, 0f37CBAC00;
+ fma.rn.f32 %f5759, %f4160, %f1490, %f4159;
+
+$L__BB0_1276:
+ selp.f32 %f4161, 0f3C0885E4, 0f3D2AAABB, %p1088;
+ fma.rn.f32 %f4162, %f5759, %f1490, %f4161;
+ selp.f32 %f4163, 0fBE2AAAA8, 0fBEFFFFFF, %p1088;
+ fma.rn.f32 %f4164, %f4162, %f1490, %f4163;
+ mov.f32 %f4165, 0f00000000;
+ fma.rn.f32 %f4166, %f1490, %f1489, %f4165;
+ fma.rn.f32 %f5760, %f4164, %f4166, %f1489;
+ and.b32 %r6062, %r1685, 2;
+ setp.eq.s32 %p1090, %r6062, 0;
+ @%p1090 bra $L__BB0_1278;
+
+ mov.f32 %f4168, 0fBF800000;
+ fma.rn.f32 %f5760, %f5760, %f4168, %f4165;
+
+$L__BB0_1278:
+ add.f32 %f5796, %f5757, %f5760;
+ mul.f32 %f4169, %f1417, 0f3F22F983;
+ cvt.rni.s32.f32 %r8314, %f4169;
+ cvt.rn.f32.s32 %f4170, %r8314;
+ mov.f32 %f4171, 0fBFC90FDA;
+ fma.rn.f32 %f4172, %f4170, %f4171, %f1417;
+ mov.f32 %f4173, 0fB3A22168;
+ fma.rn.f32 %f4174, %f4170, %f4173, %f4172;
+ mov.f32 %f4175, 0fA7C234C5;
+ fma.rn.f32 %f5761, %f4170, %f4175, %f4174;
+ abs.f32 %f1498, %f1417;
+ setp.ltu.f32 %p1091, %f1498, 0f47CE4780;
+ @%p1091 bra $L__BB0_1286;
+
+ setp.eq.f32 %p1092, %f1498, 0f7F800000;
+ @%p1092 bra $L__BB0_1285;
+ bra.uni $L__BB0_1280;
+
+$L__BB0_1285:
+ mov.f32 %f4178, 0f00000000;
+ mul.rn.f32 %f5761, %f1417, %f4178;
+ mov.u32 %r8314, 0;
+ bra.uni $L__BB0_1286;
+
+$L__BB0_1280:
+ mov.b32 %r1688, %f1417;
+ shr.u32 %r6064, %r1688, 23;
+ and.b32 %r6065, %r6064, 255;
+ add.s32 %r1689, %r6065, -128;
+ shl.b32 %r6066, %r1688, 8;
+ or.b32 %r1690, %r6066, -2147483648;
+ shr.u32 %r1691, %r1689, 5;
+ mov.u64 %rd2677, 0;
+ mov.u32 %r8311, 0;
+ mov.u64 %rd1974, __cudart_i2opi_f;
+ mov.u64 %rd2678, %rd2677;
+
+$L__BB0_1281:
.pragma "nounroll";
- shl.b64 %rd731, %rd2510, 2;
- add.s64 %rd733, %rd732, %rd731;
- ld.global.nc.u32 %r3035, [%rd733];
- mad.wide.u32 %rd734, %r3035, %r161, %rd2511;
- shr.u64 %rd2511, %rd734, 32;
- add.s64 %rd735, %rd1, %rd731;
- st.local.u32 [%rd735], %rd734;
- add.s32 %r8250, %r8250, 1;
- cvt.s64.s32 %rd2510, %r8250;
- setp.ne.s32 %p143, %r8250, 6;
- @%p143 bra $L__BB0_143;
-
- add.s32 %r8165, %r3033, -128;
- mov.b32 %r8164, %f5345;
- shr.u32 %r8163, %r8164, 23;
- and.b32 %r8162, %r8163, 255;
- add.s32 %r8161, %r8162, -128;
- shr.u32 %r8160, %r8161, 5;
- st.local.u32 [%rd5], %rd2511;
- mov.u32 %r3036, 4;
- sub.s32 %r165, %r3036, %r8160;
- mov.u32 %r3037, 6;
- sub.s32 %r3038, %r3037, %r8160;
- mul.wide.s32 %rd736, %r3038, 4;
- add.s64 %rd737, %rd1, %rd736;
- ld.local.u32 %r8251, [%rd737];
- ld.local.u32 %r8252, [%rd737+-4];
- and.b32 %r168, %r8161, 31;
- setp.eq.s32 %p144, %r168, 0;
- @%p144 bra $L__BB0_146;
-
- mov.u32 %r3039, 32;
- sub.s32 %r3040, %r3039, %r168;
- shr.u32 %r3041, %r8252, %r3040;
- shl.b32 %r3042, %r8251, %r168;
- add.s32 %r8251, %r3041, %r3042;
- mul.wide.s32 %rd738, %r165, 4;
- add.s64 %rd739, %rd1, %rd738;
- ld.local.u32 %r3043, [%rd739];
- shr.u32 %r3044, %r3043, %r3040;
- shl.b32 %r3045, %r8252, %r168;
- add.s32 %r8252, %r3044, %r3045;
-
-$L__BB0_146:
- mov.b32 %r8166, %f5345;
- and.b32 %r3046, %r8166, -2147483648;
- shr.u32 %r3047, %r8252, 30;
- shl.b32 %r3048, %r8251, 2;
- or.b32 %r3049, %r3047, %r3048;
- shr.u32 %r3050, %r3049, 31;
- shr.u32 %r3051, %r8251, 30;
- add.s32 %r3052, %r3050, %r3051;
- neg.s32 %r3053, %r3052;
- setp.eq.s32 %p145, %r3046, 0;
- selp.b32 %r8253, %r3052, %r3053, %p145;
- setp.ne.s32 %p146, %r3050, 0;
- xor.b32 %r3054, %r3046, -2147483648;
- selp.b32 %r3055, %r3054, %r3046, %p146;
- selp.b32 %r3056, -1, 0, %p146;
- xor.b32 %r3057, %r3049, %r3056;
- shl.b32 %r3058, %r8252, 2;
- xor.b32 %r3059, %r3058, %r3056;
- cvt.u64.u32 %rd740, %r3057;
- cvt.u64.u32 %rd741, %r3059;
- bfi.b64 %rd742, %rd740, %rd741, 32, 32;
- cvt.rn.f64.s64 %fd13, %rd742;
- mul.f64 %fd14, %fd13, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2398, %fd14;
- setp.eq.s32 %p147, %r3055, 0;
- neg.f32 %f2399, %f2398;
- selp.f32 %f5217, %f2398, %f2399, %p147;
-
-$L__BB0_148:
- and.b32 %r175, %r8253, 1;
- setp.eq.s32 %p148, %r175, 0;
- selp.f32 %f136, %f5217, 0f3F800000, %p148;
- mul.rn.f32 %f137, %f5217, %f5217;
- mov.f32 %f5218, 0fB94D4153;
- @%p148 bra $L__BB0_150;
-
- mov.f32 %f2402, 0fBAB607ED;
- mov.f32 %f2403, 0f37CBAC00;
- fma.rn.f32 %f5218, %f2403, %f137, %f2402;
-
-$L__BB0_150:
- selp.f32 %f2404, 0f3C0885E4, 0f3D2AAABB, %p148;
- fma.rn.f32 %f2405, %f5218, %f137, %f2404;
- selp.f32 %f2406, 0fBE2AAAA8, 0fBEFFFFFF, %p148;
- fma.rn.f32 %f2407, %f2405, %f137, %f2406;
- mov.f32 %f2408, 0f00000000;
- fma.rn.f32 %f2409, %f137, %f136, %f2408;
- fma.rn.f32 %f5213, %f2407, %f2409, %f136;
- and.b32 %r3061, %r8253, 2;
- setp.eq.s32 %p150, %r3061, 0;
- @%p150 bra $L__BB0_152;
-
- mov.f32 %f2411, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f2411, %f2408;
-
-$L__BB0_152:
- shl.b32 %r8201, %r12, 5;
- mov.u32 %r8200, -32;
- sub.s32 %r8199, %r8200, %r8201;
- setp.ge.s32 %p1823, %r14, %r8199;
- @%p1823 bra $L__BB0_165;
-
- mul.f32 %f2412, %f5337, 0f3F22F983;
- cvt.rni.s32.f32 %r8257, %f2412;
- cvt.rn.f32.s32 %f2413, %r8257;
- mov.f32 %f2414, 0fBFC90FDA;
- fma.rn.f32 %f2415, %f2413, %f2414, %f5337;
- mov.f32 %f2416, 0fB3A22168;
- fma.rn.f32 %f2417, %f2413, %f2416, %f2415;
- mov.f32 %f2418, 0fA7C234C5;
- fma.rn.f32 %f5221, %f2413, %f2418, %f2417;
- abs.f32 %f145, %f5337;
- setp.ltu.f32 %p152, %f145, 0f47CE4780;
- @%p152 bra $L__BB0_161;
-
- setp.eq.f32 %p153, %f145, 0f7F800000;
- @%p153 bra $L__BB0_160;
- bra.uni $L__BB0_155;
-
-$L__BB0_160:
- mov.f32 %f2421, 0f00000000;
- mul.rn.f32 %f5221, %f5337, %f2421;
- mov.u32 %r8257, 0;
- bra.uni $L__BB0_161;
-
-$L__BB0_155:
- mov.b32 %r177, %f5337;
- shr.u32 %r3063, %r177, 23;
- and.b32 %r3064, %r3063, 255;
- shl.b32 %r3065, %r177, 8;
- or.b32 %r179, %r3065, -2147483648;
- mov.u64 %rd2512, 0;
- mov.u32 %r8254, 0;
- mov.u64 %rd746, __cudart_i2opi_f;
- mov.u64 %rd2513, %rd2512;
-
-$L__BB0_156:
+ shl.b64 %rd1973, %rd2677, 2;
+ add.s64 %rd1975, %rd1974, %rd1973;
+ ld.global.nc.u32 %r6067, [%rd1975];
+ mad.wide.u32 %rd1976, %r6067, %r1690, %rd2678;
+ shr.u64 %rd2678, %rd1976, 32;
+ add.s64 %rd1977, %rd1, %rd1973;
+ st.local.u32 [%rd1977], %rd1976;
+ add.s32 %r8311, %r8311, 1;
+ cvt.s64.s32 %rd2677, %r8311;
+ setp.ne.s32 %p1093, %r8311, 6;
+ @%p1093 bra $L__BB0_1281;
+
+ st.local.u32 [%rd4], %rd2678;
+ mov.u32 %r6068, 4;
+ sub.s32 %r1694, %r6068, %r1691;
+ mov.u32 %r6069, 6;
+ sub.s32 %r6070, %r6069, %r1691;
+ mul.wide.s32 %rd1978, %r6070, 4;
+ add.s64 %rd1979, %rd1, %rd1978;
+ ld.local.u32 %r8312, [%rd1979];
+ ld.local.u32 %r8313, [%rd1979+-4];
+ and.b32 %r1697, %r1689, 31;
+ setp.eq.s32 %p1094, %r1697, 0;
+ @%p1094 bra $L__BB0_1284;
+
+ mov.u32 %r6071, 32;
+ sub.s32 %r6072, %r6071, %r1697;
+ shr.u32 %r6073, %r8313, %r6072;
+ shl.b32 %r6074, %r8312, %r1697;
+ add.s32 %r8312, %r6073, %r6074;
+ mul.wide.s32 %rd1980, %r1694, 4;
+ add.s64 %rd1981, %rd1, %rd1980;
+ ld.local.u32 %r6075, [%rd1981];
+ shr.u32 %r6076, %r6075, %r6072;
+ shl.b32 %r6077, %r8313, %r1697;
+ add.s32 %r8313, %r6076, %r6077;
+
+$L__BB0_1284:
+ and.b32 %r6078, %r1688, -2147483648;
+ shr.u32 %r6079, %r8313, 30;
+ shl.b32 %r6080, %r8312, 2;
+ or.b32 %r6081, %r6079, %r6080;
+ shr.u32 %r6082, %r6081, 31;
+ shr.u32 %r6083, %r8312, 30;
+ add.s32 %r6084, %r6082, %r6083;
+ neg.s32 %r6085, %r6084;
+ setp.eq.s32 %p1095, %r6078, 0;
+ selp.b32 %r8314, %r6084, %r6085, %p1095;
+ setp.ne.s32 %p1096, %r6082, 0;
+ xor.b32 %r6086, %r6078, -2147483648;
+ selp.b32 %r6087, %r6086, %r6078, %p1096;
+ selp.b32 %r6088, -1, 0, %p1096;
+ xor.b32 %r6089, %r6081, %r6088;
+ shl.b32 %r6090, %r8313, 2;
+ xor.b32 %r6091, %r6090, %r6088;
+ cvt.u64.u32 %rd1982, %r6089;
+ cvt.u64.u32 %rd1983, %r6091;
+ bfi.b64 %rd1984, %rd1982, %rd1983, 32, 32;
+ cvt.rn.f64.s64 %fd173, %rd1984;
+ mul.f64 %fd174, %fd173, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4176, %fd174;
+ setp.eq.s32 %p1097, %r6087, 0;
+ neg.f32 %f4177, %f4176;
+ selp.f32 %f5761, %f4176, %f4177, %p1097;
+
+$L__BB0_1286:
+ and.b32 %r1704, %r8314, 1;
+ setp.eq.s32 %p1098, %r1704, 0;
+ selp.f32 %f1502, %f5761, 0f3F800000, %p1098;
+ mul.rn.f32 %f1503, %f5761, %f5761;
+ mov.f32 %f5762, 0fB94D4153;
+ @%p1098 bra $L__BB0_1288;
+
+ mov.f32 %f4180, 0fBAB607ED;
+ mov.f32 %f4181, 0f37CBAC00;
+ fma.rn.f32 %f5762, %f4181, %f1503, %f4180;
+
+$L__BB0_1288:
+ selp.f32 %f4182, 0f3C0885E4, 0f3D2AAABB, %p1098;
+ fma.rn.f32 %f4183, %f5762, %f1503, %f4182;
+ selp.f32 %f4184, 0fBE2AAAA8, 0fBEFFFFFF, %p1098;
+ fma.rn.f32 %f4185, %f4183, %f1503, %f4184;
+ mov.f32 %f4186, 0f00000000;
+ fma.rn.f32 %f4187, %f1503, %f1502, %f4186;
+ fma.rn.f32 %f5763, %f4185, %f4187, %f1502;
+ and.b32 %r6093, %r8314, 2;
+ setp.eq.s32 %p1100, %r6093, 0;
+ @%p1100 bra $L__BB0_1290;
+
+ mov.f32 %f4189, 0fBF800000;
+ fma.rn.f32 %f5763, %f5763, %f4189, %f4186;
+
+$L__BB0_1290:
+ mul.f32 %f4190, %f1409, 0f3F22F983;
+ cvt.rni.s32.f32 %r8318, %f4190;
+ cvt.rn.f32.s32 %f4191, %r8318;
+ mov.f32 %f4192, 0fBFC90FDA;
+ fma.rn.f32 %f4193, %f4191, %f4192, %f1409;
+ mov.f32 %f4194, 0fB3A22168;
+ fma.rn.f32 %f4195, %f4191, %f4194, %f4193;
+ mov.f32 %f4196, 0fA7C234C5;
+ fma.rn.f32 %f5764, %f4191, %f4196, %f4195;
+ abs.f32 %f1510, %f1409;
+ setp.ltu.f32 %p1101, %f1510, 0f47CE4780;
+ @%p1101 bra $L__BB0_1298;
+
+ setp.eq.f32 %p1102, %f1510, 0f7F800000;
+ @%p1102 bra $L__BB0_1297;
+ bra.uni $L__BB0_1292;
+
+$L__BB0_1297:
+ mov.f32 %f4199, 0f00000000;
+ mul.rn.f32 %f5764, %f1409, %f4199;
+ mov.u32 %r8318, 0;
+ bra.uni $L__BB0_1298;
+
+$L__BB0_1292:
+ mov.b32 %r1706, %f1409;
+ shr.u32 %r6095, %r1706, 23;
+ and.b32 %r6096, %r6095, 255;
+ add.s32 %r1707, %r6096, -128;
+ shl.b32 %r6097, %r1706, 8;
+ or.b32 %r1708, %r6097, -2147483648;
+ shr.u32 %r1709, %r1707, 5;
+ mov.u64 %rd2679, 0;
+ mov.u32 %r8315, 0;
+ mov.u64 %rd1988, __cudart_i2opi_f;
+ mov.u64 %rd2680, %rd2679;
+
+$L__BB0_1293:
.pragma "nounroll";
- shl.b64 %rd745, %rd2512, 2;
- add.s64 %rd747, %rd746, %rd745;
- ld.global.nc.u32 %r3066, [%rd747];
- mad.wide.u32 %rd748, %r3066, %r179, %rd2513;
- shr.u64 %rd2513, %rd748, 32;
- add.s64 %rd749, %rd1, %rd745;
- st.local.u32 [%rd749], %rd748;
- add.s32 %r8254, %r8254, 1;
- cvt.s64.s32 %rd2512, %r8254;
- setp.ne.s32 %p154, %r8254, 6;
- @%p154 bra $L__BB0_156;
-
- add.s32 %r8175, %r3064, -128;
- mov.b32 %r8174, %f5337;
- shr.u32 %r8173, %r8174, 23;
- and.b32 %r8172, %r8173, 255;
- add.s32 %r8171, %r8172, -128;
- shr.u32 %r8170, %r8171, 5;
- st.local.u32 [%rd5], %rd2513;
- mov.u32 %r3067, 4;
- sub.s32 %r183, %r3067, %r8170;
- mov.u32 %r3068, 6;
- sub.s32 %r3069, %r3068, %r8170;
- mul.wide.s32 %rd750, %r3069, 4;
- add.s64 %rd751, %rd1, %rd750;
- ld.local.u32 %r8255, [%rd751];
- ld.local.u32 %r8256, [%rd751+-4];
- and.b32 %r186, %r8171, 31;
- setp.eq.s32 %p155, %r186, 0;
- @%p155 bra $L__BB0_159;
-
- mov.u32 %r3070, 32;
- sub.s32 %r3071, %r3070, %r186;
- shr.u32 %r3072, %r8256, %r3071;
- shl.b32 %r3073, %r8255, %r186;
- add.s32 %r8255, %r3072, %r3073;
- mul.wide.s32 %rd752, %r183, 4;
- add.s64 %rd753, %rd1, %rd752;
- ld.local.u32 %r3074, [%rd753];
- shr.u32 %r3075, %r3074, %r3071;
- shl.b32 %r3076, %r8256, %r186;
- add.s32 %r8256, %r3075, %r3076;
-
-$L__BB0_159:
- mov.b32 %r8176, %f5337;
- and.b32 %r3077, %r8176, -2147483648;
- shr.u32 %r3078, %r8256, 30;
- shl.b32 %r3079, %r8255, 2;
- or.b32 %r3080, %r3078, %r3079;
- shr.u32 %r3081, %r3080, 31;
- shr.u32 %r3082, %r8255, 30;
- add.s32 %r3083, %r3081, %r3082;
- neg.s32 %r3084, %r3083;
- setp.eq.s32 %p156, %r3077, 0;
- selp.b32 %r8257, %r3083, %r3084, %p156;
- setp.ne.s32 %p157, %r3081, 0;
- xor.b32 %r3085, %r3077, -2147483648;
- selp.b32 %r3086, %r3085, %r3077, %p157;
- selp.b32 %r3087, -1, 0, %p157;
- xor.b32 %r3088, %r3080, %r3087;
- shl.b32 %r3089, %r8256, 2;
- xor.b32 %r3090, %r3089, %r3087;
- cvt.u64.u32 %rd754, %r3088;
- cvt.u64.u32 %rd755, %r3090;
- bfi.b64 %rd756, %rd754, %rd755, 32, 32;
- cvt.rn.f64.s64 %fd15, %rd756;
- mul.f64 %fd16, %fd15, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2419, %fd16;
- setp.eq.s32 %p158, %r3086, 0;
- neg.f32 %f2420, %f2419;
- selp.f32 %f5221, %f2419, %f2420, %p158;
-
-$L__BB0_161:
- add.s32 %r193, %r8257, 1;
- and.b32 %r194, %r193, 1;
- setp.eq.s32 %p159, %r194, 0;
- selp.f32 %f149, %f5221, 0f3F800000, %p159;
- mul.rn.f32 %f150, %f5221, %f5221;
- mov.f32 %f5222, 0fB94D4153;
- @%p159 bra $L__BB0_163;
-
- mov.f32 %f2423, 0fBAB607ED;
- mov.f32 %f2424, 0f37CBAC00;
- fma.rn.f32 %f5222, %f2424, %f150, %f2423;
-
-$L__BB0_163:
- selp.f32 %f2425, 0f3C0885E4, 0f3D2AAABB, %p159;
- fma.rn.f32 %f2426, %f5222, %f150, %f2425;
- selp.f32 %f2427, 0fBE2AAAA8, 0fBEFFFFFF, %p159;
- fma.rn.f32 %f2428, %f2426, %f150, %f2427;
- mov.f32 %f2429, 0f00000000;
- fma.rn.f32 %f2430, %f150, %f149, %f2429;
- fma.rn.f32 %f5215, %f2428, %f2430, %f149;
- and.b32 %r3092, %r193, 2;
- setp.eq.s32 %p161, %r3092, 0;
- @%p161 bra $L__BB0_165;
-
- mov.f32 %f2432, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f2432, %f2429;
-
-$L__BB0_165:
- shl.b32 %r8182, %r12, 5;
- mov.u32 %r8181, -32;
- sub.s32 %r8180, %r8181, %r8182;
- setp.lt.s32 %p1816, %r14, %r8180;
- shl.b32 %r8179, %r12, 5;
- mov.u32 %r8178, -32;
- sub.s32 %r8177, %r8178, %r8179;
- setp.ge.s32 %p1815, %r14, %r8177;
- selp.f32 %f157, %f5215, %f5216, %p1816;
- selp.f32 %f158, %f5213, %f5214, %p1816;
- @%p1815 bra $L__BB0_167;
-
- add.f32 %f5329, %f158, %f157;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_167:
- add.s32 %r8049, %r13, -15;
- mov.u32 %r3093, -2;
- sub.s32 %r3094, %r3093, %r12;
- setp.ge.s32 %p164, %r8049, %r3094;
- @%p164 bra $L__BB0_196;
-
- shl.b32 %r3096, %r12, 5;
- neg.s32 %r195, %r3096;
- setp.ge.s32 %p165, %r14, %r195;
- @%p165 bra $L__BB0_181;
-
- mul.f32 %f2435, %f5344, 0f3F22F983;
- cvt.rni.s32.f32 %r8261, %f2435;
- cvt.rn.f32.s32 %f2436, %r8261;
- mov.f32 %f2437, 0fBFC90FDA;
- fma.rn.f32 %f2438, %f2436, %f2437, %f5344;
- mov.f32 %f2439, 0fB3A22168;
- fma.rn.f32 %f2440, %f2436, %f2439, %f2438;
- mov.f32 %f2441, 0fA7C234C5;
- fma.rn.f32 %f5230, %f2436, %f2441, %f2440;
- abs.f32 %f166, %f5344;
- setp.ltu.f32 %p166, %f166, 0f47CE4780;
- @%p166 bra $L__BB0_177;
-
- setp.eq.f32 %p167, %f166, 0f7F800000;
- @%p167 bra $L__BB0_176;
- bra.uni $L__BB0_171;
-
-$L__BB0_176:
- mov.f32 %f2444, 0f00000000;
- mul.rn.f32 %f5230, %f5344, %f2444;
- mov.u32 %r8261, 0;
- bra.uni $L__BB0_177;
-
-$L__BB0_171:
- mov.b32 %r197, %f5344;
- shr.u32 %r3098, %r197, 23;
- and.b32 %r3099, %r3098, 255;
- shl.b32 %r3100, %r197, 8;
- or.b32 %r199, %r3100, -2147483648;
- mov.u64 %rd2514, 0;
- mov.u32 %r8258, 0;
- mov.u64 %rd760, __cudart_i2opi_f;
- mov.u64 %rd2515, %rd2514;
-
-$L__BB0_172:
+ shl.b64 %rd1987, %rd2679, 2;
+ add.s64 %rd1989, %rd1988, %rd1987;
+ ld.global.nc.u32 %r6098, [%rd1989];
+ mad.wide.u32 %rd1990, %r6098, %r1708, %rd2680;
+ shr.u64 %rd2680, %rd1990, 32;
+ add.s64 %rd1991, %rd1, %rd1987;
+ st.local.u32 [%rd1991], %rd1990;
+ add.s32 %r8315, %r8315, 1;
+ cvt.s64.s32 %rd2679, %r8315;
+ setp.ne.s32 %p1103, %r8315, 6;
+ @%p1103 bra $L__BB0_1293;
+
+ st.local.u32 [%rd4], %rd2680;
+ mov.u32 %r6099, 4;
+ sub.s32 %r1712, %r6099, %r1709;
+ mov.u32 %r6100, 6;
+ sub.s32 %r6101, %r6100, %r1709;
+ mul.wide.s32 %rd1992, %r6101, 4;
+ add.s64 %rd1993, %rd1, %rd1992;
+ ld.local.u32 %r8316, [%rd1993];
+ ld.local.u32 %r8317, [%rd1993+-4];
+ and.b32 %r1715, %r1707, 31;
+ setp.eq.s32 %p1104, %r1715, 0;
+ @%p1104 bra $L__BB0_1296;
+
+ mov.u32 %r6102, 32;
+ sub.s32 %r6103, %r6102, %r1715;
+ shr.u32 %r6104, %r8317, %r6103;
+ shl.b32 %r6105, %r8316, %r1715;
+ add.s32 %r8316, %r6104, %r6105;
+ mul.wide.s32 %rd1994, %r1712, 4;
+ add.s64 %rd1995, %rd1, %rd1994;
+ ld.local.u32 %r6106, [%rd1995];
+ shr.u32 %r6107, %r6106, %r6103;
+ shl.b32 %r6108, %r8317, %r1715;
+ add.s32 %r8317, %r6107, %r6108;
+
+$L__BB0_1296:
+ and.b32 %r6109, %r1706, -2147483648;
+ shr.u32 %r6110, %r8317, 30;
+ shl.b32 %r6111, %r8316, 2;
+ or.b32 %r6112, %r6110, %r6111;
+ shr.u32 %r6113, %r6112, 31;
+ shr.u32 %r6114, %r8316, 30;
+ add.s32 %r6115, %r6113, %r6114;
+ neg.s32 %r6116, %r6115;
+ setp.eq.s32 %p1105, %r6109, 0;
+ selp.b32 %r8318, %r6115, %r6116, %p1105;
+ setp.ne.s32 %p1106, %r6113, 0;
+ xor.b32 %r6117, %r6109, -2147483648;
+ selp.b32 %r6118, %r6117, %r6109, %p1106;
+ selp.b32 %r6119, -1, 0, %p1106;
+ xor.b32 %r6120, %r6112, %r6119;
+ shl.b32 %r6121, %r8317, 2;
+ xor.b32 %r6122, %r6121, %r6119;
+ cvt.u64.u32 %rd1996, %r6120;
+ cvt.u64.u32 %rd1997, %r6122;
+ bfi.b64 %rd1998, %rd1996, %rd1997, 32, 32;
+ cvt.rn.f64.s64 %fd175, %rd1998;
+ mul.f64 %fd176, %fd175, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4197, %fd176;
+ setp.eq.s32 %p1107, %r6118, 0;
+ neg.f32 %f4198, %f4197;
+ selp.f32 %f5764, %f4197, %f4198, %p1107;
+
+$L__BB0_1298:
+ add.s32 %r1722, %r8318, 1;
+ and.b32 %r1723, %r1722, 1;
+ setp.eq.s32 %p1108, %r1723, 0;
+ selp.f32 %f1514, %f5764, 0f3F800000, %p1108;
+ mul.rn.f32 %f1515, %f5764, %f5764;
+ mov.f32 %f5765, 0fB94D4153;
+ @%p1108 bra $L__BB0_1300;
+
+ mov.f32 %f4201, 0fBAB607ED;
+ mov.f32 %f4202, 0f37CBAC00;
+ fma.rn.f32 %f5765, %f4202, %f1515, %f4201;
+
+$L__BB0_1300:
+ selp.f32 %f4203, 0f3C0885E4, 0f3D2AAABB, %p1108;
+ fma.rn.f32 %f4204, %f5765, %f1515, %f4203;
+ selp.f32 %f4205, 0fBE2AAAA8, 0fBEFFFFFF, %p1108;
+ fma.rn.f32 %f4206, %f4204, %f1515, %f4205;
+ mov.f32 %f4207, 0f00000000;
+ fma.rn.f32 %f4208, %f1515, %f1514, %f4207;
+ fma.rn.f32 %f5766, %f4206, %f4208, %f1514;
+ and.b32 %r6124, %r1722, 2;
+ setp.eq.s32 %p1110, %r6124, 0;
+ @%p1110 bra $L__BB0_1302;
+
+ mov.f32 %f4210, 0fBF800000;
+ fma.rn.f32 %f5766, %f5766, %f4210, %f4207;
+
+$L__BB0_1302:
+ add.f32 %f5795, %f5763, %f5766;
+ mul.f32 %f4211, %f1418, 0f3F22F983;
+ cvt.rni.s32.f32 %r8322, %f4211;
+ cvt.rn.f32.s32 %f4212, %r8322;
+ mov.f32 %f4213, 0fBFC90FDA;
+ fma.rn.f32 %f4214, %f4212, %f4213, %f1418;
+ mov.f32 %f4215, 0fB3A22168;
+ fma.rn.f32 %f4216, %f4212, %f4215, %f4214;
+ mov.f32 %f4217, 0fA7C234C5;
+ fma.rn.f32 %f5767, %f4212, %f4217, %f4216;
+ abs.f32 %f1523, %f1418;
+ setp.ltu.f32 %p1111, %f1523, 0f47CE4780;
+ @%p1111 bra $L__BB0_1310;
+
+ setp.eq.f32 %p1112, %f1523, 0f7F800000;
+ @%p1112 bra $L__BB0_1309;
+ bra.uni $L__BB0_1304;
+
+$L__BB0_1309:
+ mov.f32 %f4220, 0f00000000;
+ mul.rn.f32 %f5767, %f1418, %f4220;
+ mov.u32 %r8322, 0;
+ bra.uni $L__BB0_1310;
+
+$L__BB0_1304:
+ mov.b32 %r1725, %f1418;
+ shr.u32 %r6126, %r1725, 23;
+ and.b32 %r6127, %r6126, 255;
+ add.s32 %r1726, %r6127, -128;
+ shl.b32 %r6128, %r1725, 8;
+ or.b32 %r1727, %r6128, -2147483648;
+ shr.u32 %r1728, %r1726, 5;
+ mov.u64 %rd2681, 0;
+ mov.u32 %r8319, 0;
+ mov.u64 %rd2002, __cudart_i2opi_f;
+ mov.u64 %rd2682, %rd2681;
+
+$L__BB0_1305:
.pragma "nounroll";
- shl.b64 %rd759, %rd2514, 2;
- add.s64 %rd761, %rd760, %rd759;
- ld.global.nc.u32 %r3101, [%rd761];
- mad.wide.u32 %rd762, %r3101, %r199, %rd2515;
- shr.u64 %rd2515, %rd762, 32;
- add.s64 %rd763, %rd1, %rd759;
- st.local.u32 [%rd763], %rd762;
- add.s32 %r8258, %r8258, 1;
- cvt.s64.s32 %rd2514, %r8258;
- setp.ne.s32 %p168, %r8258, 6;
- @%p168 bra $L__BB0_172;
-
- add.s32 %r8207, %r3099, -128;
- mov.b32 %r8206, %f5344;
- shr.u32 %r8205, %r8206, 23;
- and.b32 %r8204, %r8205, 255;
- add.s32 %r8203, %r8204, -128;
- shr.u32 %r8202, %r8203, 5;
- st.local.u32 [%rd5], %rd2515;
- mov.u32 %r3102, 4;
- sub.s32 %r203, %r3102, %r8202;
- mov.u32 %r3103, 6;
- sub.s32 %r3104, %r3103, %r8202;
- mul.wide.s32 %rd764, %r3104, 4;
- add.s64 %rd765, %rd1, %rd764;
- ld.local.u32 %r8259, [%rd765];
- ld.local.u32 %r8260, [%rd765+-4];
- and.b32 %r206, %r8203, 31;
- setp.eq.s32 %p169, %r206, 0;
- @%p169 bra $L__BB0_175;
-
- mov.u32 %r3105, 32;
- sub.s32 %r3106, %r3105, %r206;
- shr.u32 %r3107, %r8260, %r3106;
- shl.b32 %r3108, %r8259, %r206;
- add.s32 %r8259, %r3107, %r3108;
- mul.wide.s32 %rd766, %r203, 4;
- add.s64 %rd767, %rd1, %rd766;
- ld.local.u32 %r3109, [%rd767];
- shr.u32 %r3110, %r3109, %r3106;
- shl.b32 %r3111, %r8260, %r206;
- add.s32 %r8260, %r3110, %r3111;
-
-$L__BB0_175:
- mov.b32 %r8208, %f5344;
- and.b32 %r3112, %r8208, -2147483648;
- shr.u32 %r3113, %r8260, 30;
- shl.b32 %r3114, %r8259, 2;
- or.b32 %r3115, %r3113, %r3114;
- shr.u32 %r3116, %r3115, 31;
- shr.u32 %r3117, %r8259, 30;
- add.s32 %r3118, %r3116, %r3117;
- neg.s32 %r3119, %r3118;
- setp.eq.s32 %p170, %r3112, 0;
- selp.b32 %r8261, %r3118, %r3119, %p170;
- setp.ne.s32 %p171, %r3116, 0;
- xor.b32 %r3120, %r3112, -2147483648;
- selp.b32 %r3121, %r3120, %r3112, %p171;
- selp.b32 %r3122, -1, 0, %p171;
- xor.b32 %r3123, %r3115, %r3122;
- shl.b32 %r3124, %r8260, 2;
- xor.b32 %r3125, %r3124, %r3122;
- cvt.u64.u32 %rd768, %r3123;
- cvt.u64.u32 %rd769, %r3125;
- bfi.b64 %rd770, %rd768, %rd769, 32, 32;
- cvt.rn.f64.s64 %fd17, %rd770;
- mul.f64 %fd18, %fd17, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2442, %fd18;
- setp.eq.s32 %p172, %r3121, 0;
- neg.f32 %f2443, %f2442;
- selp.f32 %f5230, %f2442, %f2443, %p172;
-
-$L__BB0_177:
- and.b32 %r213, %r8261, 1;
- setp.eq.s32 %p173, %r213, 0;
- selp.f32 %f170, %f5230, 0f3F800000, %p173;
- mul.rn.f32 %f171, %f5230, %f5230;
- mov.f32 %f5231, 0fB94D4153;
- @%p173 bra $L__BB0_179;
-
- mov.f32 %f2446, 0fBAB607ED;
- mov.f32 %f2447, 0f37CBAC00;
- fma.rn.f32 %f5231, %f2447, %f171, %f2446;
-
-$L__BB0_179:
- selp.f32 %f2448, 0f3C0885E4, 0f3D2AAABB, %p173;
- fma.rn.f32 %f2449, %f5231, %f171, %f2448;
- selp.f32 %f2450, 0fBE2AAAA8, 0fBEFFFFFF, %p173;
- fma.rn.f32 %f2451, %f2449, %f171, %f2450;
- mov.f32 %f2452, 0f00000000;
- fma.rn.f32 %f2453, %f171, %f170, %f2452;
- fma.rn.f32 %f5213, %f2451, %f2453, %f170;
- and.b32 %r3127, %r8261, 2;
- setp.eq.s32 %p175, %r3127, 0;
- @%p175 bra $L__BB0_181;
-
- mov.f32 %f2455, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f2455, %f2452;
-
-$L__BB0_181:
- shl.b32 %r8210, %r12, 5;
- neg.s32 %r8209, %r8210;
- setp.lt.s32 %p4, %r14, %r8209;
- @%p165 bra $L__BB0_194;
-
- mul.f32 %f2456, %f5336, 0f3F22F983;
- cvt.rni.s32.f32 %r8265, %f2456;
- cvt.rn.f32.s32 %f2457, %r8265;
- mov.f32 %f2458, 0fBFC90FDA;
- fma.rn.f32 %f2459, %f2457, %f2458, %f5336;
- mov.f32 %f2460, 0fB3A22168;
- fma.rn.f32 %f2461, %f2457, %f2460, %f2459;
- mov.f32 %f2462, 0fA7C234C5;
- fma.rn.f32 %f5234, %f2457, %f2462, %f2461;
- abs.f32 %f179, %f5336;
- setp.ltu.f32 %p177, %f179, 0f47CE4780;
- @%p177 bra $L__BB0_190;
-
- setp.eq.f32 %p178, %f179, 0f7F800000;
- @%p178 bra $L__BB0_189;
- bra.uni $L__BB0_184;
-
-$L__BB0_189:
- mov.f32 %f2465, 0f00000000;
- mul.rn.f32 %f5234, %f5336, %f2465;
- mov.u32 %r8265, 0;
- bra.uni $L__BB0_190;
-
-$L__BB0_184:
- mov.b32 %r215, %f5336;
- shr.u32 %r3129, %r215, 23;
- and.b32 %r3130, %r3129, 255;
- add.s32 %r216, %r3130, -128;
- shl.b32 %r3131, %r215, 8;
- or.b32 %r217, %r3131, -2147483648;
- shr.u32 %r218, %r216, 5;
- mov.u64 %rd2516, 0;
- mov.u32 %r8262, 0;
- mov.u64 %rd774, __cudart_i2opi_f;
- mov.u64 %rd2517, %rd2516;
-
-$L__BB0_185:
+ shl.b64 %rd2001, %rd2681, 2;
+ add.s64 %rd2003, %rd2002, %rd2001;
+ ld.global.nc.u32 %r6129, [%rd2003];
+ mad.wide.u32 %rd2004, %r6129, %r1727, %rd2682;
+ shr.u64 %rd2682, %rd2004, 32;
+ add.s64 %rd2005, %rd1, %rd2001;
+ st.local.u32 [%rd2005], %rd2004;
+ add.s32 %r8319, %r8319, 1;
+ cvt.s64.s32 %rd2681, %r8319;
+ setp.ne.s32 %p1113, %r8319, 6;
+ @%p1113 bra $L__BB0_1305;
+
+ st.local.u32 [%rd4], %rd2682;
+ mov.u32 %r6130, 4;
+ sub.s32 %r1731, %r6130, %r1728;
+ mov.u32 %r6131, 6;
+ sub.s32 %r6132, %r6131, %r1728;
+ mul.wide.s32 %rd2006, %r6132, 4;
+ add.s64 %rd2007, %rd1, %rd2006;
+ ld.local.u32 %r8320, [%rd2007];
+ ld.local.u32 %r8321, [%rd2007+-4];
+ and.b32 %r1734, %r1726, 31;
+ setp.eq.s32 %p1114, %r1734, 0;
+ @%p1114 bra $L__BB0_1308;
+
+ mov.u32 %r6133, 32;
+ sub.s32 %r6134, %r6133, %r1734;
+ shr.u32 %r6135, %r8321, %r6134;
+ shl.b32 %r6136, %r8320, %r1734;
+ add.s32 %r8320, %r6135, %r6136;
+ mul.wide.s32 %rd2008, %r1731, 4;
+ add.s64 %rd2009, %rd1, %rd2008;
+ ld.local.u32 %r6137, [%rd2009];
+ shr.u32 %r6138, %r6137, %r6134;
+ shl.b32 %r6139, %r8321, %r1734;
+ add.s32 %r8321, %r6138, %r6139;
+
+$L__BB0_1308:
+ and.b32 %r6140, %r1725, -2147483648;
+ shr.u32 %r6141, %r8321, 30;
+ shl.b32 %r6142, %r8320, 2;
+ or.b32 %r6143, %r6141, %r6142;
+ shr.u32 %r6144, %r6143, 31;
+ shr.u32 %r6145, %r8320, 30;
+ add.s32 %r6146, %r6144, %r6145;
+ neg.s32 %r6147, %r6146;
+ setp.eq.s32 %p1115, %r6140, 0;
+ selp.b32 %r8322, %r6146, %r6147, %p1115;
+ setp.ne.s32 %p1116, %r6144, 0;
+ xor.b32 %r6148, %r6140, -2147483648;
+ selp.b32 %r6149, %r6148, %r6140, %p1116;
+ selp.b32 %r6150, -1, 0, %p1116;
+ xor.b32 %r6151, %r6143, %r6150;
+ shl.b32 %r6152, %r8321, 2;
+ xor.b32 %r6153, %r6152, %r6150;
+ cvt.u64.u32 %rd2010, %r6151;
+ cvt.u64.u32 %rd2011, %r6153;
+ bfi.b64 %rd2012, %rd2010, %rd2011, 32, 32;
+ cvt.rn.f64.s64 %fd177, %rd2012;
+ mul.f64 %fd178, %fd177, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4218, %fd178;
+ setp.eq.s32 %p1117, %r6149, 0;
+ neg.f32 %f4219, %f4218;
+ selp.f32 %f5767, %f4218, %f4219, %p1117;
+
+$L__BB0_1310:
+ and.b32 %r1741, %r8322, 1;
+ setp.eq.s32 %p1118, %r1741, 0;
+ selp.f32 %f1527, %f5767, 0f3F800000, %p1118;
+ mul.rn.f32 %f1528, %f5767, %f5767;
+ mov.f32 %f5768, 0fB94D4153;
+ @%p1118 bra $L__BB0_1312;
+
+ mov.f32 %f4222, 0fBAB607ED;
+ mov.f32 %f4223, 0f37CBAC00;
+ fma.rn.f32 %f5768, %f4223, %f1528, %f4222;
+
+$L__BB0_1312:
+ selp.f32 %f4224, 0f3C0885E4, 0f3D2AAABB, %p1118;
+ fma.rn.f32 %f4225, %f5768, %f1528, %f4224;
+ selp.f32 %f4226, 0fBE2AAAA8, 0fBEFFFFFF, %p1118;
+ fma.rn.f32 %f4227, %f4225, %f1528, %f4226;
+ mov.f32 %f4228, 0f00000000;
+ fma.rn.f32 %f4229, %f1528, %f1527, %f4228;
+ fma.rn.f32 %f5769, %f4227, %f4229, %f1527;
+ and.b32 %r6155, %r8322, 2;
+ setp.eq.s32 %p1120, %r6155, 0;
+ @%p1120 bra $L__BB0_1314;
+
+ mov.f32 %f4231, 0fBF800000;
+ fma.rn.f32 %f5769, %f5769, %f4231, %f4228;
+
+$L__BB0_1314:
+ mul.f32 %f4232, %f1410, 0f3F22F983;
+ cvt.rni.s32.f32 %r8326, %f4232;
+ cvt.rn.f32.s32 %f4233, %r8326;
+ mov.f32 %f4234, 0fBFC90FDA;
+ fma.rn.f32 %f4235, %f4233, %f4234, %f1410;
+ mov.f32 %f4236, 0fB3A22168;
+ fma.rn.f32 %f4237, %f4233, %f4236, %f4235;
+ mov.f32 %f4238, 0fA7C234C5;
+ fma.rn.f32 %f5770, %f4233, %f4238, %f4237;
+ abs.f32 %f1535, %f1410;
+ setp.ltu.f32 %p1121, %f1535, 0f47CE4780;
+ @%p1121 bra $L__BB0_1322;
+
+ setp.eq.f32 %p1122, %f1535, 0f7F800000;
+ @%p1122 bra $L__BB0_1321;
+ bra.uni $L__BB0_1316;
+
+$L__BB0_1321:
+ mov.f32 %f4241, 0f00000000;
+ mul.rn.f32 %f5770, %f1410, %f4241;
+ mov.u32 %r8326, 0;
+ bra.uni $L__BB0_1322;
+
+$L__BB0_1316:
+ mov.b32 %r1743, %f1410;
+ shr.u32 %r6157, %r1743, 23;
+ and.b32 %r6158, %r6157, 255;
+ add.s32 %r1744, %r6158, -128;
+ shl.b32 %r6159, %r1743, 8;
+ or.b32 %r1745, %r6159, -2147483648;
+ shr.u32 %r1746, %r1744, 5;
+ mov.u64 %rd2683, 0;
+ mov.u32 %r8323, 0;
+ mov.u64 %rd2016, __cudart_i2opi_f;
+ mov.u64 %rd2684, %rd2683;
+
+$L__BB0_1317:
.pragma "nounroll";
- shl.b64 %rd773, %rd2516, 2;
- add.s64 %rd775, %rd774, %rd773;
- ld.global.nc.u32 %r3132, [%rd775];
- mad.wide.u32 %rd776, %r3132, %r217, %rd2517;
- shr.u64 %rd2517, %rd776, 32;
- add.s64 %rd777, %rd1, %rd773;
- st.local.u32 [%rd777], %rd776;
- add.s32 %r8262, %r8262, 1;
- cvt.s64.s32 %rd2516, %r8262;
- setp.ne.s32 %p179, %r8262, 6;
- @%p179 bra $L__BB0_185;
-
- st.local.u32 [%rd5], %rd2517;
- mov.u32 %r3133, 4;
- sub.s32 %r221, %r3133, %r218;
- mov.u32 %r3134, 6;
- sub.s32 %r3135, %r3134, %r218;
- mul.wide.s32 %rd778, %r3135, 4;
- add.s64 %rd779, %rd1, %rd778;
- ld.local.u32 %r8263, [%rd779];
- ld.local.u32 %r8264, [%rd779+-4];
- and.b32 %r224, %r216, 31;
- setp.eq.s32 %p180, %r224, 0;
- @%p180 bra $L__BB0_188;
-
- mov.u32 %r3136, 32;
- sub.s32 %r3137, %r3136, %r224;
- shr.u32 %r3138, %r8264, %r3137;
- shl.b32 %r3139, %r8263, %r224;
- add.s32 %r8263, %r3138, %r3139;
- mul.wide.s32 %rd780, %r221, 4;
- add.s64 %rd781, %rd1, %rd780;
- ld.local.u32 %r3140, [%rd781];
- shr.u32 %r3141, %r3140, %r3137;
- shl.b32 %r3142, %r8264, %r224;
- add.s32 %r8264, %r3141, %r3142;
-
-$L__BB0_188:
- and.b32 %r3143, %r215, -2147483648;
- shr.u32 %r3144, %r8264, 30;
- shl.b32 %r3145, %r8263, 2;
- or.b32 %r3146, %r3144, %r3145;
- shr.u32 %r3147, %r3146, 31;
- shr.u32 %r3148, %r8263, 30;
- add.s32 %r3149, %r3147, %r3148;
- neg.s32 %r3150, %r3149;
- setp.eq.s32 %p181, %r3143, 0;
- selp.b32 %r8265, %r3149, %r3150, %p181;
- setp.ne.s32 %p182, %r3147, 0;
- xor.b32 %r3151, %r3143, -2147483648;
- selp.b32 %r3152, %r3151, %r3143, %p182;
- selp.b32 %r3153, -1, 0, %p182;
- xor.b32 %r3154, %r3146, %r3153;
- shl.b32 %r3155, %r8264, 2;
- xor.b32 %r3156, %r3155, %r3153;
- cvt.u64.u32 %rd782, %r3154;
- cvt.u64.u32 %rd783, %r3156;
- bfi.b64 %rd784, %rd782, %rd783, 32, 32;
- cvt.rn.f64.s64 %fd19, %rd784;
- mul.f64 %fd20, %fd19, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2463, %fd20;
- setp.eq.s32 %p183, %r3152, 0;
- neg.f32 %f2464, %f2463;
- selp.f32 %f5234, %f2463, %f2464, %p183;
-
-$L__BB0_190:
- add.s32 %r231, %r8265, 1;
- and.b32 %r232, %r231, 1;
- setp.eq.s32 %p184, %r232, 0;
- selp.f32 %f183, %f5234, 0f3F800000, %p184;
- mul.rn.f32 %f184, %f5234, %f5234;
- mov.f32 %f5235, 0fB94D4153;
- @%p184 bra $L__BB0_192;
-
- mov.f32 %f2467, 0fBAB607ED;
- mov.f32 %f2468, 0f37CBAC00;
- fma.rn.f32 %f5235, %f2468, %f184, %f2467;
-
-$L__BB0_192:
- selp.f32 %f2469, 0f3C0885E4, 0f3D2AAABB, %p184;
- fma.rn.f32 %f2470, %f5235, %f184, %f2469;
- selp.f32 %f2471, 0fBE2AAAA8, 0fBEFFFFFF, %p184;
- fma.rn.f32 %f2472, %f2470, %f184, %f2471;
- mov.f32 %f2473, 0f00000000;
- fma.rn.f32 %f2474, %f184, %f183, %f2473;
- fma.rn.f32 %f5215, %f2472, %f2474, %f183;
- and.b32 %r3158, %r231, 2;
- setp.eq.s32 %p186, %r3158, 0;
- @%p186 bra $L__BB0_194;
-
- mov.f32 %f2476, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f2476, %f2473;
-
-$L__BB0_194:
- selp.f32 %f191, %f5215, %f5216, %p4;
- selp.f32 %f192, %f5213, %f5214, %p4;
- @%p165 bra $L__BB0_196;
-
- add.f32 %f5328, %f192, %f191;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_196:
- mov.u32 %r8213, -2;
- sub.s32 %r8212, %r8213, %r12;
- add.s32 %r8211, %r13, -15;
- setp.ge.s32 %p1824, %r8211, %r8212;
- @%p1824 bra $L__BB0_225;
-
- shl.b32 %r3162, %r12, 5;
- mov.u32 %r3163, -32;
- sub.s32 %r233, %r3163, %r3162;
- setp.ge.s32 %p190, %r14, %r233;
- @%p190 bra $L__BB0_210;
-
- mul.f32 %f2479, %f5343, 0f3F22F983;
- cvt.rni.s32.f32 %r8269, %f2479;
- cvt.rn.f32.s32 %f2480, %r8269;
- mov.f32 %f2481, 0fBFC90FDA;
- fma.rn.f32 %f2482, %f2480, %f2481, %f5343;
- mov.f32 %f2483, 0fB3A22168;
- fma.rn.f32 %f2484, %f2480, %f2483, %f2482;
- mov.f32 %f2485, 0fA7C234C5;
- fma.rn.f32 %f5243, %f2480, %f2485, %f2484;
- abs.f32 %f200, %f5343;
- setp.ltu.f32 %p191, %f200, 0f47CE4780;
- @%p191 bra $L__BB0_206;
-
- setp.eq.f32 %p192, %f200, 0f7F800000;
- @%p192 bra $L__BB0_205;
- bra.uni $L__BB0_200;
-
-$L__BB0_205:
- mov.f32 %f2488, 0f00000000;
- mul.rn.f32 %f5243, %f5343, %f2488;
- mov.u32 %r8269, 0;
- bra.uni $L__BB0_206;
-
-$L__BB0_200:
- mov.b32 %r235, %f5343;
- shr.u32 %r3165, %r235, 23;
- and.b32 %r3166, %r3165, 255;
- add.s32 %r236, %r3166, -128;
- shl.b32 %r3167, %r235, 8;
- or.b32 %r237, %r3167, -2147483648;
- shr.u32 %r238, %r236, 5;
- mov.u64 %rd2518, 0;
- mov.u32 %r8266, 0;
- mov.u64 %rd788, __cudart_i2opi_f;
- mov.u64 %rd2519, %rd2518;
-
-$L__BB0_201:
+ shl.b64 %rd2015, %rd2683, 2;
+ add.s64 %rd2017, %rd2016, %rd2015;
+ ld.global.nc.u32 %r6160, [%rd2017];
+ mad.wide.u32 %rd2018, %r6160, %r1745, %rd2684;
+ shr.u64 %rd2684, %rd2018, 32;
+ add.s64 %rd2019, %rd1, %rd2015;
+ st.local.u32 [%rd2019], %rd2018;
+ add.s32 %r8323, %r8323, 1;
+ cvt.s64.s32 %rd2683, %r8323;
+ setp.ne.s32 %p1123, %r8323, 6;
+ @%p1123 bra $L__BB0_1317;
+
+ st.local.u32 [%rd4], %rd2684;
+ mov.u32 %r6161, 4;
+ sub.s32 %r1749, %r6161, %r1746;
+ mov.u32 %r6162, 6;
+ sub.s32 %r6163, %r6162, %r1746;
+ mul.wide.s32 %rd2020, %r6163, 4;
+ add.s64 %rd2021, %rd1, %rd2020;
+ ld.local.u32 %r8324, [%rd2021];
+ ld.local.u32 %r8325, [%rd2021+-4];
+ and.b32 %r1752, %r1744, 31;
+ setp.eq.s32 %p1124, %r1752, 0;
+ @%p1124 bra $L__BB0_1320;
+
+ mov.u32 %r6164, 32;
+ sub.s32 %r6165, %r6164, %r1752;
+ shr.u32 %r6166, %r8325, %r6165;
+ shl.b32 %r6167, %r8324, %r1752;
+ add.s32 %r8324, %r6166, %r6167;
+ mul.wide.s32 %rd2022, %r1749, 4;
+ add.s64 %rd2023, %rd1, %rd2022;
+ ld.local.u32 %r6168, [%rd2023];
+ shr.u32 %r6169, %r6168, %r6165;
+ shl.b32 %r6170, %r8325, %r1752;
+ add.s32 %r8325, %r6169, %r6170;
+
+$L__BB0_1320:
+ and.b32 %r6171, %r1743, -2147483648;
+ shr.u32 %r6172, %r8325, 30;
+ shl.b32 %r6173, %r8324, 2;
+ or.b32 %r6174, %r6172, %r6173;
+ shr.u32 %r6175, %r6174, 31;
+ shr.u32 %r6176, %r8324, 30;
+ add.s32 %r6177, %r6175, %r6176;
+ neg.s32 %r6178, %r6177;
+ setp.eq.s32 %p1125, %r6171, 0;
+ selp.b32 %r8326, %r6177, %r6178, %p1125;
+ setp.ne.s32 %p1126, %r6175, 0;
+ xor.b32 %r6179, %r6171, -2147483648;
+ selp.b32 %r6180, %r6179, %r6171, %p1126;
+ selp.b32 %r6181, -1, 0, %p1126;
+ xor.b32 %r6182, %r6174, %r6181;
+ shl.b32 %r6183, %r8325, 2;
+ xor.b32 %r6184, %r6183, %r6181;
+ cvt.u64.u32 %rd2024, %r6182;
+ cvt.u64.u32 %rd2025, %r6184;
+ bfi.b64 %rd2026, %rd2024, %rd2025, 32, 32;
+ cvt.rn.f64.s64 %fd179, %rd2026;
+ mul.f64 %fd180, %fd179, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4239, %fd180;
+ setp.eq.s32 %p1127, %r6180, 0;
+ neg.f32 %f4240, %f4239;
+ selp.f32 %f5770, %f4239, %f4240, %p1127;
+
+$L__BB0_1322:
+ add.s32 %r1759, %r8326, 1;
+ and.b32 %r1760, %r1759, 1;
+ setp.eq.s32 %p1128, %r1760, 0;
+ selp.f32 %f1539, %f5770, 0f3F800000, %p1128;
+ mul.rn.f32 %f1540, %f5770, %f5770;
+ mov.f32 %f5771, 0fB94D4153;
+ @%p1128 bra $L__BB0_1324;
+
+ mov.f32 %f4243, 0fBAB607ED;
+ mov.f32 %f4244, 0f37CBAC00;
+ fma.rn.f32 %f5771, %f4244, %f1540, %f4243;
+
+$L__BB0_1324:
+ selp.f32 %f4245, 0f3C0885E4, 0f3D2AAABB, %p1128;
+ fma.rn.f32 %f4246, %f5771, %f1540, %f4245;
+ selp.f32 %f4247, 0fBE2AAAA8, 0fBEFFFFFF, %p1128;
+ fma.rn.f32 %f4248, %f4246, %f1540, %f4247;
+ mov.f32 %f4249, 0f00000000;
+ fma.rn.f32 %f4250, %f1540, %f1539, %f4249;
+ fma.rn.f32 %f5772, %f4248, %f4250, %f1539;
+ and.b32 %r6186, %r1759, 2;
+ setp.eq.s32 %p1130, %r6186, 0;
+ @%p1130 bra $L__BB0_1326;
+
+ mov.f32 %f4252, 0fBF800000;
+ fma.rn.f32 %f5772, %f5772, %f4252, %f4249;
+
+$L__BB0_1326:
+ add.f32 %f5794, %f5769, %f5772;
+ mul.f32 %f4253, %f1419, 0f3F22F983;
+ cvt.rni.s32.f32 %r8330, %f4253;
+ cvt.rn.f32.s32 %f4254, %r8330;
+ mov.f32 %f4255, 0fBFC90FDA;
+ fma.rn.f32 %f4256, %f4254, %f4255, %f1419;
+ mov.f32 %f4257, 0fB3A22168;
+ fma.rn.f32 %f4258, %f4254, %f4257, %f4256;
+ mov.f32 %f4259, 0fA7C234C5;
+ fma.rn.f32 %f5773, %f4254, %f4259, %f4258;
+ abs.f32 %f1548, %f1419;
+ setp.ltu.f32 %p1131, %f1548, 0f47CE4780;
+ @%p1131 bra $L__BB0_1334;
+
+ setp.eq.f32 %p1132, %f1548, 0f7F800000;
+ @%p1132 bra $L__BB0_1333;
+ bra.uni $L__BB0_1328;
+
+$L__BB0_1333:
+ mov.f32 %f4262, 0f00000000;
+ mul.rn.f32 %f5773, %f1419, %f4262;
+ mov.u32 %r8330, 0;
+ bra.uni $L__BB0_1334;
+
+$L__BB0_1328:
+ mov.b32 %r1762, %f1419;
+ shr.u32 %r6188, %r1762, 23;
+ and.b32 %r6189, %r6188, 255;
+ add.s32 %r1763, %r6189, -128;
+ shl.b32 %r6190, %r1762, 8;
+ or.b32 %r1764, %r6190, -2147483648;
+ shr.u32 %r1765, %r1763, 5;
+ mov.u64 %rd2685, 0;
+ mov.u32 %r8327, 0;
+ mov.u64 %rd2030, __cudart_i2opi_f;
+ mov.u64 %rd2686, %rd2685;
+
+$L__BB0_1329:
.pragma "nounroll";
- shl.b64 %rd787, %rd2518, 2;
- add.s64 %rd789, %rd788, %rd787;
- ld.global.nc.u32 %r3168, [%rd789];
- mad.wide.u32 %rd790, %r3168, %r237, %rd2519;
- shr.u64 %rd2519, %rd790, 32;
- add.s64 %rd791, %rd1, %rd787;
- st.local.u32 [%rd791], %rd790;
- add.s32 %r8266, %r8266, 1;
- cvt.s64.s32 %rd2518, %r8266;
- setp.ne.s32 %p193, %r8266, 6;
- @%p193 bra $L__BB0_201;
-
- st.local.u32 [%rd5], %rd2519;
- mov.u32 %r3169, 4;
- sub.s32 %r241, %r3169, %r238;
- mov.u32 %r3170, 6;
- sub.s32 %r3171, %r3170, %r238;
- mul.wide.s32 %rd792, %r3171, 4;
- add.s64 %rd793, %rd1, %rd792;
- ld.local.u32 %r8267, [%rd793];
- ld.local.u32 %r8268, [%rd793+-4];
- and.b32 %r244, %r236, 31;
- setp.eq.s32 %p194, %r244, 0;
- @%p194 bra $L__BB0_204;
-
- mov.u32 %r3172, 32;
- sub.s32 %r3173, %r3172, %r244;
- shr.u32 %r3174, %r8268, %r3173;
- shl.b32 %r3175, %r8267, %r244;
- add.s32 %r8267, %r3174, %r3175;
- mul.wide.s32 %rd794, %r241, 4;
- add.s64 %rd795, %rd1, %rd794;
- ld.local.u32 %r3176, [%rd795];
- shr.u32 %r3177, %r3176, %r3173;
- shl.b32 %r3178, %r8268, %r244;
- add.s32 %r8268, %r3177, %r3178;
-
-$L__BB0_204:
- and.b32 %r3179, %r235, -2147483648;
- shr.u32 %r3180, %r8268, 30;
- shl.b32 %r3181, %r8267, 2;
- or.b32 %r3182, %r3180, %r3181;
- shr.u32 %r3183, %r3182, 31;
- shr.u32 %r3184, %r8267, 30;
- add.s32 %r3185, %r3183, %r3184;
- neg.s32 %r3186, %r3185;
- setp.eq.s32 %p195, %r3179, 0;
- selp.b32 %r8269, %r3185, %r3186, %p195;
- setp.ne.s32 %p196, %r3183, 0;
- xor.b32 %r3187, %r3179, -2147483648;
- selp.b32 %r3188, %r3187, %r3179, %p196;
- selp.b32 %r3189, -1, 0, %p196;
- xor.b32 %r3190, %r3182, %r3189;
- shl.b32 %r3191, %r8268, 2;
- xor.b32 %r3192, %r3191, %r3189;
- cvt.u64.u32 %rd796, %r3190;
- cvt.u64.u32 %rd797, %r3192;
- bfi.b64 %rd798, %rd796, %rd797, 32, 32;
- cvt.rn.f64.s64 %fd21, %rd798;
- mul.f64 %fd22, %fd21, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2486, %fd22;
- setp.eq.s32 %p197, %r3188, 0;
- neg.f32 %f2487, %f2486;
- selp.f32 %f5243, %f2486, %f2487, %p197;
-
-$L__BB0_206:
- and.b32 %r251, %r8269, 1;
- setp.eq.s32 %p198, %r251, 0;
- selp.f32 %f204, %f5243, 0f3F800000, %p198;
- mul.rn.f32 %f205, %f5243, %f5243;
- mov.f32 %f5244, 0fB94D4153;
- @%p198 bra $L__BB0_208;
-
- mov.f32 %f2490, 0fBAB607ED;
- mov.f32 %f2491, 0f37CBAC00;
- fma.rn.f32 %f5244, %f2491, %f205, %f2490;
-
-$L__BB0_208:
- selp.f32 %f2492, 0f3C0885E4, 0f3D2AAABB, %p198;
- fma.rn.f32 %f2493, %f5244, %f205, %f2492;
- selp.f32 %f2494, 0fBE2AAAA8, 0fBEFFFFFF, %p198;
- fma.rn.f32 %f2495, %f2493, %f205, %f2494;
- mov.f32 %f2496, 0f00000000;
- fma.rn.f32 %f2497, %f205, %f204, %f2496;
- fma.rn.f32 %f5213, %f2495, %f2497, %f204;
- and.b32 %r3194, %r8269, 2;
- setp.eq.s32 %p200, %r3194, 0;
- @%p200 bra $L__BB0_210;
-
- mov.f32 %f2499, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f2499, %f2496;
-
-$L__BB0_210:
- setp.lt.s32 %p5, %r14, %r233;
- @%p190 bra $L__BB0_223;
-
- mul.f32 %f2500, %f5335, 0f3F22F983;
- cvt.rni.s32.f32 %r8273, %f2500;
- cvt.rn.f32.s32 %f2501, %r8273;
- mov.f32 %f2502, 0fBFC90FDA;
- fma.rn.f32 %f2503, %f2501, %f2502, %f5335;
- mov.f32 %f2504, 0fB3A22168;
- fma.rn.f32 %f2505, %f2501, %f2504, %f2503;
- mov.f32 %f2506, 0fA7C234C5;
- fma.rn.f32 %f5247, %f2501, %f2506, %f2505;
- abs.f32 %f213, %f5335;
- setp.ltu.f32 %p202, %f213, 0f47CE4780;
- @%p202 bra $L__BB0_219;
-
- setp.eq.f32 %p203, %f213, 0f7F800000;
- @%p203 bra $L__BB0_218;
- bra.uni $L__BB0_213;
-
-$L__BB0_218:
- mov.f32 %f2509, 0f00000000;
- mul.rn.f32 %f5247, %f5335, %f2509;
- mov.u32 %r8273, 0;
- bra.uni $L__BB0_219;
-
-$L__BB0_213:
- mov.b32 %r253, %f5335;
- shr.u32 %r3196, %r253, 23;
- and.b32 %r3197, %r3196, 255;
- add.s32 %r254, %r3197, -128;
- shl.b32 %r3198, %r253, 8;
- or.b32 %r255, %r3198, -2147483648;
- shr.u32 %r256, %r254, 5;
- mov.u64 %rd2520, 0;
- mov.u32 %r8270, 0;
- mov.u64 %rd802, __cudart_i2opi_f;
- mov.u64 %rd2521, %rd2520;
-
-$L__BB0_214:
+ shl.b64 %rd2029, %rd2685, 2;
+ add.s64 %rd2031, %rd2030, %rd2029;
+ ld.global.nc.u32 %r6191, [%rd2031];
+ mad.wide.u32 %rd2032, %r6191, %r1764, %rd2686;
+ shr.u64 %rd2686, %rd2032, 32;
+ add.s64 %rd2033, %rd1, %rd2029;
+ st.local.u32 [%rd2033], %rd2032;
+ add.s32 %r8327, %r8327, 1;
+ cvt.s64.s32 %rd2685, %r8327;
+ setp.ne.s32 %p1133, %r8327, 6;
+ @%p1133 bra $L__BB0_1329;
+
+ st.local.u32 [%rd4], %rd2686;
+ mov.u32 %r6192, 4;
+ sub.s32 %r1768, %r6192, %r1765;
+ mov.u32 %r6193, 6;
+ sub.s32 %r6194, %r6193, %r1765;
+ mul.wide.s32 %rd2034, %r6194, 4;
+ add.s64 %rd2035, %rd1, %rd2034;
+ ld.local.u32 %r8328, [%rd2035];
+ ld.local.u32 %r8329, [%rd2035+-4];
+ and.b32 %r1771, %r1763, 31;
+ setp.eq.s32 %p1134, %r1771, 0;
+ @%p1134 bra $L__BB0_1332;
+
+ mov.u32 %r6195, 32;
+ sub.s32 %r6196, %r6195, %r1771;
+ shr.u32 %r6197, %r8329, %r6196;
+ shl.b32 %r6198, %r8328, %r1771;
+ add.s32 %r8328, %r6197, %r6198;
+ mul.wide.s32 %rd2036, %r1768, 4;
+ add.s64 %rd2037, %rd1, %rd2036;
+ ld.local.u32 %r6199, [%rd2037];
+ shr.u32 %r6200, %r6199, %r6196;
+ shl.b32 %r6201, %r8329, %r1771;
+ add.s32 %r8329, %r6200, %r6201;
+
+$L__BB0_1332:
+ and.b32 %r6202, %r1762, -2147483648;
+ shr.u32 %r6203, %r8329, 30;
+ shl.b32 %r6204, %r8328, 2;
+ or.b32 %r6205, %r6203, %r6204;
+ shr.u32 %r6206, %r6205, 31;
+ shr.u32 %r6207, %r8328, 30;
+ add.s32 %r6208, %r6206, %r6207;
+ neg.s32 %r6209, %r6208;
+ setp.eq.s32 %p1135, %r6202, 0;
+ selp.b32 %r8330, %r6208, %r6209, %p1135;
+ setp.ne.s32 %p1136, %r6206, 0;
+ xor.b32 %r6210, %r6202, -2147483648;
+ selp.b32 %r6211, %r6210, %r6202, %p1136;
+ selp.b32 %r6212, -1, 0, %p1136;
+ xor.b32 %r6213, %r6205, %r6212;
+ shl.b32 %r6214, %r8329, 2;
+ xor.b32 %r6215, %r6214, %r6212;
+ cvt.u64.u32 %rd2038, %r6213;
+ cvt.u64.u32 %rd2039, %r6215;
+ bfi.b64 %rd2040, %rd2038, %rd2039, 32, 32;
+ cvt.rn.f64.s64 %fd181, %rd2040;
+ mul.f64 %fd182, %fd181, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4260, %fd182;
+ setp.eq.s32 %p1137, %r6211, 0;
+ neg.f32 %f4261, %f4260;
+ selp.f32 %f5773, %f4260, %f4261, %p1137;
+
+$L__BB0_1334:
+ and.b32 %r1778, %r8330, 1;
+ setp.eq.s32 %p1138, %r1778, 0;
+ selp.f32 %f1552, %f5773, 0f3F800000, %p1138;
+ mul.rn.f32 %f1553, %f5773, %f5773;
+ mov.f32 %f5774, 0fB94D4153;
+ @%p1138 bra $L__BB0_1336;
+
+ mov.f32 %f4264, 0fBAB607ED;
+ mov.f32 %f4265, 0f37CBAC00;
+ fma.rn.f32 %f5774, %f4265, %f1553, %f4264;
+
+$L__BB0_1336:
+ selp.f32 %f4266, 0f3C0885E4, 0f3D2AAABB, %p1138;
+ fma.rn.f32 %f4267, %f5774, %f1553, %f4266;
+ selp.f32 %f4268, 0fBE2AAAA8, 0fBEFFFFFF, %p1138;
+ fma.rn.f32 %f4269, %f4267, %f1553, %f4268;
+ mov.f32 %f4270, 0f00000000;
+ fma.rn.f32 %f4271, %f1553, %f1552, %f4270;
+ fma.rn.f32 %f5775, %f4269, %f4271, %f1552;
+ and.b32 %r6217, %r8330, 2;
+ setp.eq.s32 %p1140, %r6217, 0;
+ @%p1140 bra $L__BB0_1338;
+
+ mov.f32 %f4273, 0fBF800000;
+ fma.rn.f32 %f5775, %f5775, %f4273, %f4270;
+
+$L__BB0_1338:
+ mul.f32 %f4274, %f1411, 0f3F22F983;
+ cvt.rni.s32.f32 %r8334, %f4274;
+ cvt.rn.f32.s32 %f4275, %r8334;
+ mov.f32 %f4276, 0fBFC90FDA;
+ fma.rn.f32 %f4277, %f4275, %f4276, %f1411;
+ mov.f32 %f4278, 0fB3A22168;
+ fma.rn.f32 %f4279, %f4275, %f4278, %f4277;
+ mov.f32 %f4280, 0fA7C234C5;
+ fma.rn.f32 %f5776, %f4275, %f4280, %f4279;
+ abs.f32 %f1560, %f1411;
+ setp.ltu.f32 %p1141, %f1560, 0f47CE4780;
+ @%p1141 bra $L__BB0_1346;
+
+ setp.eq.f32 %p1142, %f1560, 0f7F800000;
+ @%p1142 bra $L__BB0_1345;
+ bra.uni $L__BB0_1340;
+
+$L__BB0_1345:
+ mov.f32 %f4283, 0f00000000;
+ mul.rn.f32 %f5776, %f1411, %f4283;
+ mov.u32 %r8334, 0;
+ bra.uni $L__BB0_1346;
+
+$L__BB0_1340:
+ mov.b32 %r1780, %f1411;
+ shr.u32 %r6219, %r1780, 23;
+ and.b32 %r6220, %r6219, 255;
+ add.s32 %r1781, %r6220, -128;
+ shl.b32 %r6221, %r1780, 8;
+ or.b32 %r1782, %r6221, -2147483648;
+ shr.u32 %r1783, %r1781, 5;
+ mov.u64 %rd2687, 0;
+ mov.u32 %r8331, 0;
+ mov.u64 %rd2044, __cudart_i2opi_f;
+ mov.u64 %rd2688, %rd2687;
+
+$L__BB0_1341:
.pragma "nounroll";
- shl.b64 %rd801, %rd2520, 2;
- add.s64 %rd803, %rd802, %rd801;
- ld.global.nc.u32 %r3199, [%rd803];
- mad.wide.u32 %rd804, %r3199, %r255, %rd2521;
- shr.u64 %rd2521, %rd804, 32;
- add.s64 %rd805, %rd1, %rd801;
- st.local.u32 [%rd805], %rd804;
- add.s32 %r8270, %r8270, 1;
- cvt.s64.s32 %rd2520, %r8270;
- setp.ne.s32 %p204, %r8270, 6;
- @%p204 bra $L__BB0_214;
-
- st.local.u32 [%rd5], %rd2521;
- mov.u32 %r3200, 4;
- sub.s32 %r259, %r3200, %r256;
- mov.u32 %r3201, 6;
- sub.s32 %r3202, %r3201, %r256;
- mul.wide.s32 %rd806, %r3202, 4;
- add.s64 %rd807, %rd1, %rd806;
- ld.local.u32 %r8271, [%rd807];
- ld.local.u32 %r8272, [%rd807+-4];
- and.b32 %r262, %r254, 31;
- setp.eq.s32 %p205, %r262, 0;
- @%p205 bra $L__BB0_217;
-
- mov.u32 %r3203, 32;
- sub.s32 %r3204, %r3203, %r262;
- shr.u32 %r3205, %r8272, %r3204;
- shl.b32 %r3206, %r8271, %r262;
- add.s32 %r8271, %r3205, %r3206;
- mul.wide.s32 %rd808, %r259, 4;
- add.s64 %rd809, %rd1, %rd808;
- ld.local.u32 %r3207, [%rd809];
- shr.u32 %r3208, %r3207, %r3204;
- shl.b32 %r3209, %r8272, %r262;
- add.s32 %r8272, %r3208, %r3209;
-
-$L__BB0_217:
- and.b32 %r3210, %r253, -2147483648;
- shr.u32 %r3211, %r8272, 30;
- shl.b32 %r3212, %r8271, 2;
- or.b32 %r3213, %r3211, %r3212;
- shr.u32 %r3214, %r3213, 31;
- shr.u32 %r3215, %r8271, 30;
- add.s32 %r3216, %r3214, %r3215;
- neg.s32 %r3217, %r3216;
- setp.eq.s32 %p206, %r3210, 0;
- selp.b32 %r8273, %r3216, %r3217, %p206;
- setp.ne.s32 %p207, %r3214, 0;
- xor.b32 %r3218, %r3210, -2147483648;
- selp.b32 %r3219, %r3218, %r3210, %p207;
- selp.b32 %r3220, -1, 0, %p207;
- xor.b32 %r3221, %r3213, %r3220;
- shl.b32 %r3222, %r8272, 2;
- xor.b32 %r3223, %r3222, %r3220;
- cvt.u64.u32 %rd810, %r3221;
- cvt.u64.u32 %rd811, %r3223;
- bfi.b64 %rd812, %rd810, %rd811, 32, 32;
- cvt.rn.f64.s64 %fd23, %rd812;
- mul.f64 %fd24, %fd23, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2507, %fd24;
- setp.eq.s32 %p208, %r3219, 0;
- neg.f32 %f2508, %f2507;
- selp.f32 %f5247, %f2507, %f2508, %p208;
-
-$L__BB0_219:
- add.s32 %r269, %r8273, 1;
- and.b32 %r270, %r269, 1;
- setp.eq.s32 %p209, %r270, 0;
- selp.f32 %f217, %f5247, 0f3F800000, %p209;
- mul.rn.f32 %f218, %f5247, %f5247;
- mov.f32 %f5248, 0fB94D4153;
- @%p209 bra $L__BB0_221;
-
- mov.f32 %f2511, 0fBAB607ED;
- mov.f32 %f2512, 0f37CBAC00;
- fma.rn.f32 %f5248, %f2512, %f218, %f2511;
-
-$L__BB0_221:
- selp.f32 %f2513, 0f3C0885E4, 0f3D2AAABB, %p209;
- fma.rn.f32 %f2514, %f5248, %f218, %f2513;
- selp.f32 %f2515, 0fBE2AAAA8, 0fBEFFFFFF, %p209;
- fma.rn.f32 %f2516, %f2514, %f218, %f2515;
- mov.f32 %f2517, 0f00000000;
- fma.rn.f32 %f2518, %f218, %f217, %f2517;
- fma.rn.f32 %f5215, %f2516, %f2518, %f217;
- and.b32 %r3225, %r269, 2;
- setp.eq.s32 %p211, %r3225, 0;
- @%p211 bra $L__BB0_223;
-
- mov.f32 %f2520, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f2520, %f2517;
-
-$L__BB0_223:
- selp.f32 %f225, %f5215, %f5216, %p5;
- selp.f32 %f226, %f5213, %f5214, %p5;
- @%p190 bra $L__BB0_225;
-
- add.f32 %f5327, %f226, %f225;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_225:
- add.s32 %r8050, %r13, -15;
- mov.u32 %r3226, -3;
- sub.s32 %r3227, %r3226, %r12;
- setp.ge.s32 %p214, %r8050, %r3227;
- @%p214 bra $L__BB0_254;
-
- shl.b32 %r3229, %r12, 5;
- neg.s32 %r271, %r3229;
- setp.ge.s32 %p215, %r14, %r271;
- @%p215 bra $L__BB0_239;
-
- mul.f32 %f2523, %f5342, 0f3F22F983;
- cvt.rni.s32.f32 %r8277, %f2523;
- cvt.rn.f32.s32 %f2524, %r8277;
- mov.f32 %f2525, 0fBFC90FDA;
- fma.rn.f32 %f2526, %f2524, %f2525, %f5342;
- mov.f32 %f2527, 0fB3A22168;
- fma.rn.f32 %f2528, %f2524, %f2527, %f2526;
- mov.f32 %f2529, 0fA7C234C5;
- fma.rn.f32 %f5256, %f2524, %f2529, %f2528;
- abs.f32 %f234, %f5342;
- setp.ltu.f32 %p216, %f234, 0f47CE4780;
- @%p216 bra $L__BB0_235;
-
- setp.eq.f32 %p217, %f234, 0f7F800000;
- @%p217 bra $L__BB0_234;
- bra.uni $L__BB0_229;
-
-$L__BB0_234:
- mov.f32 %f2532, 0f00000000;
- mul.rn.f32 %f5256, %f5342, %f2532;
- mov.u32 %r8277, 0;
- bra.uni $L__BB0_235;
-
-$L__BB0_229:
- mov.b32 %r273, %f5342;
- shr.u32 %r3231, %r273, 23;
- and.b32 %r3232, %r3231, 255;
- add.s32 %r274, %r3232, -128;
- shl.b32 %r3233, %r273, 8;
- or.b32 %r275, %r3233, -2147483648;
- shr.u32 %r276, %r274, 5;
- mov.u64 %rd2522, 0;
- mov.u32 %r8274, 0;
- mov.u64 %rd816, __cudart_i2opi_f;
- mov.u64 %rd2523, %rd2522;
-
-$L__BB0_230:
- .pragma "nounroll";
- shl.b64 %rd815, %rd2522, 2;
- add.s64 %rd817, %rd816, %rd815;
- ld.global.nc.u32 %r3234, [%rd817];
- mad.wide.u32 %rd818, %r3234, %r275, %rd2523;
- shr.u64 %rd2523, %rd818, 32;
- add.s64 %rd819, %rd1, %rd815;
- st.local.u32 [%rd819], %rd818;
- add.s32 %r8274, %r8274, 1;
- cvt.s64.s32 %rd2522, %r8274;
- setp.ne.s32 %p218, %r8274, 6;
- @%p218 bra $L__BB0_230;
-
- st.local.u32 [%rd5], %rd2523;
- mov.u32 %r3235, 4;
- sub.s32 %r279, %r3235, %r276;
- mov.u32 %r3236, 6;
- sub.s32 %r3237, %r3236, %r276;
- mul.wide.s32 %rd820, %r3237, 4;
- add.s64 %rd821, %rd1, %rd820;
- ld.local.u32 %r8275, [%rd821];
- ld.local.u32 %r8276, [%rd821+-4];
- and.b32 %r282, %r274, 31;
- setp.eq.s32 %p219, %r282, 0;
- @%p219 bra $L__BB0_233;
-
- mov.u32 %r3238, 32;
- sub.s32 %r3239, %r3238, %r282;
- shr.u32 %r3240, %r8276, %r3239;
- shl.b32 %r3241, %r8275, %r282;
- add.s32 %r8275, %r3240, %r3241;
- mul.wide.s32 %rd822, %r279, 4;
- add.s64 %rd823, %rd1, %rd822;
- ld.local.u32 %r3242, [%rd823];
- shr.u32 %r3243, %r3242, %r3239;
- shl.b32 %r3244, %r8276, %r282;
- add.s32 %r8276, %r3243, %r3244;
-
-$L__BB0_233:
- and.b32 %r3245, %r273, -2147483648;
- shr.u32 %r3246, %r8276, 30;
- shl.b32 %r3247, %r8275, 2;
- or.b32 %r3248, %r3246, %r3247;
- shr.u32 %r3249, %r3248, 31;
- shr.u32 %r3250, %r8275, 30;
- add.s32 %r3251, %r3249, %r3250;
- neg.s32 %r3252, %r3251;
- setp.eq.s32 %p220, %r3245, 0;
- selp.b32 %r8277, %r3251, %r3252, %p220;
- setp.ne.s32 %p221, %r3249, 0;
- xor.b32 %r3253, %r3245, -2147483648;
- selp.b32 %r3254, %r3253, %r3245, %p221;
- selp.b32 %r3255, -1, 0, %p221;
- xor.b32 %r3256, %r3248, %r3255;
- shl.b32 %r3257, %r8276, 2;
- xor.b32 %r3258, %r3257, %r3255;
- cvt.u64.u32 %rd824, %r3256;
- cvt.u64.u32 %rd825, %r3258;
- bfi.b64 %rd826, %rd824, %rd825, 32, 32;
- cvt.rn.f64.s64 %fd25, %rd826;
- mul.f64 %fd26, %fd25, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2530, %fd26;
- setp.eq.s32 %p222, %r3254, 0;
- neg.f32 %f2531, %f2530;
- selp.f32 %f5256, %f2530, %f2531, %p222;
-
-$L__BB0_235:
- and.b32 %r289, %r8277, 1;
- setp.eq.s32 %p223, %r289, 0;
- selp.f32 %f238, %f5256, 0f3F800000, %p223;
- mul.rn.f32 %f239, %f5256, %f5256;
- mov.f32 %f5257, 0fB94D4153;
- @%p223 bra $L__BB0_237;
-
- mov.f32 %f2534, 0fBAB607ED;
- mov.f32 %f2535, 0f37CBAC00;
- fma.rn.f32 %f5257, %f2535, %f239, %f2534;
-
-$L__BB0_237:
- selp.f32 %f2536, 0f3C0885E4, 0f3D2AAABB, %p223;
- fma.rn.f32 %f2537, %f5257, %f239, %f2536;
- selp.f32 %f2538, 0fBE2AAAA8, 0fBEFFFFFF, %p223;
- fma.rn.f32 %f2539, %f2537, %f239, %f2538;
- mov.f32 %f2540, 0f00000000;
- fma.rn.f32 %f2541, %f239, %f238, %f2540;
- fma.rn.f32 %f5213, %f2539, %f2541, %f238;
- and.b32 %r3260, %r8277, 2;
- setp.eq.s32 %p225, %r3260, 0;
- @%p225 bra $L__BB0_239;
-
- mov.f32 %f2543, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f2543, %f2540;
-
-$L__BB0_239:
- setp.lt.s32 %p6, %r14, %r271;
- @%p215 bra $L__BB0_252;
-
- mul.f32 %f2544, %f5334, 0f3F22F983;
- cvt.rni.s32.f32 %r8281, %f2544;
- cvt.rn.f32.s32 %f2545, %r8281;
- mov.f32 %f2546, 0fBFC90FDA;
- fma.rn.f32 %f2547, %f2545, %f2546, %f5334;
- mov.f32 %f2548, 0fB3A22168;
- fma.rn.f32 %f2549, %f2545, %f2548, %f2547;
- mov.f32 %f2550, 0fA7C234C5;
- fma.rn.f32 %f5260, %f2545, %f2550, %f2549;
- abs.f32 %f247, %f5334;
- setp.ltu.f32 %p227, %f247, 0f47CE4780;
- @%p227 bra $L__BB0_248;
-
- setp.eq.f32 %p228, %f247, 0f7F800000;
- @%p228 bra $L__BB0_247;
- bra.uni $L__BB0_242;
-
-$L__BB0_247:
- mov.f32 %f2553, 0f00000000;
- mul.rn.f32 %f5260, %f5334, %f2553;
- mov.u32 %r8281, 0;
- bra.uni $L__BB0_248;
-
-$L__BB0_242:
- mov.b32 %r291, %f5334;
- shr.u32 %r3262, %r291, 23;
- and.b32 %r3263, %r3262, 255;
- add.s32 %r292, %r3263, -128;
- shl.b32 %r3264, %r291, 8;
- or.b32 %r293, %r3264, -2147483648;
- shr.u32 %r294, %r292, 5;
- mov.u64 %rd2524, 0;
- mov.u32 %r8278, 0;
- mov.u64 %rd830, __cudart_i2opi_f;
- mov.u64 %rd2525, %rd2524;
-
-$L__BB0_243:
- .pragma "nounroll";
- shl.b64 %rd829, %rd2524, 2;
- add.s64 %rd831, %rd830, %rd829;
- ld.global.nc.u32 %r3265, [%rd831];
- mad.wide.u32 %rd832, %r3265, %r293, %rd2525;
- shr.u64 %rd2525, %rd832, 32;
- add.s64 %rd833, %rd1, %rd829;
- st.local.u32 [%rd833], %rd832;
- add.s32 %r8278, %r8278, 1;
- cvt.s64.s32 %rd2524, %r8278;
- setp.ne.s32 %p229, %r8278, 6;
- @%p229 bra $L__BB0_243;
-
- st.local.u32 [%rd5], %rd2525;
- mov.u32 %r3266, 4;
- sub.s32 %r297, %r3266, %r294;
- mov.u32 %r3267, 6;
- sub.s32 %r3268, %r3267, %r294;
- mul.wide.s32 %rd834, %r3268, 4;
- add.s64 %rd835, %rd1, %rd834;
- ld.local.u32 %r8279, [%rd835];
- ld.local.u32 %r8280, [%rd835+-4];
- and.b32 %r300, %r292, 31;
- setp.eq.s32 %p230, %r300, 0;
- @%p230 bra $L__BB0_246;
-
- mov.u32 %r3269, 32;
- sub.s32 %r3270, %r3269, %r300;
- shr.u32 %r3271, %r8280, %r3270;
- shl.b32 %r3272, %r8279, %r300;
- add.s32 %r8279, %r3271, %r3272;
- mul.wide.s32 %rd836, %r297, 4;
- add.s64 %rd837, %rd1, %rd836;
- ld.local.u32 %r3273, [%rd837];
- shr.u32 %r3274, %r3273, %r3270;
- shl.b32 %r3275, %r8280, %r300;
- add.s32 %r8280, %r3274, %r3275;
-
-$L__BB0_246:
- and.b32 %r3276, %r291, -2147483648;
- shr.u32 %r3277, %r8280, 30;
- shl.b32 %r3278, %r8279, 2;
- or.b32 %r3279, %r3277, %r3278;
- shr.u32 %r3280, %r3279, 31;
- shr.u32 %r3281, %r8279, 30;
- add.s32 %r3282, %r3280, %r3281;
- neg.s32 %r3283, %r3282;
- setp.eq.s32 %p231, %r3276, 0;
- selp.b32 %r8281, %r3282, %r3283, %p231;
- setp.ne.s32 %p232, %r3280, 0;
- xor.b32 %r3284, %r3276, -2147483648;
- selp.b32 %r3285, %r3284, %r3276, %p232;
- selp.b32 %r3286, -1, 0, %p232;
- xor.b32 %r3287, %r3279, %r3286;
- shl.b32 %r3288, %r8280, 2;
- xor.b32 %r3289, %r3288, %r3286;
- cvt.u64.u32 %rd838, %r3287;
- cvt.u64.u32 %rd839, %r3289;
- bfi.b64 %rd840, %rd838, %rd839, 32, 32;
- cvt.rn.f64.s64 %fd27, %rd840;
- mul.f64 %fd28, %fd27, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2551, %fd28;
- setp.eq.s32 %p233, %r3285, 0;
- neg.f32 %f2552, %f2551;
- selp.f32 %f5260, %f2551, %f2552, %p233;
-
-$L__BB0_248:
- add.s32 %r307, %r8281, 1;
- and.b32 %r308, %r307, 1;
- setp.eq.s32 %p234, %r308, 0;
- selp.f32 %f251, %f5260, 0f3F800000, %p234;
- mul.rn.f32 %f252, %f5260, %f5260;
- mov.f32 %f5261, 0fB94D4153;
- @%p234 bra $L__BB0_250;
-
- mov.f32 %f2555, 0fBAB607ED;
- mov.f32 %f2556, 0f37CBAC00;
- fma.rn.f32 %f5261, %f2556, %f252, %f2555;
-
-$L__BB0_250:
- selp.f32 %f2557, 0f3C0885E4, 0f3D2AAABB, %p234;
- fma.rn.f32 %f2558, %f5261, %f252, %f2557;
- selp.f32 %f2559, 0fBE2AAAA8, 0fBEFFFFFF, %p234;
- fma.rn.f32 %f2560, %f2558, %f252, %f2559;
- mov.f32 %f2561, 0f00000000;
- fma.rn.f32 %f2562, %f252, %f251, %f2561;
- fma.rn.f32 %f5215, %f2560, %f2562, %f251;
- and.b32 %r3291, %r307, 2;
- setp.eq.s32 %p236, %r3291, 0;
- @%p236 bra $L__BB0_252;
-
- mov.f32 %f2564, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f2564, %f2561;
-
-$L__BB0_252:
- selp.f32 %f259, %f5215, %f5216, %p6;
- selp.f32 %f260, %f5213, %f5214, %p6;
- @%p215 bra $L__BB0_254;
-
- add.f32 %f5326, %f260, %f259;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_254:
- @%p214 bra $L__BB0_476;
-
- shl.b32 %r3295, %r12, 5;
- mov.u32 %r3296, -32;
- sub.s32 %r309, %r3296, %r3295;
- setp.ge.s32 %p240, %r14, %r309;
- @%p240 bra $L__BB0_268;
-
- mul.f32 %f2567, %f5341, 0f3F22F983;
- cvt.rni.s32.f32 %r8285, %f2567;
- cvt.rn.f32.s32 %f2568, %r8285;
- mov.f32 %f2569, 0fBFC90FDA;
- fma.rn.f32 %f2570, %f2568, %f2569, %f5341;
- mov.f32 %f2571, 0fB3A22168;
- fma.rn.f32 %f2572, %f2568, %f2571, %f2570;
- mov.f32 %f2573, 0fA7C234C5;
- fma.rn.f32 %f5269, %f2568, %f2573, %f2572;
- abs.f32 %f268, %f5341;
- setp.ltu.f32 %p241, %f268, 0f47CE4780;
- @%p241 bra $L__BB0_264;
-
- setp.eq.f32 %p242, %f268, 0f7F800000;
- @%p242 bra $L__BB0_263;
- bra.uni $L__BB0_258;
-
-$L__BB0_263:
- mov.f32 %f2576, 0f00000000;
- mul.rn.f32 %f5269, %f5341, %f2576;
- mov.u32 %r8285, 0;
- bra.uni $L__BB0_264;
-
-$L__BB0_258:
- mov.b32 %r311, %f5341;
- shr.u32 %r3298, %r311, 23;
- and.b32 %r3299, %r3298, 255;
- add.s32 %r312, %r3299, -128;
- shl.b32 %r3300, %r311, 8;
- or.b32 %r313, %r3300, -2147483648;
- shr.u32 %r314, %r312, 5;
- mov.u64 %rd2526, 0;
- mov.u32 %r8282, 0;
- mov.u64 %rd844, __cudart_i2opi_f;
- mov.u64 %rd2527, %rd2526;
-
-$L__BB0_259:
- .pragma "nounroll";
- shl.b64 %rd843, %rd2526, 2;
- add.s64 %rd845, %rd844, %rd843;
- ld.global.nc.u32 %r3301, [%rd845];
- mad.wide.u32 %rd846, %r3301, %r313, %rd2527;
- shr.u64 %rd2527, %rd846, 32;
- add.s64 %rd847, %rd1, %rd843;
- st.local.u32 [%rd847], %rd846;
- add.s32 %r8282, %r8282, 1;
- cvt.s64.s32 %rd2526, %r8282;
- setp.ne.s32 %p243, %r8282, 6;
- @%p243 bra $L__BB0_259;
-
- st.local.u32 [%rd5], %rd2527;
- mov.u32 %r3302, 4;
- sub.s32 %r317, %r3302, %r314;
- mov.u32 %r3303, 6;
- sub.s32 %r3304, %r3303, %r314;
- mul.wide.s32 %rd848, %r3304, 4;
- add.s64 %rd849, %rd1, %rd848;
- ld.local.u32 %r8283, [%rd849];
- ld.local.u32 %r8284, [%rd849+-4];
- and.b32 %r320, %r312, 31;
- setp.eq.s32 %p244, %r320, 0;
- @%p244 bra $L__BB0_262;
-
- mov.u32 %r3305, 32;
- sub.s32 %r3306, %r3305, %r320;
- shr.u32 %r3307, %r8284, %r3306;
- shl.b32 %r3308, %r8283, %r320;
- add.s32 %r8283, %r3307, %r3308;
- mul.wide.s32 %rd850, %r317, 4;
- add.s64 %rd851, %rd1, %rd850;
- ld.local.u32 %r3309, [%rd851];
- shr.u32 %r3310, %r3309, %r3306;
- shl.b32 %r3311, %r8284, %r320;
- add.s32 %r8284, %r3310, %r3311;
-
-$L__BB0_262:
- and.b32 %r3312, %r311, -2147483648;
- shr.u32 %r3313, %r8284, 30;
- shl.b32 %r3314, %r8283, 2;
- or.b32 %r3315, %r3313, %r3314;
- shr.u32 %r3316, %r3315, 31;
- shr.u32 %r3317, %r8283, 30;
- add.s32 %r3318, %r3316, %r3317;
- neg.s32 %r3319, %r3318;
- setp.eq.s32 %p245, %r3312, 0;
- selp.b32 %r8285, %r3318, %r3319, %p245;
- setp.ne.s32 %p246, %r3316, 0;
- xor.b32 %r3320, %r3312, -2147483648;
- selp.b32 %r3321, %r3320, %r3312, %p246;
- selp.b32 %r3322, -1, 0, %p246;
- xor.b32 %r3323, %r3315, %r3322;
- shl.b32 %r3324, %r8284, 2;
- xor.b32 %r3325, %r3324, %r3322;
- cvt.u64.u32 %rd852, %r3323;
- cvt.u64.u32 %rd853, %r3325;
- bfi.b64 %rd854, %rd852, %rd853, 32, 32;
- cvt.rn.f64.s64 %fd29, %rd854;
- mul.f64 %fd30, %fd29, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2574, %fd30;
- setp.eq.s32 %p247, %r3321, 0;
- neg.f32 %f2575, %f2574;
- selp.f32 %f5269, %f2574, %f2575, %p247;
-
-$L__BB0_264:
- and.b32 %r327, %r8285, 1;
- setp.eq.s32 %p248, %r327, 0;
- selp.f32 %f272, %f5269, 0f3F800000, %p248;
- mul.rn.f32 %f273, %f5269, %f5269;
- mov.f32 %f5270, 0fB94D4153;
- @%p248 bra $L__BB0_266;
-
- mov.f32 %f2578, 0fBAB607ED;
- mov.f32 %f2579, 0f37CBAC00;
- fma.rn.f32 %f5270, %f2579, %f273, %f2578;
-
-$L__BB0_266:
- selp.f32 %f2580, 0f3C0885E4, 0f3D2AAABB, %p248;
- fma.rn.f32 %f2581, %f5270, %f273, %f2580;
- selp.f32 %f2582, 0fBE2AAAA8, 0fBEFFFFFF, %p248;
- fma.rn.f32 %f2583, %f2581, %f273, %f2582;
- mov.f32 %f2584, 0f00000000;
- fma.rn.f32 %f2585, %f273, %f272, %f2584;
- fma.rn.f32 %f5213, %f2583, %f2585, %f272;
- and.b32 %r3327, %r8285, 2;
- setp.eq.s32 %p250, %r3327, 0;
- @%p250 bra $L__BB0_268;
-
- mov.f32 %f2587, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f2587, %f2584;
-
-$L__BB0_268:
- setp.lt.s32 %p7, %r14, %r309;
- @%p240 bra $L__BB0_281;
-
- mul.f32 %f2588, %f5333, 0f3F22F983;
- cvt.rni.s32.f32 %r8289, %f2588;
- cvt.rn.f32.s32 %f2589, %r8289;
- mov.f32 %f2590, 0fBFC90FDA;
- fma.rn.f32 %f2591, %f2589, %f2590, %f5333;
- mov.f32 %f2592, 0fB3A22168;
- fma.rn.f32 %f2593, %f2589, %f2592, %f2591;
- mov.f32 %f2594, 0fA7C234C5;
- fma.rn.f32 %f5273, %f2589, %f2594, %f2593;
- abs.f32 %f281, %f5333;
- setp.ltu.f32 %p252, %f281, 0f47CE4780;
- @%p252 bra $L__BB0_277;
-
- setp.eq.f32 %p253, %f281, 0f7F800000;
- @%p253 bra $L__BB0_276;
- bra.uni $L__BB0_271;
-
-$L__BB0_276:
- mov.f32 %f2597, 0f00000000;
- mul.rn.f32 %f5273, %f5333, %f2597;
- mov.u32 %r8289, 0;
- bra.uni $L__BB0_277;
-
-$L__BB0_271:
- mov.b32 %r329, %f5333;
- shr.u32 %r3329, %r329, 23;
- and.b32 %r3330, %r3329, 255;
- add.s32 %r330, %r3330, -128;
- shl.b32 %r3331, %r329, 8;
- or.b32 %r331, %r3331, -2147483648;
- shr.u32 %r332, %r330, 5;
- mov.u64 %rd2528, 0;
- mov.u32 %r8286, 0;
- mov.u64 %rd858, __cudart_i2opi_f;
- mov.u64 %rd2529, %rd2528;
-
-$L__BB0_272:
- .pragma "nounroll";
- shl.b64 %rd857, %rd2528, 2;
- add.s64 %rd859, %rd858, %rd857;
- ld.global.nc.u32 %r3332, [%rd859];
- mad.wide.u32 %rd860, %r3332, %r331, %rd2529;
- shr.u64 %rd2529, %rd860, 32;
- add.s64 %rd861, %rd1, %rd857;
- st.local.u32 [%rd861], %rd860;
- add.s32 %r8286, %r8286, 1;
- cvt.s64.s32 %rd2528, %r8286;
- setp.ne.s32 %p254, %r8286, 6;
- @%p254 bra $L__BB0_272;
-
- st.local.u32 [%rd5], %rd2529;
- mov.u32 %r3333, 4;
- sub.s32 %r335, %r3333, %r332;
- mov.u32 %r3334, 6;
- sub.s32 %r3335, %r3334, %r332;
- mul.wide.s32 %rd862, %r3335, 4;
- add.s64 %rd863, %rd1, %rd862;
- ld.local.u32 %r8287, [%rd863];
- ld.local.u32 %r8288, [%rd863+-4];
- and.b32 %r338, %r330, 31;
- setp.eq.s32 %p255, %r338, 0;
- @%p255 bra $L__BB0_275;
-
- mov.u32 %r3336, 32;
- sub.s32 %r3337, %r3336, %r338;
- shr.u32 %r3338, %r8288, %r3337;
- shl.b32 %r3339, %r8287, %r338;
- add.s32 %r8287, %r3338, %r3339;
- mul.wide.s32 %rd864, %r335, 4;
- add.s64 %rd865, %rd1, %rd864;
- ld.local.u32 %r3340, [%rd865];
- shr.u32 %r3341, %r3340, %r3337;
- shl.b32 %r3342, %r8288, %r338;
- add.s32 %r8288, %r3341, %r3342;
-
-$L__BB0_275:
- and.b32 %r3343, %r329, -2147483648;
- shr.u32 %r3344, %r8288, 30;
- shl.b32 %r3345, %r8287, 2;
- or.b32 %r3346, %r3344, %r3345;
- shr.u32 %r3347, %r3346, 31;
- shr.u32 %r3348, %r8287, 30;
- add.s32 %r3349, %r3347, %r3348;
- neg.s32 %r3350, %r3349;
- setp.eq.s32 %p256, %r3343, 0;
- selp.b32 %r8289, %r3349, %r3350, %p256;
- setp.ne.s32 %p257, %r3347, 0;
- xor.b32 %r3351, %r3343, -2147483648;
- selp.b32 %r3352, %r3351, %r3343, %p257;
- selp.b32 %r3353, -1, 0, %p257;
- xor.b32 %r3354, %r3346, %r3353;
- shl.b32 %r3355, %r8288, 2;
- xor.b32 %r3356, %r3355, %r3353;
- cvt.u64.u32 %rd866, %r3354;
- cvt.u64.u32 %rd867, %r3356;
- bfi.b64 %rd868, %rd866, %rd867, 32, 32;
- cvt.rn.f64.s64 %fd31, %rd868;
- mul.f64 %fd32, %fd31, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2595, %fd32;
- setp.eq.s32 %p258, %r3352, 0;
- neg.f32 %f2596, %f2595;
- selp.f32 %f5273, %f2595, %f2596, %p258;
-
-$L__BB0_277:
- add.s32 %r345, %r8289, 1;
- and.b32 %r346, %r345, 1;
- setp.eq.s32 %p259, %r346, 0;
- selp.f32 %f285, %f5273, 0f3F800000, %p259;
- mul.rn.f32 %f286, %f5273, %f5273;
- mov.f32 %f5274, 0fB94D4153;
- @%p259 bra $L__BB0_279;
-
- mov.f32 %f2599, 0fBAB607ED;
- mov.f32 %f2600, 0f37CBAC00;
- fma.rn.f32 %f5274, %f2600, %f286, %f2599;
-
-$L__BB0_279:
- selp.f32 %f2601, 0f3C0885E4, 0f3D2AAABB, %p259;
- fma.rn.f32 %f2602, %f5274, %f286, %f2601;
- selp.f32 %f2603, 0fBE2AAAA8, 0fBEFFFFFF, %p259;
- fma.rn.f32 %f2604, %f2602, %f286, %f2603;
- mov.f32 %f2605, 0f00000000;
- fma.rn.f32 %f2606, %f286, %f285, %f2605;
- fma.rn.f32 %f5215, %f2604, %f2606, %f285;
- and.b32 %r3358, %r345, 2;
- setp.eq.s32 %p261, %r3358, 0;
- @%p261 bra $L__BB0_281;
-
- mov.f32 %f2608, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f2608, %f2605;
-
-$L__BB0_281:
- selp.f32 %f293, %f5215, %f5216, %p7;
- selp.f32 %f294, %f5213, %f5214, %p7;
- @%p240 bra $L__BB0_476;
-
- add.f32 %f5325, %f294, %f293;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_476:
- @%p32 bra $L__BB0_478;
-
- shl.b32 %r3908, %r12, 2;
- mov.u32 %r3909, -4;
- sub.s32 %r3910, %r3909, %r3908;
- add.s32 %r3911, %r13, -12;
- setp.lt.s32 %p425, %r3911, %r3910;
- @%p425 bra $L__BB0_758;
- bra.uni $L__BB0_478;
-
-$L__BB0_758:
- mov.u32 %r4637, %ctaid.x;
- mul.lo.s32 %r4638, %r2614, %r4637;
- shl.b32 %r4639, %r12, 5;
- add.s32 %r4640, %r4639, %r1;
- mul.hi.s32 %r4641, %r4640, -1840700269;
- add.s32 %r4642, %r4641, %r4640;
- shr.u32 %r4643, %r4642, 31;
- shr.s32 %r4644, %r4642, 2;
- add.s32 %r4645, %r4644, %r4643;
- mul.lo.s32 %r4646, %r4645, %r2615;
- mul.lo.s32 %r4647, %r4645, 7;
- sub.s32 %r4648, %r4640, %r4647;
- mul.lo.s32 %r4649, %r4648, %r2616;
- add.s32 %r4650, %r13, 4;
- mad.lo.s32 %r4651, %r4650, %r2613, %r4638;
- add.s32 %r4652, %r4651, %r4646;
- add.s32 %r4653, %r4652, %r4649;
- mul.wide.s32 %rd1373, %r4653, 4;
- add.s64 %rd1374, %rd3, %rd1373;
- ld.global.f32 %f839, [%rd1374];
- add.s32 %r4654, %r4640, 32;
- mul.hi.s32 %r4655, %r4654, -1840700269;
- add.s32 %r4656, %r4655, %r4654;
- shr.u32 %r4657, %r4656, 31;
- shr.s32 %r4658, %r4656, 2;
- add.s32 %r4659, %r4658, %r4657;
- mul.lo.s32 %r4660, %r4659, %r2615;
- mul.lo.s32 %r4661, %r4659, 7;
- sub.s32 %r4662, %r4654, %r4661;
- mul.lo.s32 %r4663, %r4662, %r2616;
- add.s32 %r4664, %r4651, %r4660;
- add.s32 %r4665, %r4664, %r4663;
- mul.wide.s32 %rd1375, %r4665, 4;
- add.s64 %rd1376, %rd3, %rd1375;
- ld.global.f32 %f840, [%rd1376];
- add.s32 %r4666, %r2612, %r4638;
- mad.lo.s32 %r4667, %r13, %r2613, %r4666;
- add.s32 %r4668, %r4667, %r4646;
- add.s32 %r4669, %r4668, %r4649;
- mul.wide.s32 %rd1377, %r4669, 4;
- add.s64 %rd1378, %rd3, %rd1377;
- ld.global.f32 %f841, [%rd1378];
- add.s32 %r4670, %r4667, %r4660;
- add.s32 %r4671, %r4670, %r4663;
- mul.wide.s32 %rd1379, %r4671, 4;
- add.s64 %rd1380, %rd3, %rd1379;
- ld.global.f32 %f842, [%rd1380];
- mul.wide.s32 %rd1381, %r2613, 4;
- add.s64 %rd1382, %rd1378, %rd1381;
- ld.global.f32 %f843, [%rd1382];
- add.s64 %rd1383, %rd1380, %rd1381;
- ld.global.f32 %f844, [%rd1383];
- add.s64 %rd1384, %rd1382, %rd1381;
- ld.global.f32 %f845, [%rd1384];
- add.s64 %rd1385, %rd1383, %rd1381;
- ld.global.f32 %f846, [%rd1385];
- mul.hi.s32 %r4672, %r4640, 954437177;
- shr.u32 %r4673, %r4672, 31;
- shr.s32 %r4674, %r4672, 1;
- add.s32 %r4675, %r4674, %r4673;
- mul.lo.s32 %r4676, %r4675, %r2605;
- mul.lo.s32 %r4677, %r4675, 9;
- sub.s32 %r4678, %r4640, %r4677;
- mul.lo.s32 %r4679, %r4678, %r2606;
- add.s32 %r4680, %r13, 1;
- mul.lo.s32 %r4681, %r4680, %r2603;
- mad.lo.s32 %r4682, %r2604, %r4637, %r2602;
- add.s32 %r4683, %r4682, %r4681;
- add.s32 %r4684, %r4683, %r4676;
- add.s32 %r4685, %r4684, %r4679;
- mul.wide.s32 %rd1386, %r4685, 4;
- add.s64 %rd1387, %rd2, %rd1386;
- ld.global.f32 %f847, [%rd1387];
- mul.hi.s32 %r4686, %r4654, 954437177;
- shr.u32 %r4687, %r4686, 31;
- shr.s32 %r4688, %r4686, 1;
- add.s32 %r4689, %r4688, %r4687;
- mul.lo.s32 %r4690, %r4689, %r2605;
- mul.lo.s32 %r4691, %r4689, 9;
- sub.s32 %r4692, %r4654, %r4691;
- mul.lo.s32 %r4693, %r4692, %r2606;
- add.s32 %r4694, %r4683, %r4690;
- add.s32 %r4695, %r4694, %r4693;
- mul.wide.s32 %rd1388, %r4695, 4;
- add.s64 %rd1389, %rd2, %rd1388;
- ld.global.f32 %f848, [%rd1389];
- add.s32 %r4696, %r4683, %r2603;
- add.s32 %r4697, %r4696, %r4676;
- add.s32 %r4698, %r4697, %r4679;
- mul.wide.s32 %rd1390, %r4698, 4;
- add.s64 %rd1391, %rd2, %rd1390;
- ld.global.f32 %f849, [%rd1391];
- add.s32 %r4699, %r4696, %r4690;
- add.s32 %r4700, %r4699, %r4693;
- mul.wide.s32 %rd1392, %r4700, 4;
- add.s64 %rd1393, %rd2, %rd1392;
- ld.global.f32 %f850, [%rd1393];
- add.s32 %r4701, %r4682, %r2602;
- mad.lo.s32 %r4702, %r13, %r2603, %r4701;
- add.s32 %r4703, %r4702, %r4676;
- add.s32 %r4704, %r4703, %r4679;
- mul.wide.s32 %rd1394, %r4704, 4;
- add.s64 %rd1395, %rd2, %rd1394;
- ld.global.f32 %f851, [%rd1395];
- add.s32 %r4705, %r4702, %r4690;
- add.s32 %r4706, %r4705, %r4693;
- mul.wide.s32 %rd1396, %r4706, 4;
- add.s64 %rd1397, %rd2, %rd1396;
- ld.global.f32 %f852, [%rd1397];
- add.s32 %r4707, %r4701, %r4681;
- add.s32 %r4708, %r4707, %r4676;
- add.s32 %r4709, %r4708, %r4679;
- mul.wide.s32 %rd1398, %r4709, 4;
- add.s64 %rd1399, %rd2, %rd1398;
- ld.global.f32 %f853, [%rd1399];
- add.s32 %r4710, %r4707, %r4690;
- add.s32 %r4711, %r4710, %r4693;
- mul.wide.s32 %rd1400, %r4711, 4;
- add.s64 %rd1401, %rd2, %rd1400;
- ld.global.f32 %f854, [%rd1401];
- mul.f32 %f3299, %f847, 0f3F22F983;
- cvt.rni.s32.f32 %r8421, %f3299;
- cvt.rn.f32.s32 %f3300, %r8421;
- mov.f32 %f3301, 0fBFC90FDA;
- fma.rn.f32 %f3302, %f3300, %f3301, %f847;
- mov.f32 %f3303, 0fB3A22168;
- fma.rn.f32 %f3304, %f3300, %f3303, %f3302;
- mov.f32 %f3305, 0fA7C234C5;
- fma.rn.f32 %f5468, %f3300, %f3305, %f3304;
- abs.f32 %f856, %f847;
- setp.ltu.f32 %p658, %f856, 0f47CE4780;
- @%p658 bra $L__BB0_766;
-
- setp.eq.f32 %p659, %f856, 0f7F800000;
- @%p659 bra $L__BB0_765;
- bra.uni $L__BB0_760;
-
-$L__BB0_765:
- mov.f32 %f3308, 0f00000000;
- mul.rn.f32 %f5468, %f847, %f3308;
- mov.u32 %r8421, 0;
- bra.uni $L__BB0_766;
-
-$L__BB0_478:
- mov.u32 %r643, %ctaid.x;
- mul.lo.s32 %r644, %r2614, %r643;
- add.s32 %r3912, %r13, -15;
- mov.u32 %r3913, -4;
- sub.s32 %r645, %r3913, %r12;
- setp.ge.s32 %p426, %r3912, %r645;
- add.s32 %r3914, %r13, 4;
- mad.lo.s32 %r646, %r3914, %r2613, %r644;
- @%p426 bra $L__BB0_481;
-
- shl.b32 %r647, %r12, 5;
- neg.s32 %r3915, %r647;
- setp.ge.s32 %p427, %r14, %r3915;
- @%p427 bra $L__BB0_481;
-
- add.s32 %r3916, %r647, %r1;
- mul.hi.s32 %r3917, %r3916, -1840700269;
- add.s32 %r3918, %r3917, %r3916;
- shr.u32 %r3919, %r3918, 31;
- shr.s32 %r3920, %r3918, 2;
- add.s32 %r3921, %r3920, %r3919;
- mul.lo.s32 %r3922, %r3921, 7;
- sub.s32 %r3923, %r3916, %r3922;
- mad.lo.s32 %r3924, %r3921, %r2615, %r646;
- mad.lo.s32 %r3925, %r3923, %r2616, %r3924;
- mul.wide.s32 %rd1117, %r3925, 4;
- add.s64 %rd1118, %rd3, %rd1117;
- ld.global.f32 %f5531, [%rd1118];
-
-$L__BB0_481:
- @%p426 bra $L__BB0_484;
-
- shl.b32 %r648, %r12, 5;
- mov.u32 %r3927, -32;
- sub.s32 %r3928, %r3927, %r648;
- setp.ge.s32 %p429, %r14, %r3928;
- @%p429 bra $L__BB0_484;
-
- add.s32 %r3929, %r648, %r1;
- add.s32 %r3930, %r3929, 32;
- mul.hi.s32 %r3931, %r3930, -1840700269;
- add.s32 %r3932, %r3931, %r3930;
- shr.u32 %r3933, %r3932, 31;
- shr.s32 %r3934, %r3932, 2;
- add.s32 %r3935, %r3934, %r3933;
- mul.lo.s32 %r3936, %r3935, 7;
- sub.s32 %r3937, %r3930, %r3936;
- mad.lo.s32 %r3938, %r3935, %r2615, %r646;
- mad.lo.s32 %r3939, %r3937, %r2616, %r3938;
- mul.wide.s32 %rd1119, %r3939, 4;
- add.s64 %rd1120, %rd3, %rd1119;
- ld.global.f32 %f5339, [%rd1120];
-
-$L__BB0_484:
- mov.u32 %r3941, -5;
- sub.s32 %r649, %r3941, %r12;
- setp.ge.s32 %p430, %r3912, %r649;
- add.s32 %r3942, %r2612, %r644;
- mad.lo.s32 %r650, %r13, %r2613, %r3942;
- @%p430 bra $L__BB0_487;
-
- shl.b32 %r651, %r12, 5;
- neg.s32 %r3943, %r651;
- setp.ge.s32 %p431, %r14, %r3943;
- @%p431 bra $L__BB0_487;
-
- add.s32 %r3944, %r651, %r1;
- mul.hi.s32 %r3945, %r3944, -1840700269;
- add.s32 %r3946, %r3945, %r3944;
- shr.u32 %r3947, %r3946, 31;
- shr.s32 %r3948, %r3946, 2;
- add.s32 %r3949, %r3948, %r3947;
- mul.lo.s32 %r3950, %r3949, 7;
- sub.s32 %r3951, %r3944, %r3950;
- mad.lo.s32 %r3952, %r3949, %r2615, %r650;
- mad.lo.s32 %r3953, %r3951, %r2616, %r3952;
- mul.wide.s32 %rd1121, %r3953, 4;
- add.s64 %rd1122, %rd3, %rd1121;
- ld.global.f32 %f5338, [%rd1122];
-
-$L__BB0_487:
- @%p430 bra $L__BB0_490;
-
- shl.b32 %r652, %r12, 5;
- mov.u32 %r3955, -32;
- sub.s32 %r3956, %r3955, %r652;
- setp.ge.s32 %p433, %r14, %r3956;
- @%p433 bra $L__BB0_490;
-
- add.s32 %r3957, %r652, %r1;
- add.s32 %r3958, %r3957, 32;
- mul.hi.s32 %r3959, %r3958, -1840700269;
- add.s32 %r3960, %r3959, %r3958;
- shr.u32 %r3961, %r3960, 31;
- shr.s32 %r3962, %r3960, 2;
- add.s32 %r3963, %r3962, %r3961;
- mul.lo.s32 %r3964, %r3963, 7;
- sub.s32 %r3965, %r3958, %r3964;
- mad.lo.s32 %r3966, %r3963, %r2615, %r650;
- mad.lo.s32 %r3967, %r3965, %r2616, %r3966;
- mul.wide.s32 %rd1123, %r3967, 4;
- add.s64 %rd1124, %rd3, %rd1123;
- ld.global.f32 %f5337, [%rd1124];
-
-$L__BB0_490:
- mov.u32 %r3969, -6;
- sub.s32 %r653, %r3969, %r12;
- setp.ge.s32 %p434, %r3912, %r653;
- add.s32 %r654, %r650, %r2613;
- @%p434 bra $L__BB0_493;
-
- shl.b32 %r655, %r12, 5;
- neg.s32 %r3970, %r655;
- setp.ge.s32 %p435, %r14, %r3970;
- @%p435 bra $L__BB0_493;
-
- add.s32 %r3971, %r655, %r1;
- mul.hi.s32 %r3972, %r3971, -1840700269;
- add.s32 %r3973, %r3972, %r3971;
- shr.u32 %r3974, %r3973, 31;
- shr.s32 %r3975, %r3973, 2;
- add.s32 %r3976, %r3975, %r3974;
- mul.lo.s32 %r3977, %r3976, 7;
- sub.s32 %r3978, %r3971, %r3977;
- mad.lo.s32 %r3979, %r3976, %r2615, %r654;
- mad.lo.s32 %r3980, %r3978, %r2616, %r3979;
- mul.wide.s32 %rd1125, %r3980, 4;
- add.s64 %rd1126, %rd3, %rd1125;
- ld.global.f32 %f5336, [%rd1126];
-
-$L__BB0_493:
- @%p434 bra $L__BB0_496;
-
- shl.b32 %r656, %r12, 5;
- mov.u32 %r3982, -32;
- sub.s32 %r3983, %r3982, %r656;
- setp.ge.s32 %p437, %r14, %r3983;
- @%p437 bra $L__BB0_496;
-
- add.s32 %r3984, %r656, %r1;
- add.s32 %r3985, %r3984, 32;
- mul.hi.s32 %r3986, %r3985, -1840700269;
- add.s32 %r3987, %r3986, %r3985;
- shr.u32 %r3988, %r3987, 31;
- shr.s32 %r3989, %r3987, 2;
- add.s32 %r3990, %r3989, %r3988;
- mul.lo.s32 %r3991, %r3990, 7;
- sub.s32 %r3992, %r3985, %r3991;
- mad.lo.s32 %r3993, %r3990, %r2615, %r654;
- mad.lo.s32 %r3994, %r3992, %r2616, %r3993;
- mul.wide.s32 %rd1127, %r3994, 4;
- add.s64 %rd1128, %rd3, %rd1127;
- ld.global.f32 %f5335, [%rd1128];
-
-$L__BB0_496:
- mov.u32 %r3996, -7;
- sub.s32 %r657, %r3996, %r12;
- setp.ge.s32 %p438, %r3912, %r657;
- add.s32 %r658, %r654, %r2613;
- @%p438 bra $L__BB0_499;
-
- shl.b32 %r659, %r12, 5;
- neg.s32 %r3997, %r659;
- setp.ge.s32 %p439, %r14, %r3997;
- @%p439 bra $L__BB0_499;
-
- add.s32 %r3998, %r659, %r1;
- mul.hi.s32 %r3999, %r3998, -1840700269;
- add.s32 %r4000, %r3999, %r3998;
- shr.u32 %r4001, %r4000, 31;
- shr.s32 %r4002, %r4000, 2;
- add.s32 %r4003, %r4002, %r4001;
- mul.lo.s32 %r4004, %r4003, 7;
- sub.s32 %r4005, %r3998, %r4004;
- mad.lo.s32 %r4006, %r4003, %r2615, %r658;
- mad.lo.s32 %r4007, %r4005, %r2616, %r4006;
- mul.wide.s32 %rd1129, %r4007, 4;
- add.s64 %rd1130, %rd3, %rd1129;
- ld.global.f32 %f5334, [%rd1130];
-
-$L__BB0_499:
- @%p438 bra $L__BB0_502;
-
- shl.b32 %r660, %r12, 5;
- mov.u32 %r4009, -32;
- sub.s32 %r4010, %r4009, %r660;
- setp.ge.s32 %p441, %r14, %r4010;
- @%p441 bra $L__BB0_502;
-
- add.s32 %r4011, %r660, %r1;
- add.s32 %r4012, %r4011, 32;
- mul.hi.s32 %r4013, %r4012, -1840700269;
- add.s32 %r4014, %r4013, %r4012;
- shr.u32 %r4015, %r4014, 31;
- shr.s32 %r4016, %r4014, 2;
- add.s32 %r4017, %r4016, %r4015;
- mul.lo.s32 %r4018, %r4017, 7;
- sub.s32 %r4019, %r4012, %r4018;
- mad.lo.s32 %r4020, %r4017, %r2615, %r658;
- mad.lo.s32 %r4021, %r4019, %r2616, %r4020;
- mul.wide.s32 %rd1131, %r4021, 4;
- add.s64 %rd1132, %rd3, %rd1131;
- ld.global.f32 %f5333, [%rd1132];
-
-$L__BB0_502:
- mul.lo.s32 %r661, %r2604, %r643;
- add.s32 %r4023, %r2602, %r661;
- add.s32 %r4024, %r13, 1;
- mul.lo.s32 %r662, %r4024, %r2603;
- add.s32 %r663, %r4023, %r662;
- @%p426 bra $L__BB0_505;
-
- shl.b32 %r664, %r12, 5;
- neg.s32 %r4025, %r664;
- setp.ge.s32 %p443, %r14, %r4025;
- @%p443 bra $L__BB0_505;
-
- add.s32 %r4026, %r664, %r1;
- mul.hi.s32 %r4027, %r4026, 954437177;
- shr.u32 %r4028, %r4027, 31;
- shr.s32 %r4029, %r4027, 1;
- add.s32 %r4030, %r4029, %r4028;
- mul.lo.s32 %r4031, %r4030, 9;
- sub.s32 %r4032, %r4026, %r4031;
- mad.lo.s32 %r4033, %r4030, %r2605, %r663;
- mad.lo.s32 %r4034, %r4032, %r2606, %r4033;
- mul.wide.s32 %rd1133, %r4034, 4;
- add.s64 %rd1134, %rd2, %rd1133;
- ld.global.f32 %f5348, [%rd1134];
-
-$L__BB0_505:
- @%p426 bra $L__BB0_508;
-
- shl.b32 %r665, %r12, 5;
- mov.u32 %r4036, -32;
- sub.s32 %r4037, %r4036, %r665;
- setp.ge.s32 %p445, %r14, %r4037;
- @%p445 bra $L__BB0_508;
-
- add.s32 %r4038, %r665, %r1;
- add.s32 %r4039, %r4038, 32;
- mul.hi.s32 %r4040, %r4039, 954437177;
- shr.u32 %r4041, %r4040, 31;
- shr.s32 %r4042, %r4040, 1;
- add.s32 %r4043, %r4042, %r4041;
- mul.lo.s32 %r4044, %r4043, 9;
- sub.s32 %r4045, %r4039, %r4044;
- mad.lo.s32 %r4046, %r4043, %r2605, %r663;
- mad.lo.s32 %r4047, %r4045, %r2606, %r4046;
- mul.wide.s32 %rd1135, %r4047, 4;
- add.s64 %rd1136, %rd2, %rd1135;
- ld.global.f32 %f5347, [%rd1136];
-
-$L__BB0_508:
- add.s32 %r666, %r663, %r2603;
- @%p430 bra $L__BB0_511;
-
- shl.b32 %r667, %r12, 5;
- neg.s32 %r4049, %r667;
- setp.ge.s32 %p447, %r14, %r4049;
- @%p447 bra $L__BB0_511;
-
- add.s32 %r4050, %r667, %r1;
- mul.hi.s32 %r4051, %r4050, 954437177;
- shr.u32 %r4052, %r4051, 31;
- shr.s32 %r4053, %r4051, 1;
- add.s32 %r4054, %r4053, %r4052;
- mul.lo.s32 %r4055, %r4054, 9;
- sub.s32 %r4056, %r4050, %r4055;
- mad.lo.s32 %r4057, %r4054, %r2605, %r666;
- mad.lo.s32 %r4058, %r4056, %r2606, %r4057;
- mul.wide.s32 %rd1137, %r4058, 4;
- add.s64 %rd1138, %rd2, %rd1137;
- ld.global.f32 %f5346, [%rd1138];
-
-$L__BB0_511:
- @%p430 bra $L__BB0_514;
-
- shl.b32 %r668, %r12, 5;
- mov.u32 %r4060, -32;
- sub.s32 %r4061, %r4060, %r668;
- setp.ge.s32 %p449, %r14, %r4061;
- @%p449 bra $L__BB0_514;
-
- add.s32 %r4062, %r668, %r1;
- add.s32 %r4063, %r4062, 32;
- mul.hi.s32 %r4064, %r4063, 954437177;
- shr.u32 %r4065, %r4064, 31;
- shr.s32 %r4066, %r4064, 1;
- add.s32 %r4067, %r4066, %r4065;
- mul.lo.s32 %r4068, %r4067, 9;
- sub.s32 %r4069, %r4063, %r4068;
- mad.lo.s32 %r4070, %r4067, %r2605, %r666;
- mad.lo.s32 %r4071, %r4069, %r2606, %r4070;
- mul.wide.s32 %rd1139, %r4071, 4;
- add.s64 %rd1140, %rd2, %rd1139;
- ld.global.f32 %f5345, [%rd1140];
-
-$L__BB0_514:
- shl.b32 %r4073, %r2602, 1;
- add.s32 %r669, %r4073, %r661;
- mad.lo.s32 %r670, %r13, %r2603, %r669;
- @%p434 bra $L__BB0_517;
-
- shl.b32 %r671, %r12, 5;
- neg.s32 %r4074, %r671;
- setp.ge.s32 %p451, %r14, %r4074;
- @%p451 bra $L__BB0_517;
-
- add.s32 %r4075, %r671, %r1;
- mul.hi.s32 %r4076, %r4075, 954437177;
- shr.u32 %r4077, %r4076, 31;
- shr.s32 %r4078, %r4076, 1;
- add.s32 %r4079, %r4078, %r4077;
- mul.lo.s32 %r4080, %r4079, 9;
- sub.s32 %r4081, %r4075, %r4080;
- mad.lo.s32 %r4082, %r4079, %r2605, %r670;
- mad.lo.s32 %r4083, %r4081, %r2606, %r4082;
- mul.wide.s32 %rd1141, %r4083, 4;
- add.s64 %rd1142, %rd2, %rd1141;
- ld.global.f32 %f5344, [%rd1142];
-
-$L__BB0_517:
- @%p434 bra $L__BB0_520;
-
- shl.b32 %r672, %r12, 5;
- mov.u32 %r4085, -32;
- sub.s32 %r4086, %r4085, %r672;
- setp.ge.s32 %p453, %r14, %r4086;
- @%p453 bra $L__BB0_520;
-
- add.s32 %r4087, %r672, %r1;
- add.s32 %r4088, %r4087, 32;
- mul.hi.s32 %r4089, %r4088, 954437177;
- shr.u32 %r4090, %r4089, 31;
- shr.s32 %r4091, %r4089, 1;
- add.s32 %r4092, %r4091, %r4090;
- mul.lo.s32 %r4093, %r4092, 9;
- sub.s32 %r4094, %r4088, %r4093;
- mad.lo.s32 %r4095, %r4092, %r2605, %r670;
- mad.lo.s32 %r4096, %r4094, %r2606, %r4095;
- mul.wide.s32 %rd1143, %r4096, 4;
- add.s64 %rd1144, %rd2, %rd1143;
- ld.global.f32 %f5343, [%rd1144];
-
-$L__BB0_520:
- add.s32 %r673, %r669, %r662;
- @%p438 bra $L__BB0_523;
-
- shl.b32 %r674, %r12, 5;
- neg.s32 %r4098, %r674;
- setp.ge.s32 %p455, %r14, %r4098;
- @%p455 bra $L__BB0_523;
-
- add.s32 %r4099, %r674, %r1;
- mul.hi.s32 %r4100, %r4099, 954437177;
- shr.u32 %r4101, %r4100, 31;
- shr.s32 %r4102, %r4100, 1;
- add.s32 %r4103, %r4102, %r4101;
- mul.lo.s32 %r4104, %r4103, 9;
- sub.s32 %r4105, %r4099, %r4104;
- mad.lo.s32 %r4106, %r4103, %r2605, %r673;
- mad.lo.s32 %r4107, %r4105, %r2606, %r4106;
- mul.wide.s32 %rd1145, %r4107, 4;
- add.s64 %rd1146, %rd2, %rd1145;
- ld.global.f32 %f5342, [%rd1146];
-
-$L__BB0_523:
- @%p438 bra $L__BB0_526;
-
- shl.b32 %r675, %r12, 5;
- mov.u32 %r4109, -32;
- sub.s32 %r4110, %r4109, %r675;
- setp.ge.s32 %p457, %r14, %r4110;
- @%p457 bra $L__BB0_526;
-
- add.s32 %r4111, %r675, %r1;
- add.s32 %r4112, %r4111, 32;
- mul.hi.s32 %r4113, %r4112, 954437177;
- shr.u32 %r4114, %r4113, 31;
- shr.s32 %r4115, %r4113, 1;
- add.s32 %r4116, %r4115, %r4114;
- mul.lo.s32 %r4117, %r4116, 9;
- sub.s32 %r4118, %r4112, %r4117;
- mad.lo.s32 %r4119, %r4116, %r2605, %r673;
- mad.lo.s32 %r4120, %r4118, %r2606, %r4119;
- mul.wide.s32 %rd1147, %r4120, 4;
- add.s64 %rd1148, %rd2, %rd1147;
- ld.global.f32 %f5341, [%rd1148];
-
-$L__BB0_526:
- @%p426 bra $L__BB0_555;
-
- shl.b32 %r4122, %r12, 5;
- neg.s32 %r676, %r4122;
- setp.ge.s32 %p459, %r14, %r676;
- @%p459 bra $L__BB0_540;
-
- mul.f32 %f2948, %f5348, 0f3F22F983;
- cvt.rni.s32.f32 %r8357, %f2948;
- cvt.rn.f32.s32 %f2949, %r8357;
- mov.f32 %f2950, 0fBFC90FDA;
- fma.rn.f32 %f2951, %f2949, %f2950, %f5348;
- mov.f32 %f2952, 0fB3A22168;
- fma.rn.f32 %f2953, %f2949, %f2952, %f2951;
- mov.f32 %f2954, 0fA7C234C5;
- fma.rn.f32 %f5369, %f2949, %f2954, %f2953;
- abs.f32 %f573, %f5348;
- setp.ltu.f32 %p460, %f573, 0f47CE4780;
- @%p460 bra $L__BB0_536;
-
- setp.eq.f32 %p461, %f573, 0f7F800000;
- @%p461 bra $L__BB0_535;
- bra.uni $L__BB0_530;
-
-$L__BB0_535:
- mov.f32 %f2957, 0f00000000;
- mul.rn.f32 %f5369, %f5348, %f2957;
- mov.u32 %r8357, 0;
- bra.uni $L__BB0_536;
-
-$L__BB0_760:
- mov.b32 %r981, %f847;
- shr.u32 %r4713, %r981, 23;
- and.b32 %r4714, %r4713, 255;
- add.s32 %r982, %r4714, -128;
- shl.b32 %r4715, %r981, 8;
- or.b32 %r983, %r4715, -2147483648;
- shr.u32 %r984, %r982, 5;
- mov.u64 %rd2594, 0;
- mov.u32 %r8418, 0;
- mov.u64 %rd1405, __cudart_i2opi_f;
- mov.u64 %rd2595, %rd2594;
-
-$L__BB0_761:
- .pragma "nounroll";
- shl.b64 %rd1404, %rd2594, 2;
- add.s64 %rd1406, %rd1405, %rd1404;
- ld.global.nc.u32 %r4716, [%rd1406];
- mad.wide.u32 %rd1407, %r4716, %r983, %rd2595;
- shr.u64 %rd2595, %rd1407, 32;
- add.s64 %rd1408, %rd1, %rd1404;
- st.local.u32 [%rd1408], %rd1407;
- add.s32 %r8418, %r8418, 1;
- cvt.s64.s32 %rd2594, %r8418;
- setp.ne.s32 %p660, %r8418, 6;
- @%p660 bra $L__BB0_761;
-
- st.local.u32 [%rd5], %rd2595;
- mov.u32 %r4717, 4;
- sub.s32 %r987, %r4717, %r984;
- mov.u32 %r4718, 6;
- sub.s32 %r4719, %r4718, %r984;
- mul.wide.s32 %rd1409, %r4719, 4;
- add.s64 %rd1410, %rd1, %rd1409;
- ld.local.u32 %r8419, [%rd1410];
- ld.local.u32 %r8420, [%rd1410+-4];
- and.b32 %r990, %r982, 31;
- setp.eq.s32 %p661, %r990, 0;
- @%p661 bra $L__BB0_764;
-
- mov.u32 %r4720, 32;
- sub.s32 %r4721, %r4720, %r990;
- shr.u32 %r4722, %r8420, %r4721;
- shl.b32 %r4723, %r8419, %r990;
- add.s32 %r8419, %r4722, %r4723;
- mul.wide.s32 %rd1411, %r987, 4;
- add.s64 %rd1412, %rd1, %rd1411;
- ld.local.u32 %r4724, [%rd1412];
- shr.u32 %r4725, %r4724, %r4721;
- shl.b32 %r4726, %r8420, %r990;
- add.s32 %r8420, %r4725, %r4726;
-
-$L__BB0_764:
- and.b32 %r4727, %r981, -2147483648;
- shr.u32 %r4728, %r8420, 30;
- shl.b32 %r4729, %r8419, 2;
- or.b32 %r4730, %r4728, %r4729;
- shr.u32 %r4731, %r4730, 31;
- shr.u32 %r4732, %r8419, 30;
- add.s32 %r4733, %r4731, %r4732;
- neg.s32 %r4734, %r4733;
- setp.eq.s32 %p662, %r4727, 0;
- selp.b32 %r8421, %r4733, %r4734, %p662;
- setp.ne.s32 %p663, %r4731, 0;
- xor.b32 %r4735, %r4727, -2147483648;
- selp.b32 %r4736, %r4735, %r4727, %p663;
- selp.b32 %r4737, -1, 0, %p663;
- xor.b32 %r4738, %r4730, %r4737;
- shl.b32 %r4739, %r8420, 2;
- xor.b32 %r4740, %r4739, %r4737;
- cvt.u64.u32 %rd1413, %r4738;
- cvt.u64.u32 %rd1414, %r4740;
- bfi.b64 %rd1415, %rd1413, %rd1414, 32, 32;
- cvt.rn.f64.s64 %fd97, %rd1415;
- mul.f64 %fd98, %fd97, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3306, %fd98;
- setp.eq.s32 %p664, %r4736, 0;
- neg.f32 %f3307, %f3306;
- selp.f32 %f5468, %f3306, %f3307, %p664;
-
-$L__BB0_766:
- and.b32 %r997, %r8421, 1;
- setp.eq.s32 %p665, %r997, 0;
- selp.f32 %f860, %f5468, 0f3F800000, %p665;
- mul.rn.f32 %f861, %f5468, %f5468;
- mov.f32 %f5469, 0fB94D4153;
- @%p665 bra $L__BB0_768;
-
- mov.f32 %f3310, 0fBAB607ED;
- mov.f32 %f3311, 0f37CBAC00;
- fma.rn.f32 %f5469, %f3311, %f861, %f3310;
-
-$L__BB0_768:
- selp.f32 %f3312, 0f3C0885E4, 0f3D2AAABB, %p665;
- fma.rn.f32 %f3313, %f5469, %f861, %f3312;
- selp.f32 %f3314, 0fBE2AAAA8, 0fBEFFFFFF, %p665;
- fma.rn.f32 %f3315, %f3313, %f861, %f3314;
- mov.f32 %f3316, 0f00000000;
- fma.rn.f32 %f3317, %f861, %f860, %f3316;
- fma.rn.f32 %f5470, %f3315, %f3317, %f860;
- and.b32 %r4742, %r8421, 2;
- setp.eq.s32 %p667, %r4742, 0;
- @%p667 bra $L__BB0_770;
-
- mov.f32 %f3319, 0fBF800000;
- fma.rn.f32 %f5470, %f5470, %f3319, %f3316;
-
-$L__BB0_770:
- mul.f32 %f3320, %f839, 0f3F22F983;
- cvt.rni.s32.f32 %r8425, %f3320;
- cvt.rn.f32.s32 %f3321, %r8425;
- mov.f32 %f3322, 0fBFC90FDA;
- fma.rn.f32 %f3323, %f3321, %f3322, %f839;
- mov.f32 %f3324, 0fB3A22168;
- fma.rn.f32 %f3325, %f3321, %f3324, %f3323;
- mov.f32 %f3326, 0fA7C234C5;
- fma.rn.f32 %f5471, %f3321, %f3326, %f3325;
- abs.f32 %f868, %f839;
- setp.ltu.f32 %p668, %f868, 0f47CE4780;
- @%p668 bra $L__BB0_778;
-
- setp.eq.f32 %p669, %f868, 0f7F800000;
- @%p669 bra $L__BB0_777;
- bra.uni $L__BB0_772;
-
-$L__BB0_777:
- mov.f32 %f3329, 0f00000000;
- mul.rn.f32 %f5471, %f839, %f3329;
- mov.u32 %r8425, 0;
- bra.uni $L__BB0_778;
-
-$L__BB0_772:
- mov.b32 %r999, %f839;
- shr.u32 %r4744, %r999, 23;
- and.b32 %r4745, %r4744, 255;
- add.s32 %r1000, %r4745, -128;
- shl.b32 %r4746, %r999, 8;
- or.b32 %r1001, %r4746, -2147483648;
- shr.u32 %r1002, %r1000, 5;
- mov.u64 %rd2596, 0;
- mov.u32 %r8422, 0;
- mov.u64 %rd1419, __cudart_i2opi_f;
- mov.u64 %rd2597, %rd2596;
-
-$L__BB0_773:
- .pragma "nounroll";
- shl.b64 %rd1418, %rd2596, 2;
- add.s64 %rd1420, %rd1419, %rd1418;
- ld.global.nc.u32 %r4747, [%rd1420];
- mad.wide.u32 %rd1421, %r4747, %r1001, %rd2597;
- shr.u64 %rd2597, %rd1421, 32;
- add.s64 %rd1422, %rd1, %rd1418;
- st.local.u32 [%rd1422], %rd1421;
- add.s32 %r8422, %r8422, 1;
- cvt.s64.s32 %rd2596, %r8422;
- setp.ne.s32 %p670, %r8422, 6;
- @%p670 bra $L__BB0_773;
-
- st.local.u32 [%rd5], %rd2597;
- mov.u32 %r4748, 4;
- sub.s32 %r1005, %r4748, %r1002;
- mov.u32 %r4749, 6;
- sub.s32 %r4750, %r4749, %r1002;
- mul.wide.s32 %rd1423, %r4750, 4;
- add.s64 %rd1424, %rd1, %rd1423;
- ld.local.u32 %r8423, [%rd1424];
- ld.local.u32 %r8424, [%rd1424+-4];
- and.b32 %r1008, %r1000, 31;
- setp.eq.s32 %p671, %r1008, 0;
- @%p671 bra $L__BB0_776;
-
- mov.u32 %r4751, 32;
- sub.s32 %r4752, %r4751, %r1008;
- shr.u32 %r4753, %r8424, %r4752;
- shl.b32 %r4754, %r8423, %r1008;
- add.s32 %r8423, %r4753, %r4754;
- mul.wide.s32 %rd1425, %r1005, 4;
- add.s64 %rd1426, %rd1, %rd1425;
- ld.local.u32 %r4755, [%rd1426];
- shr.u32 %r4756, %r4755, %r4752;
- shl.b32 %r4757, %r8424, %r1008;
- add.s32 %r8424, %r4756, %r4757;
-
-$L__BB0_776:
- and.b32 %r4758, %r999, -2147483648;
- shr.u32 %r4759, %r8424, 30;
- shl.b32 %r4760, %r8423, 2;
- or.b32 %r4761, %r4759, %r4760;
- shr.u32 %r4762, %r4761, 31;
- shr.u32 %r4763, %r8423, 30;
- add.s32 %r4764, %r4762, %r4763;
- neg.s32 %r4765, %r4764;
- setp.eq.s32 %p672, %r4758, 0;
- selp.b32 %r8425, %r4764, %r4765, %p672;
- setp.ne.s32 %p673, %r4762, 0;
- xor.b32 %r4766, %r4758, -2147483648;
- selp.b32 %r4767, %r4766, %r4758, %p673;
- selp.b32 %r4768, -1, 0, %p673;
- xor.b32 %r4769, %r4761, %r4768;
- shl.b32 %r4770, %r8424, 2;
- xor.b32 %r4771, %r4770, %r4768;
- cvt.u64.u32 %rd1427, %r4769;
- cvt.u64.u32 %rd1428, %r4771;
- bfi.b64 %rd1429, %rd1427, %rd1428, 32, 32;
- cvt.rn.f64.s64 %fd99, %rd1429;
- mul.f64 %fd100, %fd99, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3327, %fd100;
- setp.eq.s32 %p674, %r4767, 0;
- neg.f32 %f3328, %f3327;
- selp.f32 %f5471, %f3327, %f3328, %p674;
-
-$L__BB0_778:
- add.s32 %r1015, %r8425, 1;
- and.b32 %r1016, %r1015, 1;
- setp.eq.s32 %p675, %r1016, 0;
- selp.f32 %f872, %f5471, 0f3F800000, %p675;
- mul.rn.f32 %f873, %f5471, %f5471;
- mov.f32 %f5472, 0fB94D4153;
- @%p675 bra $L__BB0_780;
-
- mov.f32 %f3331, 0fBAB607ED;
- mov.f32 %f3332, 0f37CBAC00;
- fma.rn.f32 %f5472, %f3332, %f873, %f3331;
-
-$L__BB0_780:
- selp.f32 %f3333, 0f3C0885E4, 0f3D2AAABB, %p675;
- fma.rn.f32 %f3334, %f5472, %f873, %f3333;
- selp.f32 %f3335, 0fBE2AAAA8, 0fBEFFFFFF, %p675;
- fma.rn.f32 %f3336, %f3334, %f873, %f3335;
- mov.f32 %f3337, 0f00000000;
- fma.rn.f32 %f3338, %f873, %f872, %f3337;
- fma.rn.f32 %f5473, %f3336, %f3338, %f872;
- and.b32 %r4773, %r1015, 2;
- setp.eq.s32 %p677, %r4773, 0;
- @%p677 bra $L__BB0_782;
-
- mov.f32 %f3340, 0fBF800000;
- fma.rn.f32 %f5473, %f5473, %f3340, %f3337;
-
-$L__BB0_782:
- add.f32 %f5523, %f5470, %f5473;
- mul.f32 %f3341, %f848, 0f3F22F983;
- cvt.rni.s32.f32 %r8429, %f3341;
- cvt.rn.f32.s32 %f3342, %r8429;
- mov.f32 %f3343, 0fBFC90FDA;
- fma.rn.f32 %f3344, %f3342, %f3343, %f848;
- mov.f32 %f3345, 0fB3A22168;
- fma.rn.f32 %f3346, %f3342, %f3345, %f3344;
- mov.f32 %f3347, 0fA7C234C5;
- fma.rn.f32 %f5474, %f3342, %f3347, %f3346;
- abs.f32 %f881, %f848;
- setp.ltu.f32 %p678, %f881, 0f47CE4780;
- @%p678 bra $L__BB0_790;
-
- setp.eq.f32 %p679, %f881, 0f7F800000;
- @%p679 bra $L__BB0_789;
- bra.uni $L__BB0_784;
-
-$L__BB0_789:
- mov.f32 %f3350, 0f00000000;
- mul.rn.f32 %f5474, %f848, %f3350;
- mov.u32 %r8429, 0;
- bra.uni $L__BB0_790;
-
-$L__BB0_784:
- mov.b32 %r1018, %f848;
- shr.u32 %r4775, %r1018, 23;
- and.b32 %r4776, %r4775, 255;
- add.s32 %r1019, %r4776, -128;
- shl.b32 %r4777, %r1018, 8;
- or.b32 %r1020, %r4777, -2147483648;
- shr.u32 %r1021, %r1019, 5;
- mov.u64 %rd2598, 0;
- mov.u32 %r8426, 0;
- mov.u64 %rd1433, __cudart_i2opi_f;
- mov.u64 %rd2599, %rd2598;
-
-$L__BB0_785:
- .pragma "nounroll";
- shl.b64 %rd1432, %rd2598, 2;
- add.s64 %rd1434, %rd1433, %rd1432;
- ld.global.nc.u32 %r4778, [%rd1434];
- mad.wide.u32 %rd1435, %r4778, %r1020, %rd2599;
- shr.u64 %rd2599, %rd1435, 32;
- add.s64 %rd1436, %rd1, %rd1432;
- st.local.u32 [%rd1436], %rd1435;
- add.s32 %r8426, %r8426, 1;
- cvt.s64.s32 %rd2598, %r8426;
- setp.ne.s32 %p680, %r8426, 6;
- @%p680 bra $L__BB0_785;
-
- st.local.u32 [%rd5], %rd2599;
- mov.u32 %r4779, 4;
- sub.s32 %r1024, %r4779, %r1021;
- mov.u32 %r4780, 6;
- sub.s32 %r4781, %r4780, %r1021;
- mul.wide.s32 %rd1437, %r4781, 4;
- add.s64 %rd1438, %rd1, %rd1437;
- ld.local.u32 %r8427, [%rd1438];
- ld.local.u32 %r8428, [%rd1438+-4];
- and.b32 %r1027, %r1019, 31;
- setp.eq.s32 %p681, %r1027, 0;
- @%p681 bra $L__BB0_788;
-
- mov.u32 %r4782, 32;
- sub.s32 %r4783, %r4782, %r1027;
- shr.u32 %r4784, %r8428, %r4783;
- shl.b32 %r4785, %r8427, %r1027;
- add.s32 %r8427, %r4784, %r4785;
- mul.wide.s32 %rd1439, %r1024, 4;
- add.s64 %rd1440, %rd1, %rd1439;
- ld.local.u32 %r4786, [%rd1440];
- shr.u32 %r4787, %r4786, %r4783;
- shl.b32 %r4788, %r8428, %r1027;
- add.s32 %r8428, %r4787, %r4788;
-
-$L__BB0_788:
- and.b32 %r4789, %r1018, -2147483648;
- shr.u32 %r4790, %r8428, 30;
- shl.b32 %r4791, %r8427, 2;
- or.b32 %r4792, %r4790, %r4791;
- shr.u32 %r4793, %r4792, 31;
- shr.u32 %r4794, %r8427, 30;
- add.s32 %r4795, %r4793, %r4794;
- neg.s32 %r4796, %r4795;
- setp.eq.s32 %p682, %r4789, 0;
- selp.b32 %r8429, %r4795, %r4796, %p682;
- setp.ne.s32 %p683, %r4793, 0;
- xor.b32 %r4797, %r4789, -2147483648;
- selp.b32 %r4798, %r4797, %r4789, %p683;
- selp.b32 %r4799, -1, 0, %p683;
- xor.b32 %r4800, %r4792, %r4799;
- shl.b32 %r4801, %r8428, 2;
- xor.b32 %r4802, %r4801, %r4799;
- cvt.u64.u32 %rd1441, %r4800;
- cvt.u64.u32 %rd1442, %r4802;
- bfi.b64 %rd1443, %rd1441, %rd1442, 32, 32;
- cvt.rn.f64.s64 %fd101, %rd1443;
- mul.f64 %fd102, %fd101, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3348, %fd102;
- setp.eq.s32 %p684, %r4798, 0;
- neg.f32 %f3349, %f3348;
- selp.f32 %f5474, %f3348, %f3349, %p684;
-
-$L__BB0_790:
- and.b32 %r1034, %r8429, 1;
- setp.eq.s32 %p685, %r1034, 0;
- selp.f32 %f885, %f5474, 0f3F800000, %p685;
- mul.rn.f32 %f886, %f5474, %f5474;
- mov.f32 %f5475, 0fB94D4153;
- @%p685 bra $L__BB0_792;
-
- mov.f32 %f3352, 0fBAB607ED;
- mov.f32 %f3353, 0f37CBAC00;
- fma.rn.f32 %f5475, %f3353, %f886, %f3352;
-
-$L__BB0_792:
- selp.f32 %f3354, 0f3C0885E4, 0f3D2AAABB, %p685;
- fma.rn.f32 %f3355, %f5475, %f886, %f3354;
- selp.f32 %f3356, 0fBE2AAAA8, 0fBEFFFFFF, %p685;
- fma.rn.f32 %f3357, %f3355, %f886, %f3356;
- mov.f32 %f3358, 0f00000000;
- fma.rn.f32 %f3359, %f886, %f885, %f3358;
- fma.rn.f32 %f5476, %f3357, %f3359, %f885;
- and.b32 %r4804, %r8429, 2;
- setp.eq.s32 %p687, %r4804, 0;
- @%p687 bra $L__BB0_794;
-
- mov.f32 %f3361, 0fBF800000;
- fma.rn.f32 %f5476, %f5476, %f3361, %f3358;
-
-$L__BB0_794:
- mul.f32 %f3362, %f840, 0f3F22F983;
- cvt.rni.s32.f32 %r8433, %f3362;
- cvt.rn.f32.s32 %f3363, %r8433;
- mov.f32 %f3364, 0fBFC90FDA;
- fma.rn.f32 %f3365, %f3363, %f3364, %f840;
- mov.f32 %f3366, 0fB3A22168;
- fma.rn.f32 %f3367, %f3363, %f3366, %f3365;
- mov.f32 %f3368, 0fA7C234C5;
- fma.rn.f32 %f5477, %f3363, %f3368, %f3367;
- abs.f32 %f893, %f840;
- setp.ltu.f32 %p688, %f893, 0f47CE4780;
- @%p688 bra $L__BB0_802;
-
- setp.eq.f32 %p689, %f893, 0f7F800000;
- @%p689 bra $L__BB0_801;
- bra.uni $L__BB0_796;
-
-$L__BB0_801:
- mov.f32 %f3371, 0f00000000;
- mul.rn.f32 %f5477, %f840, %f3371;
- mov.u32 %r8433, 0;
- bra.uni $L__BB0_802;
-
-$L__BB0_796:
- mov.b32 %r1036, %f840;
- shr.u32 %r4806, %r1036, 23;
- and.b32 %r4807, %r4806, 255;
- add.s32 %r1037, %r4807, -128;
- shl.b32 %r4808, %r1036, 8;
- or.b32 %r1038, %r4808, -2147483648;
- shr.u32 %r1039, %r1037, 5;
- mov.u64 %rd2600, 0;
- mov.u32 %r8430, 0;
- mov.u64 %rd1447, __cudart_i2opi_f;
- mov.u64 %rd2601, %rd2600;
-
-$L__BB0_797:
- .pragma "nounroll";
- shl.b64 %rd1446, %rd2600, 2;
- add.s64 %rd1448, %rd1447, %rd1446;
- ld.global.nc.u32 %r4809, [%rd1448];
- mad.wide.u32 %rd1449, %r4809, %r1038, %rd2601;
- shr.u64 %rd2601, %rd1449, 32;
- add.s64 %rd1450, %rd1, %rd1446;
- st.local.u32 [%rd1450], %rd1449;
- add.s32 %r8430, %r8430, 1;
- cvt.s64.s32 %rd2600, %r8430;
- setp.ne.s32 %p690, %r8430, 6;
- @%p690 bra $L__BB0_797;
-
- st.local.u32 [%rd5], %rd2601;
- mov.u32 %r4810, 4;
- sub.s32 %r1042, %r4810, %r1039;
- mov.u32 %r4811, 6;
- sub.s32 %r4812, %r4811, %r1039;
- mul.wide.s32 %rd1451, %r4812, 4;
- add.s64 %rd1452, %rd1, %rd1451;
- ld.local.u32 %r8431, [%rd1452];
- ld.local.u32 %r8432, [%rd1452+-4];
- and.b32 %r1045, %r1037, 31;
- setp.eq.s32 %p691, %r1045, 0;
- @%p691 bra $L__BB0_800;
-
- mov.u32 %r4813, 32;
- sub.s32 %r4814, %r4813, %r1045;
- shr.u32 %r4815, %r8432, %r4814;
- shl.b32 %r4816, %r8431, %r1045;
- add.s32 %r8431, %r4815, %r4816;
- mul.wide.s32 %rd1453, %r1042, 4;
- add.s64 %rd1454, %rd1, %rd1453;
- ld.local.u32 %r4817, [%rd1454];
- shr.u32 %r4818, %r4817, %r4814;
- shl.b32 %r4819, %r8432, %r1045;
- add.s32 %r8432, %r4818, %r4819;
-
-$L__BB0_800:
- and.b32 %r4820, %r1036, -2147483648;
- shr.u32 %r4821, %r8432, 30;
- shl.b32 %r4822, %r8431, 2;
- or.b32 %r4823, %r4821, %r4822;
- shr.u32 %r4824, %r4823, 31;
- shr.u32 %r4825, %r8431, 30;
- add.s32 %r4826, %r4824, %r4825;
- neg.s32 %r4827, %r4826;
- setp.eq.s32 %p692, %r4820, 0;
- selp.b32 %r8433, %r4826, %r4827, %p692;
- setp.ne.s32 %p693, %r4824, 0;
- xor.b32 %r4828, %r4820, -2147483648;
- selp.b32 %r4829, %r4828, %r4820, %p693;
- selp.b32 %r4830, -1, 0, %p693;
- xor.b32 %r4831, %r4823, %r4830;
- shl.b32 %r4832, %r8432, 2;
- xor.b32 %r4833, %r4832, %r4830;
- cvt.u64.u32 %rd1455, %r4831;
- cvt.u64.u32 %rd1456, %r4833;
- bfi.b64 %rd1457, %rd1455, %rd1456, 32, 32;
- cvt.rn.f64.s64 %fd103, %rd1457;
- mul.f64 %fd104, %fd103, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3369, %fd104;
- setp.eq.s32 %p694, %r4829, 0;
- neg.f32 %f3370, %f3369;
- selp.f32 %f5477, %f3369, %f3370, %p694;
-
-$L__BB0_802:
- add.s32 %r1052, %r8433, 1;
- and.b32 %r1053, %r1052, 1;
- setp.eq.s32 %p695, %r1053, 0;
- selp.f32 %f897, %f5477, 0f3F800000, %p695;
- mul.rn.f32 %f898, %f5477, %f5477;
- mov.f32 %f5478, 0fB94D4153;
- @%p695 bra $L__BB0_804;
-
- mov.f32 %f3373, 0fBAB607ED;
- mov.f32 %f3374, 0f37CBAC00;
- fma.rn.f32 %f5478, %f3374, %f898, %f3373;
-
-$L__BB0_804:
- selp.f32 %f3375, 0f3C0885E4, 0f3D2AAABB, %p695;
- fma.rn.f32 %f3376, %f5478, %f898, %f3375;
- selp.f32 %f3377, 0fBE2AAAA8, 0fBEFFFFFF, %p695;
- fma.rn.f32 %f3378, %f3376, %f898, %f3377;
- mov.f32 %f3379, 0f00000000;
- fma.rn.f32 %f3380, %f898, %f897, %f3379;
- fma.rn.f32 %f5479, %f3378, %f3380, %f897;
- and.b32 %r4835, %r1052, 2;
- setp.eq.s32 %p697, %r4835, 0;
- @%p697 bra $L__BB0_806;
-
- mov.f32 %f3382, 0fBF800000;
- fma.rn.f32 %f5479, %f5479, %f3382, %f3379;
-
-$L__BB0_806:
- add.f32 %f5522, %f5476, %f5479;
- mul.f32 %f3383, %f849, 0f3F22F983;
- cvt.rni.s32.f32 %r8437, %f3383;
- cvt.rn.f32.s32 %f3384, %r8437;
- mov.f32 %f3385, 0fBFC90FDA;
- fma.rn.f32 %f3386, %f3384, %f3385, %f849;
- mov.f32 %f3387, 0fB3A22168;
- fma.rn.f32 %f3388, %f3384, %f3387, %f3386;
- mov.f32 %f3389, 0fA7C234C5;
- fma.rn.f32 %f5480, %f3384, %f3389, %f3388;
- abs.f32 %f906, %f849;
- setp.ltu.f32 %p698, %f906, 0f47CE4780;
- @%p698 bra $L__BB0_814;
-
- setp.eq.f32 %p699, %f906, 0f7F800000;
- @%p699 bra $L__BB0_813;
- bra.uni $L__BB0_808;
-
-$L__BB0_813:
- mov.f32 %f3392, 0f00000000;
- mul.rn.f32 %f5480, %f849, %f3392;
- mov.u32 %r8437, 0;
- bra.uni $L__BB0_814;
-
-$L__BB0_808:
- mov.b32 %r1055, %f849;
- shr.u32 %r4837, %r1055, 23;
- and.b32 %r4838, %r4837, 255;
- add.s32 %r1056, %r4838, -128;
- shl.b32 %r4839, %r1055, 8;
- or.b32 %r1057, %r4839, -2147483648;
- shr.u32 %r1058, %r1056, 5;
- mov.u64 %rd2602, 0;
- mov.u32 %r8434, 0;
- mov.u64 %rd1461, __cudart_i2opi_f;
- mov.u64 %rd2603, %rd2602;
-
-$L__BB0_809:
- .pragma "nounroll";
- shl.b64 %rd1460, %rd2602, 2;
- add.s64 %rd1462, %rd1461, %rd1460;
- ld.global.nc.u32 %r4840, [%rd1462];
- mad.wide.u32 %rd1463, %r4840, %r1057, %rd2603;
- shr.u64 %rd2603, %rd1463, 32;
- add.s64 %rd1464, %rd1, %rd1460;
- st.local.u32 [%rd1464], %rd1463;
- add.s32 %r8434, %r8434, 1;
- cvt.s64.s32 %rd2602, %r8434;
- setp.ne.s32 %p700, %r8434, 6;
- @%p700 bra $L__BB0_809;
-
- st.local.u32 [%rd5], %rd2603;
- mov.u32 %r4841, 4;
- sub.s32 %r1061, %r4841, %r1058;
- mov.u32 %r4842, 6;
- sub.s32 %r4843, %r4842, %r1058;
- mul.wide.s32 %rd1465, %r4843, 4;
- add.s64 %rd1466, %rd1, %rd1465;
- ld.local.u32 %r8435, [%rd1466];
- ld.local.u32 %r8436, [%rd1466+-4];
- and.b32 %r1064, %r1056, 31;
- setp.eq.s32 %p701, %r1064, 0;
- @%p701 bra $L__BB0_812;
-
- mov.u32 %r4844, 32;
- sub.s32 %r4845, %r4844, %r1064;
- shr.u32 %r4846, %r8436, %r4845;
- shl.b32 %r4847, %r8435, %r1064;
- add.s32 %r8435, %r4846, %r4847;
- mul.wide.s32 %rd1467, %r1061, 4;
- add.s64 %rd1468, %rd1, %rd1467;
- ld.local.u32 %r4848, [%rd1468];
- shr.u32 %r4849, %r4848, %r4845;
- shl.b32 %r4850, %r8436, %r1064;
- add.s32 %r8436, %r4849, %r4850;
-
-$L__BB0_812:
- and.b32 %r4851, %r1055, -2147483648;
- shr.u32 %r4852, %r8436, 30;
- shl.b32 %r4853, %r8435, 2;
- or.b32 %r4854, %r4852, %r4853;
- shr.u32 %r4855, %r4854, 31;
- shr.u32 %r4856, %r8435, 30;
- add.s32 %r4857, %r4855, %r4856;
- neg.s32 %r4858, %r4857;
- setp.eq.s32 %p702, %r4851, 0;
- selp.b32 %r8437, %r4857, %r4858, %p702;
- setp.ne.s32 %p703, %r4855, 0;
- xor.b32 %r4859, %r4851, -2147483648;
- selp.b32 %r4860, %r4859, %r4851, %p703;
- selp.b32 %r4861, -1, 0, %p703;
- xor.b32 %r4862, %r4854, %r4861;
- shl.b32 %r4863, %r8436, 2;
- xor.b32 %r4864, %r4863, %r4861;
- cvt.u64.u32 %rd1469, %r4862;
- cvt.u64.u32 %rd1470, %r4864;
- bfi.b64 %rd1471, %rd1469, %rd1470, 32, 32;
- cvt.rn.f64.s64 %fd105, %rd1471;
- mul.f64 %fd106, %fd105, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3390, %fd106;
- setp.eq.s32 %p704, %r4860, 0;
- neg.f32 %f3391, %f3390;
- selp.f32 %f5480, %f3390, %f3391, %p704;
-
-$L__BB0_814:
- and.b32 %r1071, %r8437, 1;
- setp.eq.s32 %p705, %r1071, 0;
- selp.f32 %f910, %f5480, 0f3F800000, %p705;
- mul.rn.f32 %f911, %f5480, %f5480;
- mov.f32 %f5481, 0fB94D4153;
- @%p705 bra $L__BB0_816;
-
- mov.f32 %f3394, 0fBAB607ED;
- mov.f32 %f3395, 0f37CBAC00;
- fma.rn.f32 %f5481, %f3395, %f911, %f3394;
-
-$L__BB0_816:
- selp.f32 %f3396, 0f3C0885E4, 0f3D2AAABB, %p705;
- fma.rn.f32 %f3397, %f5481, %f911, %f3396;
- selp.f32 %f3398, 0fBE2AAAA8, 0fBEFFFFFF, %p705;
- fma.rn.f32 %f3399, %f3397, %f911, %f3398;
- mov.f32 %f3400, 0f00000000;
- fma.rn.f32 %f3401, %f911, %f910, %f3400;
- fma.rn.f32 %f5482, %f3399, %f3401, %f910;
- and.b32 %r4866, %r8437, 2;
- setp.eq.s32 %p707, %r4866, 0;
- @%p707 bra $L__BB0_818;
-
- mov.f32 %f3403, 0fBF800000;
- fma.rn.f32 %f5482, %f5482, %f3403, %f3400;
-
-$L__BB0_818:
- mul.f32 %f3404, %f841, 0f3F22F983;
- cvt.rni.s32.f32 %r8441, %f3404;
- cvt.rn.f32.s32 %f3405, %r8441;
- mov.f32 %f3406, 0fBFC90FDA;
- fma.rn.f32 %f3407, %f3405, %f3406, %f841;
- mov.f32 %f3408, 0fB3A22168;
- fma.rn.f32 %f3409, %f3405, %f3408, %f3407;
- mov.f32 %f3410, 0fA7C234C5;
- fma.rn.f32 %f5483, %f3405, %f3410, %f3409;
- abs.f32 %f918, %f841;
- setp.ltu.f32 %p708, %f918, 0f47CE4780;
- @%p708 bra $L__BB0_826;
-
- setp.eq.f32 %p709, %f918, 0f7F800000;
- @%p709 bra $L__BB0_825;
- bra.uni $L__BB0_820;
-
-$L__BB0_825:
- mov.f32 %f3413, 0f00000000;
- mul.rn.f32 %f5483, %f841, %f3413;
- mov.u32 %r8441, 0;
- bra.uni $L__BB0_826;
-
-$L__BB0_820:
- mov.b32 %r1073, %f841;
- shr.u32 %r4868, %r1073, 23;
- and.b32 %r4869, %r4868, 255;
- add.s32 %r1074, %r4869, -128;
- shl.b32 %r4870, %r1073, 8;
- or.b32 %r1075, %r4870, -2147483648;
- shr.u32 %r1076, %r1074, 5;
- mov.u64 %rd2604, 0;
- mov.u32 %r8438, 0;
- mov.u64 %rd1475, __cudart_i2opi_f;
- mov.u64 %rd2605, %rd2604;
-
-$L__BB0_821:
- .pragma "nounroll";
- shl.b64 %rd1474, %rd2604, 2;
- add.s64 %rd1476, %rd1475, %rd1474;
- ld.global.nc.u32 %r4871, [%rd1476];
- mad.wide.u32 %rd1477, %r4871, %r1075, %rd2605;
- shr.u64 %rd2605, %rd1477, 32;
- add.s64 %rd1478, %rd1, %rd1474;
- st.local.u32 [%rd1478], %rd1477;
- add.s32 %r8438, %r8438, 1;
- cvt.s64.s32 %rd2604, %r8438;
- setp.ne.s32 %p710, %r8438, 6;
- @%p710 bra $L__BB0_821;
-
- st.local.u32 [%rd5], %rd2605;
- mov.u32 %r4872, 4;
- sub.s32 %r1079, %r4872, %r1076;
- mov.u32 %r4873, 6;
- sub.s32 %r4874, %r4873, %r1076;
- mul.wide.s32 %rd1479, %r4874, 4;
- add.s64 %rd1480, %rd1, %rd1479;
- ld.local.u32 %r8439, [%rd1480];
- ld.local.u32 %r8440, [%rd1480+-4];
- and.b32 %r1082, %r1074, 31;
- setp.eq.s32 %p711, %r1082, 0;
- @%p711 bra $L__BB0_824;
-
- mov.u32 %r4875, 32;
- sub.s32 %r4876, %r4875, %r1082;
- shr.u32 %r4877, %r8440, %r4876;
- shl.b32 %r4878, %r8439, %r1082;
- add.s32 %r8439, %r4877, %r4878;
- mul.wide.s32 %rd1481, %r1079, 4;
- add.s64 %rd1482, %rd1, %rd1481;
- ld.local.u32 %r4879, [%rd1482];
- shr.u32 %r4880, %r4879, %r4876;
- shl.b32 %r4881, %r8440, %r1082;
- add.s32 %r8440, %r4880, %r4881;
-
-$L__BB0_824:
- and.b32 %r4882, %r1073, -2147483648;
- shr.u32 %r4883, %r8440, 30;
- shl.b32 %r4884, %r8439, 2;
- or.b32 %r4885, %r4883, %r4884;
- shr.u32 %r4886, %r4885, 31;
- shr.u32 %r4887, %r8439, 30;
- add.s32 %r4888, %r4886, %r4887;
- neg.s32 %r4889, %r4888;
- setp.eq.s32 %p712, %r4882, 0;
- selp.b32 %r8441, %r4888, %r4889, %p712;
- setp.ne.s32 %p713, %r4886, 0;
- xor.b32 %r4890, %r4882, -2147483648;
- selp.b32 %r4891, %r4890, %r4882, %p713;
- selp.b32 %r4892, -1, 0, %p713;
- xor.b32 %r4893, %r4885, %r4892;
- shl.b32 %r4894, %r8440, 2;
- xor.b32 %r4895, %r4894, %r4892;
- cvt.u64.u32 %rd1483, %r4893;
- cvt.u64.u32 %rd1484, %r4895;
- bfi.b64 %rd1485, %rd1483, %rd1484, 32, 32;
- cvt.rn.f64.s64 %fd107, %rd1485;
- mul.f64 %fd108, %fd107, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3411, %fd108;
- setp.eq.s32 %p714, %r4891, 0;
- neg.f32 %f3412, %f3411;
- selp.f32 %f5483, %f3411, %f3412, %p714;
-
-$L__BB0_826:
- add.s32 %r1089, %r8441, 1;
- and.b32 %r1090, %r1089, 1;
- setp.eq.s32 %p715, %r1090, 0;
- selp.f32 %f922, %f5483, 0f3F800000, %p715;
- mul.rn.f32 %f923, %f5483, %f5483;
- mov.f32 %f5484, 0fB94D4153;
- @%p715 bra $L__BB0_828;
-
- mov.f32 %f3415, 0fBAB607ED;
- mov.f32 %f3416, 0f37CBAC00;
- fma.rn.f32 %f5484, %f3416, %f923, %f3415;
-
-$L__BB0_828:
- selp.f32 %f3417, 0f3C0885E4, 0f3D2AAABB, %p715;
- fma.rn.f32 %f3418, %f5484, %f923, %f3417;
- selp.f32 %f3419, 0fBE2AAAA8, 0fBEFFFFFF, %p715;
- fma.rn.f32 %f3420, %f3418, %f923, %f3419;
- mov.f32 %f3421, 0f00000000;
- fma.rn.f32 %f3422, %f923, %f922, %f3421;
- fma.rn.f32 %f5485, %f3420, %f3422, %f922;
- and.b32 %r4897, %r1089, 2;
- setp.eq.s32 %p717, %r4897, 0;
- @%p717 bra $L__BB0_830;
-
- mov.f32 %f3424, 0fBF800000;
- fma.rn.f32 %f5485, %f5485, %f3424, %f3421;
-
-$L__BB0_830:
- add.f32 %f5521, %f5482, %f5485;
- mul.f32 %f3425, %f850, 0f3F22F983;
- cvt.rni.s32.f32 %r8445, %f3425;
- cvt.rn.f32.s32 %f3426, %r8445;
- mov.f32 %f3427, 0fBFC90FDA;
- fma.rn.f32 %f3428, %f3426, %f3427, %f850;
- mov.f32 %f3429, 0fB3A22168;
- fma.rn.f32 %f3430, %f3426, %f3429, %f3428;
- mov.f32 %f3431, 0fA7C234C5;
- fma.rn.f32 %f5486, %f3426, %f3431, %f3430;
- abs.f32 %f931, %f850;
- setp.ltu.f32 %p718, %f931, 0f47CE4780;
- @%p718 bra $L__BB0_838;
-
- setp.eq.f32 %p719, %f931, 0f7F800000;
- @%p719 bra $L__BB0_837;
- bra.uni $L__BB0_832;
-
-$L__BB0_837:
- mov.f32 %f3434, 0f00000000;
- mul.rn.f32 %f5486, %f850, %f3434;
- mov.u32 %r8445, 0;
- bra.uni $L__BB0_838;
-
-$L__BB0_832:
- mov.b32 %r1092, %f850;
- shr.u32 %r4899, %r1092, 23;
- and.b32 %r4900, %r4899, 255;
- add.s32 %r1093, %r4900, -128;
- shl.b32 %r4901, %r1092, 8;
- or.b32 %r1094, %r4901, -2147483648;
- shr.u32 %r1095, %r1093, 5;
- mov.u64 %rd2606, 0;
- mov.u32 %r8442, 0;
- mov.u64 %rd1489, __cudart_i2opi_f;
- mov.u64 %rd2607, %rd2606;
-
-$L__BB0_833:
- .pragma "nounroll";
- shl.b64 %rd1488, %rd2606, 2;
- add.s64 %rd1490, %rd1489, %rd1488;
- ld.global.nc.u32 %r4902, [%rd1490];
- mad.wide.u32 %rd1491, %r4902, %r1094, %rd2607;
- shr.u64 %rd2607, %rd1491, 32;
- add.s64 %rd1492, %rd1, %rd1488;
- st.local.u32 [%rd1492], %rd1491;
- add.s32 %r8442, %r8442, 1;
- cvt.s64.s32 %rd2606, %r8442;
- setp.ne.s32 %p720, %r8442, 6;
- @%p720 bra $L__BB0_833;
-
- st.local.u32 [%rd5], %rd2607;
- mov.u32 %r4903, 4;
- sub.s32 %r1098, %r4903, %r1095;
- mov.u32 %r4904, 6;
- sub.s32 %r4905, %r4904, %r1095;
- mul.wide.s32 %rd1493, %r4905, 4;
- add.s64 %rd1494, %rd1, %rd1493;
- ld.local.u32 %r8443, [%rd1494];
- ld.local.u32 %r8444, [%rd1494+-4];
- and.b32 %r1101, %r1093, 31;
- setp.eq.s32 %p721, %r1101, 0;
- @%p721 bra $L__BB0_836;
-
- mov.u32 %r4906, 32;
- sub.s32 %r4907, %r4906, %r1101;
- shr.u32 %r4908, %r8444, %r4907;
- shl.b32 %r4909, %r8443, %r1101;
- add.s32 %r8443, %r4908, %r4909;
- mul.wide.s32 %rd1495, %r1098, 4;
- add.s64 %rd1496, %rd1, %rd1495;
- ld.local.u32 %r4910, [%rd1496];
- shr.u32 %r4911, %r4910, %r4907;
- shl.b32 %r4912, %r8444, %r1101;
- add.s32 %r8444, %r4911, %r4912;
-
-$L__BB0_836:
- and.b32 %r4913, %r1092, -2147483648;
- shr.u32 %r4914, %r8444, 30;
- shl.b32 %r4915, %r8443, 2;
- or.b32 %r4916, %r4914, %r4915;
- shr.u32 %r4917, %r4916, 31;
- shr.u32 %r4918, %r8443, 30;
- add.s32 %r4919, %r4917, %r4918;
- neg.s32 %r4920, %r4919;
- setp.eq.s32 %p722, %r4913, 0;
- selp.b32 %r8445, %r4919, %r4920, %p722;
- setp.ne.s32 %p723, %r4917, 0;
- xor.b32 %r4921, %r4913, -2147483648;
- selp.b32 %r4922, %r4921, %r4913, %p723;
- selp.b32 %r4923, -1, 0, %p723;
- xor.b32 %r4924, %r4916, %r4923;
- shl.b32 %r4925, %r8444, 2;
- xor.b32 %r4926, %r4925, %r4923;
- cvt.u64.u32 %rd1497, %r4924;
- cvt.u64.u32 %rd1498, %r4926;
- bfi.b64 %rd1499, %rd1497, %rd1498, 32, 32;
- cvt.rn.f64.s64 %fd109, %rd1499;
- mul.f64 %fd110, %fd109, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3432, %fd110;
- setp.eq.s32 %p724, %r4922, 0;
- neg.f32 %f3433, %f3432;
- selp.f32 %f5486, %f3432, %f3433, %p724;
-
-$L__BB0_838:
- and.b32 %r1108, %r8445, 1;
- setp.eq.s32 %p725, %r1108, 0;
- selp.f32 %f935, %f5486, 0f3F800000, %p725;
- mul.rn.f32 %f936, %f5486, %f5486;
- mov.f32 %f5487, 0fB94D4153;
- @%p725 bra $L__BB0_840;
-
- mov.f32 %f3436, 0fBAB607ED;
- mov.f32 %f3437, 0f37CBAC00;
- fma.rn.f32 %f5487, %f3437, %f936, %f3436;
-
-$L__BB0_840:
- selp.f32 %f3438, 0f3C0885E4, 0f3D2AAABB, %p725;
- fma.rn.f32 %f3439, %f5487, %f936, %f3438;
- selp.f32 %f3440, 0fBE2AAAA8, 0fBEFFFFFF, %p725;
- fma.rn.f32 %f3441, %f3439, %f936, %f3440;
- mov.f32 %f3442, 0f00000000;
- fma.rn.f32 %f3443, %f936, %f935, %f3442;
- fma.rn.f32 %f5488, %f3441, %f3443, %f935;
- and.b32 %r4928, %r8445, 2;
- setp.eq.s32 %p727, %r4928, 0;
- @%p727 bra $L__BB0_842;
-
- mov.f32 %f3445, 0fBF800000;
- fma.rn.f32 %f5488, %f5488, %f3445, %f3442;
-
-$L__BB0_842:
- mul.f32 %f3446, %f842, 0f3F22F983;
- cvt.rni.s32.f32 %r8449, %f3446;
- cvt.rn.f32.s32 %f3447, %r8449;
- mov.f32 %f3448, 0fBFC90FDA;
- fma.rn.f32 %f3449, %f3447, %f3448, %f842;
- mov.f32 %f3450, 0fB3A22168;
- fma.rn.f32 %f3451, %f3447, %f3450, %f3449;
- mov.f32 %f3452, 0fA7C234C5;
- fma.rn.f32 %f5489, %f3447, %f3452, %f3451;
- abs.f32 %f943, %f842;
- setp.ltu.f32 %p728, %f943, 0f47CE4780;
- @%p728 bra $L__BB0_850;
-
- setp.eq.f32 %p729, %f943, 0f7F800000;
- @%p729 bra $L__BB0_849;
- bra.uni $L__BB0_844;
-
-$L__BB0_849:
- mov.f32 %f3455, 0f00000000;
- mul.rn.f32 %f5489, %f842, %f3455;
- mov.u32 %r8449, 0;
- bra.uni $L__BB0_850;
-
-$L__BB0_844:
- mov.b32 %r1110, %f842;
- shr.u32 %r4930, %r1110, 23;
- and.b32 %r4931, %r4930, 255;
- add.s32 %r1111, %r4931, -128;
- shl.b32 %r4932, %r1110, 8;
- or.b32 %r1112, %r4932, -2147483648;
- shr.u32 %r1113, %r1111, 5;
- mov.u64 %rd2608, 0;
- mov.u32 %r8446, 0;
- mov.u64 %rd1503, __cudart_i2opi_f;
- mov.u64 %rd2609, %rd2608;
-
-$L__BB0_845:
- .pragma "nounroll";
- shl.b64 %rd1502, %rd2608, 2;
- add.s64 %rd1504, %rd1503, %rd1502;
- ld.global.nc.u32 %r4933, [%rd1504];
- mad.wide.u32 %rd1505, %r4933, %r1112, %rd2609;
- shr.u64 %rd2609, %rd1505, 32;
- add.s64 %rd1506, %rd1, %rd1502;
- st.local.u32 [%rd1506], %rd1505;
- add.s32 %r8446, %r8446, 1;
- cvt.s64.s32 %rd2608, %r8446;
- setp.ne.s32 %p730, %r8446, 6;
- @%p730 bra $L__BB0_845;
-
- st.local.u32 [%rd5], %rd2609;
- mov.u32 %r4934, 4;
- sub.s32 %r1116, %r4934, %r1113;
- mov.u32 %r4935, 6;
- sub.s32 %r4936, %r4935, %r1113;
- mul.wide.s32 %rd1507, %r4936, 4;
- add.s64 %rd1508, %rd1, %rd1507;
- ld.local.u32 %r8447, [%rd1508];
- ld.local.u32 %r8448, [%rd1508+-4];
- and.b32 %r1119, %r1111, 31;
- setp.eq.s32 %p731, %r1119, 0;
- @%p731 bra $L__BB0_848;
-
- mov.u32 %r4937, 32;
- sub.s32 %r4938, %r4937, %r1119;
- shr.u32 %r4939, %r8448, %r4938;
- shl.b32 %r4940, %r8447, %r1119;
- add.s32 %r8447, %r4939, %r4940;
- mul.wide.s32 %rd1509, %r1116, 4;
- add.s64 %rd1510, %rd1, %rd1509;
- ld.local.u32 %r4941, [%rd1510];
- shr.u32 %r4942, %r4941, %r4938;
- shl.b32 %r4943, %r8448, %r1119;
- add.s32 %r8448, %r4942, %r4943;
-
-$L__BB0_848:
- and.b32 %r4944, %r1110, -2147483648;
- shr.u32 %r4945, %r8448, 30;
- shl.b32 %r4946, %r8447, 2;
- or.b32 %r4947, %r4945, %r4946;
- shr.u32 %r4948, %r4947, 31;
- shr.u32 %r4949, %r8447, 30;
- add.s32 %r4950, %r4948, %r4949;
- neg.s32 %r4951, %r4950;
- setp.eq.s32 %p732, %r4944, 0;
- selp.b32 %r8449, %r4950, %r4951, %p732;
- setp.ne.s32 %p733, %r4948, 0;
- xor.b32 %r4952, %r4944, -2147483648;
- selp.b32 %r4953, %r4952, %r4944, %p733;
- selp.b32 %r4954, -1, 0, %p733;
- xor.b32 %r4955, %r4947, %r4954;
- shl.b32 %r4956, %r8448, 2;
- xor.b32 %r4957, %r4956, %r4954;
- cvt.u64.u32 %rd1511, %r4955;
- cvt.u64.u32 %rd1512, %r4957;
- bfi.b64 %rd1513, %rd1511, %rd1512, 32, 32;
- cvt.rn.f64.s64 %fd111, %rd1513;
- mul.f64 %fd112, %fd111, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3453, %fd112;
- setp.eq.s32 %p734, %r4953, 0;
- neg.f32 %f3454, %f3453;
- selp.f32 %f5489, %f3453, %f3454, %p734;
-
-$L__BB0_850:
- add.s32 %r1126, %r8449, 1;
- and.b32 %r1127, %r1126, 1;
- setp.eq.s32 %p735, %r1127, 0;
- selp.f32 %f947, %f5489, 0f3F800000, %p735;
- mul.rn.f32 %f948, %f5489, %f5489;
- mov.f32 %f5490, 0fB94D4153;
- @%p735 bra $L__BB0_852;
-
- mov.f32 %f3457, 0fBAB607ED;
- mov.f32 %f3458, 0f37CBAC00;
- fma.rn.f32 %f5490, %f3458, %f948, %f3457;
-
-$L__BB0_852:
- selp.f32 %f3459, 0f3C0885E4, 0f3D2AAABB, %p735;
- fma.rn.f32 %f3460, %f5490, %f948, %f3459;
- selp.f32 %f3461, 0fBE2AAAA8, 0fBEFFFFFF, %p735;
- fma.rn.f32 %f3462, %f3460, %f948, %f3461;
- mov.f32 %f3463, 0f00000000;
- fma.rn.f32 %f3464, %f948, %f947, %f3463;
- fma.rn.f32 %f5491, %f3462, %f3464, %f947;
- and.b32 %r4959, %r1126, 2;
- setp.eq.s32 %p737, %r4959, 0;
- @%p737 bra $L__BB0_854;
-
- mov.f32 %f3466, 0fBF800000;
- fma.rn.f32 %f5491, %f5491, %f3466, %f3463;
-
-$L__BB0_854:
- add.f32 %f5520, %f5488, %f5491;
- mul.f32 %f3467, %f851, 0f3F22F983;
- cvt.rni.s32.f32 %r8453, %f3467;
- cvt.rn.f32.s32 %f3468, %r8453;
- mov.f32 %f3469, 0fBFC90FDA;
- fma.rn.f32 %f3470, %f3468, %f3469, %f851;
- mov.f32 %f3471, 0fB3A22168;
- fma.rn.f32 %f3472, %f3468, %f3471, %f3470;
- mov.f32 %f3473, 0fA7C234C5;
- fma.rn.f32 %f5492, %f3468, %f3473, %f3472;
- abs.f32 %f956, %f851;
- setp.ltu.f32 %p738, %f956, 0f47CE4780;
- @%p738 bra $L__BB0_862;
-
- setp.eq.f32 %p739, %f956, 0f7F800000;
- @%p739 bra $L__BB0_861;
- bra.uni $L__BB0_856;
-
-$L__BB0_861:
- mov.f32 %f3476, 0f00000000;
- mul.rn.f32 %f5492, %f851, %f3476;
- mov.u32 %r8453, 0;
- bra.uni $L__BB0_862;
-
-$L__BB0_856:
- mov.b32 %r1129, %f851;
- shr.u32 %r4961, %r1129, 23;
- and.b32 %r4962, %r4961, 255;
- add.s32 %r1130, %r4962, -128;
- shl.b32 %r4963, %r1129, 8;
- or.b32 %r1131, %r4963, -2147483648;
- shr.u32 %r1132, %r1130, 5;
- mov.u64 %rd2610, 0;
- mov.u32 %r8450, 0;
- mov.u64 %rd1517, __cudart_i2opi_f;
- mov.u64 %rd2611, %rd2610;
-
-$L__BB0_857:
- .pragma "nounroll";
- shl.b64 %rd1516, %rd2610, 2;
- add.s64 %rd1518, %rd1517, %rd1516;
- ld.global.nc.u32 %r4964, [%rd1518];
- mad.wide.u32 %rd1519, %r4964, %r1131, %rd2611;
- shr.u64 %rd2611, %rd1519, 32;
- add.s64 %rd1520, %rd1, %rd1516;
- st.local.u32 [%rd1520], %rd1519;
- add.s32 %r8450, %r8450, 1;
- cvt.s64.s32 %rd2610, %r8450;
- setp.ne.s32 %p740, %r8450, 6;
- @%p740 bra $L__BB0_857;
-
- st.local.u32 [%rd5], %rd2611;
- mov.u32 %r4965, 4;
- sub.s32 %r1135, %r4965, %r1132;
- mov.u32 %r4966, 6;
- sub.s32 %r4967, %r4966, %r1132;
- mul.wide.s32 %rd1521, %r4967, 4;
- add.s64 %rd1522, %rd1, %rd1521;
- ld.local.u32 %r8451, [%rd1522];
- ld.local.u32 %r8452, [%rd1522+-4];
- and.b32 %r1138, %r1130, 31;
- setp.eq.s32 %p741, %r1138, 0;
- @%p741 bra $L__BB0_860;
-
- mov.u32 %r4968, 32;
- sub.s32 %r4969, %r4968, %r1138;
- shr.u32 %r4970, %r8452, %r4969;
- shl.b32 %r4971, %r8451, %r1138;
- add.s32 %r8451, %r4970, %r4971;
- mul.wide.s32 %rd1523, %r1135, 4;
- add.s64 %rd1524, %rd1, %rd1523;
- ld.local.u32 %r4972, [%rd1524];
- shr.u32 %r4973, %r4972, %r4969;
- shl.b32 %r4974, %r8452, %r1138;
- add.s32 %r8452, %r4973, %r4974;
-
-$L__BB0_860:
- and.b32 %r4975, %r1129, -2147483648;
- shr.u32 %r4976, %r8452, 30;
- shl.b32 %r4977, %r8451, 2;
- or.b32 %r4978, %r4976, %r4977;
- shr.u32 %r4979, %r4978, 31;
- shr.u32 %r4980, %r8451, 30;
- add.s32 %r4981, %r4979, %r4980;
- neg.s32 %r4982, %r4981;
- setp.eq.s32 %p742, %r4975, 0;
- selp.b32 %r8453, %r4981, %r4982, %p742;
- setp.ne.s32 %p743, %r4979, 0;
- xor.b32 %r4983, %r4975, -2147483648;
- selp.b32 %r4984, %r4983, %r4975, %p743;
- selp.b32 %r4985, -1, 0, %p743;
- xor.b32 %r4986, %r4978, %r4985;
- shl.b32 %r4987, %r8452, 2;
- xor.b32 %r4988, %r4987, %r4985;
- cvt.u64.u32 %rd1525, %r4986;
- cvt.u64.u32 %rd1526, %r4988;
- bfi.b64 %rd1527, %rd1525, %rd1526, 32, 32;
- cvt.rn.f64.s64 %fd113, %rd1527;
- mul.f64 %fd114, %fd113, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3474, %fd114;
- setp.eq.s32 %p744, %r4984, 0;
- neg.f32 %f3475, %f3474;
- selp.f32 %f5492, %f3474, %f3475, %p744;
-
-$L__BB0_862:
- and.b32 %r1145, %r8453, 1;
- setp.eq.s32 %p745, %r1145, 0;
- selp.f32 %f960, %f5492, 0f3F800000, %p745;
- mul.rn.f32 %f961, %f5492, %f5492;
- mov.f32 %f5493, 0fB94D4153;
- @%p745 bra $L__BB0_864;
-
- mov.f32 %f3478, 0fBAB607ED;
- mov.f32 %f3479, 0f37CBAC00;
- fma.rn.f32 %f5493, %f3479, %f961, %f3478;
-
-$L__BB0_864:
- selp.f32 %f3480, 0f3C0885E4, 0f3D2AAABB, %p745;
- fma.rn.f32 %f3481, %f5493, %f961, %f3480;
- selp.f32 %f3482, 0fBE2AAAA8, 0fBEFFFFFF, %p745;
- fma.rn.f32 %f3483, %f3481, %f961, %f3482;
- mov.f32 %f3484, 0f00000000;
- fma.rn.f32 %f3485, %f961, %f960, %f3484;
- fma.rn.f32 %f5494, %f3483, %f3485, %f960;
- and.b32 %r4990, %r8453, 2;
- setp.eq.s32 %p747, %r4990, 0;
- @%p747 bra $L__BB0_866;
-
- mov.f32 %f3487, 0fBF800000;
- fma.rn.f32 %f5494, %f5494, %f3487, %f3484;
-
-$L__BB0_866:
- mul.f32 %f3488, %f843, 0f3F22F983;
- cvt.rni.s32.f32 %r8457, %f3488;
- cvt.rn.f32.s32 %f3489, %r8457;
- mov.f32 %f3490, 0fBFC90FDA;
- fma.rn.f32 %f3491, %f3489, %f3490, %f843;
- mov.f32 %f3492, 0fB3A22168;
- fma.rn.f32 %f3493, %f3489, %f3492, %f3491;
- mov.f32 %f3494, 0fA7C234C5;
- fma.rn.f32 %f5495, %f3489, %f3494, %f3493;
- abs.f32 %f968, %f843;
- setp.ltu.f32 %p748, %f968, 0f47CE4780;
- @%p748 bra $L__BB0_874;
-
- setp.eq.f32 %p749, %f968, 0f7F800000;
- @%p749 bra $L__BB0_873;
- bra.uni $L__BB0_868;
-
-$L__BB0_873:
- mov.f32 %f3497, 0f00000000;
- mul.rn.f32 %f5495, %f843, %f3497;
- mov.u32 %r8457, 0;
- bra.uni $L__BB0_874;
-
-$L__BB0_868:
- mov.b32 %r1147, %f843;
- shr.u32 %r4992, %r1147, 23;
- and.b32 %r4993, %r4992, 255;
- add.s32 %r1148, %r4993, -128;
- shl.b32 %r4994, %r1147, 8;
- or.b32 %r1149, %r4994, -2147483648;
- shr.u32 %r1150, %r1148, 5;
- mov.u64 %rd2612, 0;
- mov.u32 %r8454, 0;
- mov.u64 %rd1531, __cudart_i2opi_f;
- mov.u64 %rd2613, %rd2612;
-
-$L__BB0_869:
- .pragma "nounroll";
- shl.b64 %rd1530, %rd2612, 2;
- add.s64 %rd1532, %rd1531, %rd1530;
- ld.global.nc.u32 %r4995, [%rd1532];
- mad.wide.u32 %rd1533, %r4995, %r1149, %rd2613;
- shr.u64 %rd2613, %rd1533, 32;
- add.s64 %rd1534, %rd1, %rd1530;
- st.local.u32 [%rd1534], %rd1533;
- add.s32 %r8454, %r8454, 1;
- cvt.s64.s32 %rd2612, %r8454;
- setp.ne.s32 %p750, %r8454, 6;
- @%p750 bra $L__BB0_869;
-
- st.local.u32 [%rd5], %rd2613;
- mov.u32 %r4996, 4;
- sub.s32 %r1153, %r4996, %r1150;
- mov.u32 %r4997, 6;
- sub.s32 %r4998, %r4997, %r1150;
- mul.wide.s32 %rd1535, %r4998, 4;
- add.s64 %rd1536, %rd1, %rd1535;
- ld.local.u32 %r8455, [%rd1536];
- ld.local.u32 %r8456, [%rd1536+-4];
- and.b32 %r1156, %r1148, 31;
- setp.eq.s32 %p751, %r1156, 0;
- @%p751 bra $L__BB0_872;
-
- mov.u32 %r4999, 32;
- sub.s32 %r5000, %r4999, %r1156;
- shr.u32 %r5001, %r8456, %r5000;
- shl.b32 %r5002, %r8455, %r1156;
- add.s32 %r8455, %r5001, %r5002;
- mul.wide.s32 %rd1537, %r1153, 4;
- add.s64 %rd1538, %rd1, %rd1537;
- ld.local.u32 %r5003, [%rd1538];
- shr.u32 %r5004, %r5003, %r5000;
- shl.b32 %r5005, %r8456, %r1156;
- add.s32 %r8456, %r5004, %r5005;
-
-$L__BB0_872:
- and.b32 %r5006, %r1147, -2147483648;
- shr.u32 %r5007, %r8456, 30;
- shl.b32 %r5008, %r8455, 2;
- or.b32 %r5009, %r5007, %r5008;
- shr.u32 %r5010, %r5009, 31;
- shr.u32 %r5011, %r8455, 30;
- add.s32 %r5012, %r5010, %r5011;
- neg.s32 %r5013, %r5012;
- setp.eq.s32 %p752, %r5006, 0;
- selp.b32 %r8457, %r5012, %r5013, %p752;
- setp.ne.s32 %p753, %r5010, 0;
- xor.b32 %r5014, %r5006, -2147483648;
- selp.b32 %r5015, %r5014, %r5006, %p753;
- selp.b32 %r5016, -1, 0, %p753;
- xor.b32 %r5017, %r5009, %r5016;
- shl.b32 %r5018, %r8456, 2;
- xor.b32 %r5019, %r5018, %r5016;
- cvt.u64.u32 %rd1539, %r5017;
- cvt.u64.u32 %rd1540, %r5019;
- bfi.b64 %rd1541, %rd1539, %rd1540, 32, 32;
- cvt.rn.f64.s64 %fd115, %rd1541;
- mul.f64 %fd116, %fd115, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3495, %fd116;
- setp.eq.s32 %p754, %r5015, 0;
- neg.f32 %f3496, %f3495;
- selp.f32 %f5495, %f3495, %f3496, %p754;
-
-$L__BB0_874:
- add.s32 %r1163, %r8457, 1;
- and.b32 %r1164, %r1163, 1;
- setp.eq.s32 %p755, %r1164, 0;
- selp.f32 %f972, %f5495, 0f3F800000, %p755;
- mul.rn.f32 %f973, %f5495, %f5495;
- mov.f32 %f5496, 0fB94D4153;
- @%p755 bra $L__BB0_876;
-
- mov.f32 %f3499, 0fBAB607ED;
- mov.f32 %f3500, 0f37CBAC00;
- fma.rn.f32 %f5496, %f3500, %f973, %f3499;
-
-$L__BB0_876:
- selp.f32 %f3501, 0f3C0885E4, 0f3D2AAABB, %p755;
- fma.rn.f32 %f3502, %f5496, %f973, %f3501;
- selp.f32 %f3503, 0fBE2AAAA8, 0fBEFFFFFF, %p755;
- fma.rn.f32 %f3504, %f3502, %f973, %f3503;
- mov.f32 %f3505, 0f00000000;
- fma.rn.f32 %f3506, %f973, %f972, %f3505;
- fma.rn.f32 %f5497, %f3504, %f3506, %f972;
- and.b32 %r5021, %r1163, 2;
- setp.eq.s32 %p757, %r5021, 0;
- @%p757 bra $L__BB0_878;
-
- mov.f32 %f3508, 0fBF800000;
- fma.rn.f32 %f5497, %f5497, %f3508, %f3505;
-
-$L__BB0_878:
- add.f32 %f5519, %f5494, %f5497;
- mul.f32 %f3509, %f852, 0f3F22F983;
- cvt.rni.s32.f32 %r8461, %f3509;
- cvt.rn.f32.s32 %f3510, %r8461;
- mov.f32 %f3511, 0fBFC90FDA;
- fma.rn.f32 %f3512, %f3510, %f3511, %f852;
- mov.f32 %f3513, 0fB3A22168;
- fma.rn.f32 %f3514, %f3510, %f3513, %f3512;
- mov.f32 %f3515, 0fA7C234C5;
- fma.rn.f32 %f5498, %f3510, %f3515, %f3514;
- abs.f32 %f981, %f852;
- setp.ltu.f32 %p758, %f981, 0f47CE4780;
- @%p758 bra $L__BB0_886;
-
- setp.eq.f32 %p759, %f981, 0f7F800000;
- @%p759 bra $L__BB0_885;
- bra.uni $L__BB0_880;
-
-$L__BB0_885:
- mov.f32 %f3518, 0f00000000;
- mul.rn.f32 %f5498, %f852, %f3518;
- mov.u32 %r8461, 0;
- bra.uni $L__BB0_886;
-
-$L__BB0_880:
- mov.b32 %r1166, %f852;
- shr.u32 %r5023, %r1166, 23;
- and.b32 %r5024, %r5023, 255;
- add.s32 %r1167, %r5024, -128;
- shl.b32 %r5025, %r1166, 8;
- or.b32 %r1168, %r5025, -2147483648;
- shr.u32 %r1169, %r1167, 5;
- mov.u64 %rd2614, 0;
- mov.u32 %r8458, 0;
- mov.u64 %rd1545, __cudart_i2opi_f;
- mov.u64 %rd2615, %rd2614;
-
-$L__BB0_881:
- .pragma "nounroll";
- shl.b64 %rd1544, %rd2614, 2;
- add.s64 %rd1546, %rd1545, %rd1544;
- ld.global.nc.u32 %r5026, [%rd1546];
- mad.wide.u32 %rd1547, %r5026, %r1168, %rd2615;
- shr.u64 %rd2615, %rd1547, 32;
- add.s64 %rd1548, %rd1, %rd1544;
- st.local.u32 [%rd1548], %rd1547;
- add.s32 %r8458, %r8458, 1;
- cvt.s64.s32 %rd2614, %r8458;
- setp.ne.s32 %p760, %r8458, 6;
- @%p760 bra $L__BB0_881;
-
- st.local.u32 [%rd5], %rd2615;
- mov.u32 %r5027, 4;
- sub.s32 %r1172, %r5027, %r1169;
- mov.u32 %r5028, 6;
- sub.s32 %r5029, %r5028, %r1169;
- mul.wide.s32 %rd1549, %r5029, 4;
- add.s64 %rd1550, %rd1, %rd1549;
- ld.local.u32 %r8459, [%rd1550];
- ld.local.u32 %r8460, [%rd1550+-4];
- and.b32 %r1175, %r1167, 31;
- setp.eq.s32 %p761, %r1175, 0;
- @%p761 bra $L__BB0_884;
-
- mov.u32 %r5030, 32;
- sub.s32 %r5031, %r5030, %r1175;
- shr.u32 %r5032, %r8460, %r5031;
- shl.b32 %r5033, %r8459, %r1175;
- add.s32 %r8459, %r5032, %r5033;
- mul.wide.s32 %rd1551, %r1172, 4;
- add.s64 %rd1552, %rd1, %rd1551;
- ld.local.u32 %r5034, [%rd1552];
- shr.u32 %r5035, %r5034, %r5031;
- shl.b32 %r5036, %r8460, %r1175;
- add.s32 %r8460, %r5035, %r5036;
-
-$L__BB0_884:
- and.b32 %r5037, %r1166, -2147483648;
- shr.u32 %r5038, %r8460, 30;
- shl.b32 %r5039, %r8459, 2;
- or.b32 %r5040, %r5038, %r5039;
- shr.u32 %r5041, %r5040, 31;
- shr.u32 %r5042, %r8459, 30;
- add.s32 %r5043, %r5041, %r5042;
- neg.s32 %r5044, %r5043;
- setp.eq.s32 %p762, %r5037, 0;
- selp.b32 %r8461, %r5043, %r5044, %p762;
- setp.ne.s32 %p763, %r5041, 0;
- xor.b32 %r5045, %r5037, -2147483648;
- selp.b32 %r5046, %r5045, %r5037, %p763;
- selp.b32 %r5047, -1, 0, %p763;
- xor.b32 %r5048, %r5040, %r5047;
- shl.b32 %r5049, %r8460, 2;
- xor.b32 %r5050, %r5049, %r5047;
- cvt.u64.u32 %rd1553, %r5048;
- cvt.u64.u32 %rd1554, %r5050;
- bfi.b64 %rd1555, %rd1553, %rd1554, 32, 32;
- cvt.rn.f64.s64 %fd117, %rd1555;
- mul.f64 %fd118, %fd117, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3516, %fd118;
- setp.eq.s32 %p764, %r5046, 0;
- neg.f32 %f3517, %f3516;
- selp.f32 %f5498, %f3516, %f3517, %p764;
-
-$L__BB0_886:
- and.b32 %r1182, %r8461, 1;
- setp.eq.s32 %p765, %r1182, 0;
- selp.f32 %f985, %f5498, 0f3F800000, %p765;
- mul.rn.f32 %f986, %f5498, %f5498;
- mov.f32 %f5499, 0fB94D4153;
- @%p765 bra $L__BB0_888;
-
- mov.f32 %f3520, 0fBAB607ED;
- mov.f32 %f3521, 0f37CBAC00;
- fma.rn.f32 %f5499, %f3521, %f986, %f3520;
-
-$L__BB0_888:
- selp.f32 %f3522, 0f3C0885E4, 0f3D2AAABB, %p765;
- fma.rn.f32 %f3523, %f5499, %f986, %f3522;
- selp.f32 %f3524, 0fBE2AAAA8, 0fBEFFFFFF, %p765;
- fma.rn.f32 %f3525, %f3523, %f986, %f3524;
- mov.f32 %f3526, 0f00000000;
- fma.rn.f32 %f3527, %f986, %f985, %f3526;
- fma.rn.f32 %f5500, %f3525, %f3527, %f985;
- and.b32 %r5052, %r8461, 2;
- setp.eq.s32 %p767, %r5052, 0;
- @%p767 bra $L__BB0_890;
-
- mov.f32 %f3529, 0fBF800000;
- fma.rn.f32 %f5500, %f5500, %f3529, %f3526;
-
-$L__BB0_890:
- mul.f32 %f3530, %f844, 0f3F22F983;
- cvt.rni.s32.f32 %r8465, %f3530;
- cvt.rn.f32.s32 %f3531, %r8465;
- mov.f32 %f3532, 0fBFC90FDA;
- fma.rn.f32 %f3533, %f3531, %f3532, %f844;
- mov.f32 %f3534, 0fB3A22168;
- fma.rn.f32 %f3535, %f3531, %f3534, %f3533;
- mov.f32 %f3536, 0fA7C234C5;
- fma.rn.f32 %f5501, %f3531, %f3536, %f3535;
- abs.f32 %f993, %f844;
- setp.ltu.f32 %p768, %f993, 0f47CE4780;
- @%p768 bra $L__BB0_898;
-
- setp.eq.f32 %p769, %f993, 0f7F800000;
- @%p769 bra $L__BB0_897;
- bra.uni $L__BB0_892;
-
-$L__BB0_897:
- mov.f32 %f3539, 0f00000000;
- mul.rn.f32 %f5501, %f844, %f3539;
- mov.u32 %r8465, 0;
- bra.uni $L__BB0_898;
-
-$L__BB0_892:
- mov.b32 %r1184, %f844;
- shr.u32 %r5054, %r1184, 23;
- and.b32 %r5055, %r5054, 255;
- add.s32 %r1185, %r5055, -128;
- shl.b32 %r5056, %r1184, 8;
- or.b32 %r1186, %r5056, -2147483648;
- shr.u32 %r1187, %r1185, 5;
- mov.u64 %rd2616, 0;
- mov.u32 %r8462, 0;
- mov.u64 %rd1559, __cudart_i2opi_f;
- mov.u64 %rd2617, %rd2616;
-
-$L__BB0_893:
- .pragma "nounroll";
- shl.b64 %rd1558, %rd2616, 2;
- add.s64 %rd1560, %rd1559, %rd1558;
- ld.global.nc.u32 %r5057, [%rd1560];
- mad.wide.u32 %rd1561, %r5057, %r1186, %rd2617;
- shr.u64 %rd2617, %rd1561, 32;
- add.s64 %rd1562, %rd1, %rd1558;
- st.local.u32 [%rd1562], %rd1561;
- add.s32 %r8462, %r8462, 1;
- cvt.s64.s32 %rd2616, %r8462;
- setp.ne.s32 %p770, %r8462, 6;
- @%p770 bra $L__BB0_893;
-
- st.local.u32 [%rd5], %rd2617;
- mov.u32 %r5058, 4;
- sub.s32 %r1190, %r5058, %r1187;
- mov.u32 %r5059, 6;
- sub.s32 %r5060, %r5059, %r1187;
- mul.wide.s32 %rd1563, %r5060, 4;
- add.s64 %rd1564, %rd1, %rd1563;
- ld.local.u32 %r8463, [%rd1564];
- ld.local.u32 %r8464, [%rd1564+-4];
- and.b32 %r1193, %r1185, 31;
- setp.eq.s32 %p771, %r1193, 0;
- @%p771 bra $L__BB0_896;
-
- mov.u32 %r5061, 32;
- sub.s32 %r5062, %r5061, %r1193;
- shr.u32 %r5063, %r8464, %r5062;
- shl.b32 %r5064, %r8463, %r1193;
- add.s32 %r8463, %r5063, %r5064;
- mul.wide.s32 %rd1565, %r1190, 4;
- add.s64 %rd1566, %rd1, %rd1565;
- ld.local.u32 %r5065, [%rd1566];
- shr.u32 %r5066, %r5065, %r5062;
- shl.b32 %r5067, %r8464, %r1193;
- add.s32 %r8464, %r5066, %r5067;
-
-$L__BB0_896:
- and.b32 %r5068, %r1184, -2147483648;
- shr.u32 %r5069, %r8464, 30;
- shl.b32 %r5070, %r8463, 2;
- or.b32 %r5071, %r5069, %r5070;
- shr.u32 %r5072, %r5071, 31;
- shr.u32 %r5073, %r8463, 30;
- add.s32 %r5074, %r5072, %r5073;
- neg.s32 %r5075, %r5074;
- setp.eq.s32 %p772, %r5068, 0;
- selp.b32 %r8465, %r5074, %r5075, %p772;
- setp.ne.s32 %p773, %r5072, 0;
- xor.b32 %r5076, %r5068, -2147483648;
- selp.b32 %r5077, %r5076, %r5068, %p773;
- selp.b32 %r5078, -1, 0, %p773;
- xor.b32 %r5079, %r5071, %r5078;
- shl.b32 %r5080, %r8464, 2;
- xor.b32 %r5081, %r5080, %r5078;
- cvt.u64.u32 %rd1567, %r5079;
- cvt.u64.u32 %rd1568, %r5081;
- bfi.b64 %rd1569, %rd1567, %rd1568, 32, 32;
- cvt.rn.f64.s64 %fd119, %rd1569;
- mul.f64 %fd120, %fd119, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3537, %fd120;
- setp.eq.s32 %p774, %r5077, 0;
- neg.f32 %f3538, %f3537;
- selp.f32 %f5501, %f3537, %f3538, %p774;
-
-$L__BB0_898:
- add.s32 %r1200, %r8465, 1;
- and.b32 %r1201, %r1200, 1;
- setp.eq.s32 %p775, %r1201, 0;
- selp.f32 %f997, %f5501, 0f3F800000, %p775;
- mul.rn.f32 %f998, %f5501, %f5501;
- mov.f32 %f5502, 0fB94D4153;
- @%p775 bra $L__BB0_900;
-
- mov.f32 %f3541, 0fBAB607ED;
- mov.f32 %f3542, 0f37CBAC00;
- fma.rn.f32 %f5502, %f3542, %f998, %f3541;
-
-$L__BB0_900:
- selp.f32 %f3543, 0f3C0885E4, 0f3D2AAABB, %p775;
- fma.rn.f32 %f3544, %f5502, %f998, %f3543;
- selp.f32 %f3545, 0fBE2AAAA8, 0fBEFFFFFF, %p775;
- fma.rn.f32 %f3546, %f3544, %f998, %f3545;
- mov.f32 %f3547, 0f00000000;
- fma.rn.f32 %f3548, %f998, %f997, %f3547;
- fma.rn.f32 %f5503, %f3546, %f3548, %f997;
- and.b32 %r5083, %r1200, 2;
- setp.eq.s32 %p777, %r5083, 0;
- @%p777 bra $L__BB0_902;
-
- mov.f32 %f3550, 0fBF800000;
- fma.rn.f32 %f5503, %f5503, %f3550, %f3547;
-
-$L__BB0_902:
- add.f32 %f5518, %f5500, %f5503;
- mul.f32 %f3551, %f853, 0f3F22F983;
- cvt.rni.s32.f32 %r8469, %f3551;
- cvt.rn.f32.s32 %f3552, %r8469;
- mov.f32 %f3553, 0fBFC90FDA;
- fma.rn.f32 %f3554, %f3552, %f3553, %f853;
- mov.f32 %f3555, 0fB3A22168;
- fma.rn.f32 %f3556, %f3552, %f3555, %f3554;
- mov.f32 %f3557, 0fA7C234C5;
- fma.rn.f32 %f5504, %f3552, %f3557, %f3556;
- abs.f32 %f1006, %f853;
- setp.ltu.f32 %p778, %f1006, 0f47CE4780;
- @%p778 bra $L__BB0_910;
-
- setp.eq.f32 %p779, %f1006, 0f7F800000;
- @%p779 bra $L__BB0_909;
- bra.uni $L__BB0_904;
-
-$L__BB0_909:
- mov.f32 %f3560, 0f00000000;
- mul.rn.f32 %f5504, %f853, %f3560;
- mov.u32 %r8469, 0;
- bra.uni $L__BB0_910;
-
-$L__BB0_904:
- mov.b32 %r1203, %f853;
- shr.u32 %r5085, %r1203, 23;
- and.b32 %r5086, %r5085, 255;
- add.s32 %r1204, %r5086, -128;
- shl.b32 %r5087, %r1203, 8;
- or.b32 %r1205, %r5087, -2147483648;
- shr.u32 %r1206, %r1204, 5;
- mov.u64 %rd2618, 0;
- mov.u32 %r8466, 0;
- mov.u64 %rd1573, __cudart_i2opi_f;
- mov.u64 %rd2619, %rd2618;
-
-$L__BB0_905:
- .pragma "nounroll";
- shl.b64 %rd1572, %rd2618, 2;
- add.s64 %rd1574, %rd1573, %rd1572;
- ld.global.nc.u32 %r5088, [%rd1574];
- mad.wide.u32 %rd1575, %r5088, %r1205, %rd2619;
- shr.u64 %rd2619, %rd1575, 32;
- add.s64 %rd1576, %rd1, %rd1572;
- st.local.u32 [%rd1576], %rd1575;
- add.s32 %r8466, %r8466, 1;
- cvt.s64.s32 %rd2618, %r8466;
- setp.ne.s32 %p780, %r8466, 6;
- @%p780 bra $L__BB0_905;
-
- st.local.u32 [%rd5], %rd2619;
- mov.u32 %r5089, 4;
- sub.s32 %r1209, %r5089, %r1206;
- mov.u32 %r5090, 6;
- sub.s32 %r5091, %r5090, %r1206;
- mul.wide.s32 %rd1577, %r5091, 4;
- add.s64 %rd1578, %rd1, %rd1577;
- ld.local.u32 %r8467, [%rd1578];
- ld.local.u32 %r8468, [%rd1578+-4];
- and.b32 %r1212, %r1204, 31;
- setp.eq.s32 %p781, %r1212, 0;
- @%p781 bra $L__BB0_908;
-
- mov.u32 %r5092, 32;
- sub.s32 %r5093, %r5092, %r1212;
- shr.u32 %r5094, %r8468, %r5093;
- shl.b32 %r5095, %r8467, %r1212;
- add.s32 %r8467, %r5094, %r5095;
- mul.wide.s32 %rd1579, %r1209, 4;
- add.s64 %rd1580, %rd1, %rd1579;
- ld.local.u32 %r5096, [%rd1580];
- shr.u32 %r5097, %r5096, %r5093;
- shl.b32 %r5098, %r8468, %r1212;
- add.s32 %r8468, %r5097, %r5098;
-
-$L__BB0_908:
- and.b32 %r5099, %r1203, -2147483648;
- shr.u32 %r5100, %r8468, 30;
- shl.b32 %r5101, %r8467, 2;
- or.b32 %r5102, %r5100, %r5101;
- shr.u32 %r5103, %r5102, 31;
- shr.u32 %r5104, %r8467, 30;
- add.s32 %r5105, %r5103, %r5104;
- neg.s32 %r5106, %r5105;
- setp.eq.s32 %p782, %r5099, 0;
- selp.b32 %r8469, %r5105, %r5106, %p782;
- setp.ne.s32 %p783, %r5103, 0;
- xor.b32 %r5107, %r5099, -2147483648;
- selp.b32 %r5108, %r5107, %r5099, %p783;
- selp.b32 %r5109, -1, 0, %p783;
- xor.b32 %r5110, %r5102, %r5109;
- shl.b32 %r5111, %r8468, 2;
- xor.b32 %r5112, %r5111, %r5109;
- cvt.u64.u32 %rd1581, %r5110;
- cvt.u64.u32 %rd1582, %r5112;
- bfi.b64 %rd1583, %rd1581, %rd1582, 32, 32;
- cvt.rn.f64.s64 %fd121, %rd1583;
- mul.f64 %fd122, %fd121, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3558, %fd122;
- setp.eq.s32 %p784, %r5108, 0;
- neg.f32 %f3559, %f3558;
- selp.f32 %f5504, %f3558, %f3559, %p784;
-
-$L__BB0_910:
- and.b32 %r1219, %r8469, 1;
- setp.eq.s32 %p785, %r1219, 0;
- selp.f32 %f1010, %f5504, 0f3F800000, %p785;
- mul.rn.f32 %f1011, %f5504, %f5504;
- mov.f32 %f5505, 0fB94D4153;
- @%p785 bra $L__BB0_912;
-
- mov.f32 %f3562, 0fBAB607ED;
- mov.f32 %f3563, 0f37CBAC00;
- fma.rn.f32 %f5505, %f3563, %f1011, %f3562;
-
-$L__BB0_912:
- selp.f32 %f3564, 0f3C0885E4, 0f3D2AAABB, %p785;
- fma.rn.f32 %f3565, %f5505, %f1011, %f3564;
- selp.f32 %f3566, 0fBE2AAAA8, 0fBEFFFFFF, %p785;
- fma.rn.f32 %f3567, %f3565, %f1011, %f3566;
- mov.f32 %f3568, 0f00000000;
- fma.rn.f32 %f3569, %f1011, %f1010, %f3568;
- fma.rn.f32 %f5506, %f3567, %f3569, %f1010;
- and.b32 %r5114, %r8469, 2;
- setp.eq.s32 %p787, %r5114, 0;
- @%p787 bra $L__BB0_914;
-
- mov.f32 %f3571, 0fBF800000;
- fma.rn.f32 %f5506, %f5506, %f3571, %f3568;
-
-$L__BB0_914:
- mul.f32 %f3572, %f845, 0f3F22F983;
- cvt.rni.s32.f32 %r8473, %f3572;
- cvt.rn.f32.s32 %f3573, %r8473;
- mov.f32 %f3574, 0fBFC90FDA;
- fma.rn.f32 %f3575, %f3573, %f3574, %f845;
- mov.f32 %f3576, 0fB3A22168;
- fma.rn.f32 %f3577, %f3573, %f3576, %f3575;
- mov.f32 %f3578, 0fA7C234C5;
- fma.rn.f32 %f5507, %f3573, %f3578, %f3577;
- abs.f32 %f1018, %f845;
- setp.ltu.f32 %p788, %f1018, 0f47CE4780;
- @%p788 bra $L__BB0_922;
-
- setp.eq.f32 %p789, %f1018, 0f7F800000;
- @%p789 bra $L__BB0_921;
- bra.uni $L__BB0_916;
-
-$L__BB0_921:
- mov.f32 %f3581, 0f00000000;
- mul.rn.f32 %f5507, %f845, %f3581;
- mov.u32 %r8473, 0;
- bra.uni $L__BB0_922;
-
-$L__BB0_916:
- mov.b32 %r1221, %f845;
- shr.u32 %r5116, %r1221, 23;
- and.b32 %r5117, %r5116, 255;
- add.s32 %r1222, %r5117, -128;
- shl.b32 %r5118, %r1221, 8;
- or.b32 %r1223, %r5118, -2147483648;
- shr.u32 %r1224, %r1222, 5;
- mov.u64 %rd2620, 0;
- mov.u32 %r8470, 0;
- mov.u64 %rd1587, __cudart_i2opi_f;
- mov.u64 %rd2621, %rd2620;
-
-$L__BB0_917:
- .pragma "nounroll";
- shl.b64 %rd1586, %rd2620, 2;
- add.s64 %rd1588, %rd1587, %rd1586;
- ld.global.nc.u32 %r5119, [%rd1588];
- mad.wide.u32 %rd1589, %r5119, %r1223, %rd2621;
- shr.u64 %rd2621, %rd1589, 32;
- add.s64 %rd1590, %rd1, %rd1586;
- st.local.u32 [%rd1590], %rd1589;
- add.s32 %r8470, %r8470, 1;
- cvt.s64.s32 %rd2620, %r8470;
- setp.ne.s32 %p790, %r8470, 6;
- @%p790 bra $L__BB0_917;
-
- st.local.u32 [%rd5], %rd2621;
- mov.u32 %r5120, 4;
- sub.s32 %r1227, %r5120, %r1224;
- mov.u32 %r5121, 6;
- sub.s32 %r5122, %r5121, %r1224;
- mul.wide.s32 %rd1591, %r5122, 4;
- add.s64 %rd1592, %rd1, %rd1591;
- ld.local.u32 %r8471, [%rd1592];
- ld.local.u32 %r8472, [%rd1592+-4];
- and.b32 %r1230, %r1222, 31;
- setp.eq.s32 %p791, %r1230, 0;
- @%p791 bra $L__BB0_920;
-
- mov.u32 %r5123, 32;
- sub.s32 %r5124, %r5123, %r1230;
- shr.u32 %r5125, %r8472, %r5124;
- shl.b32 %r5126, %r8471, %r1230;
- add.s32 %r8471, %r5125, %r5126;
- mul.wide.s32 %rd1593, %r1227, 4;
- add.s64 %rd1594, %rd1, %rd1593;
- ld.local.u32 %r5127, [%rd1594];
- shr.u32 %r5128, %r5127, %r5124;
- shl.b32 %r5129, %r8472, %r1230;
- add.s32 %r8472, %r5128, %r5129;
-
-$L__BB0_920:
- and.b32 %r5130, %r1221, -2147483648;
- shr.u32 %r5131, %r8472, 30;
- shl.b32 %r5132, %r8471, 2;
- or.b32 %r5133, %r5131, %r5132;
- shr.u32 %r5134, %r5133, 31;
- shr.u32 %r5135, %r8471, 30;
- add.s32 %r5136, %r5134, %r5135;
- neg.s32 %r5137, %r5136;
- setp.eq.s32 %p792, %r5130, 0;
- selp.b32 %r8473, %r5136, %r5137, %p792;
- setp.ne.s32 %p793, %r5134, 0;
- xor.b32 %r5138, %r5130, -2147483648;
- selp.b32 %r5139, %r5138, %r5130, %p793;
- selp.b32 %r5140, -1, 0, %p793;
- xor.b32 %r5141, %r5133, %r5140;
- shl.b32 %r5142, %r8472, 2;
- xor.b32 %r5143, %r5142, %r5140;
- cvt.u64.u32 %rd1595, %r5141;
- cvt.u64.u32 %rd1596, %r5143;
- bfi.b64 %rd1597, %rd1595, %rd1596, 32, 32;
- cvt.rn.f64.s64 %fd123, %rd1597;
- mul.f64 %fd124, %fd123, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3579, %fd124;
- setp.eq.s32 %p794, %r5139, 0;
- neg.f32 %f3580, %f3579;
- selp.f32 %f5507, %f3579, %f3580, %p794;
-
-$L__BB0_922:
- add.s32 %r1237, %r8473, 1;
- and.b32 %r1238, %r1237, 1;
- setp.eq.s32 %p795, %r1238, 0;
- selp.f32 %f1022, %f5507, 0f3F800000, %p795;
- mul.rn.f32 %f1023, %f5507, %f5507;
- mov.f32 %f5508, 0fB94D4153;
- @%p795 bra $L__BB0_924;
-
- mov.f32 %f3583, 0fBAB607ED;
- mov.f32 %f3584, 0f37CBAC00;
- fma.rn.f32 %f5508, %f3584, %f1023, %f3583;
-
-$L__BB0_924:
- selp.f32 %f3585, 0f3C0885E4, 0f3D2AAABB, %p795;
- fma.rn.f32 %f3586, %f5508, %f1023, %f3585;
- selp.f32 %f3587, 0fBE2AAAA8, 0fBEFFFFFF, %p795;
- fma.rn.f32 %f3588, %f3586, %f1023, %f3587;
- mov.f32 %f3589, 0f00000000;
- fma.rn.f32 %f3590, %f1023, %f1022, %f3589;
- fma.rn.f32 %f5509, %f3588, %f3590, %f1022;
- and.b32 %r5145, %r1237, 2;
- setp.eq.s32 %p797, %r5145, 0;
- @%p797 bra $L__BB0_926;
-
- mov.f32 %f3592, 0fBF800000;
- fma.rn.f32 %f5509, %f5509, %f3592, %f3589;
-
-$L__BB0_926:
- add.f32 %f5517, %f5506, %f5509;
- mul.f32 %f3593, %f854, 0f3F22F983;
- cvt.rni.s32.f32 %r8477, %f3593;
- cvt.rn.f32.s32 %f3594, %r8477;
- mov.f32 %f3595, 0fBFC90FDA;
- fma.rn.f32 %f3596, %f3594, %f3595, %f854;
- mov.f32 %f3597, 0fB3A22168;
- fma.rn.f32 %f3598, %f3594, %f3597, %f3596;
- mov.f32 %f3599, 0fA7C234C5;
- fma.rn.f32 %f5510, %f3594, %f3599, %f3598;
- abs.f32 %f1031, %f854;
- setp.ltu.f32 %p798, %f1031, 0f47CE4780;
- @%p798 bra $L__BB0_934;
-
- setp.eq.f32 %p799, %f1031, 0f7F800000;
- @%p799 bra $L__BB0_933;
- bra.uni $L__BB0_928;
-
-$L__BB0_933:
- mov.f32 %f3602, 0f00000000;
- mul.rn.f32 %f5510, %f854, %f3602;
- mov.u32 %r8477, 0;
- bra.uni $L__BB0_934;
-
-$L__BB0_928:
- mov.b32 %r1240, %f854;
- shr.u32 %r5147, %r1240, 23;
- and.b32 %r5148, %r5147, 255;
- add.s32 %r1241, %r5148, -128;
- shl.b32 %r5149, %r1240, 8;
- or.b32 %r1242, %r5149, -2147483648;
- shr.u32 %r1243, %r1241, 5;
- mov.u64 %rd2622, 0;
- mov.u32 %r8474, 0;
- mov.u64 %rd1601, __cudart_i2opi_f;
- mov.u64 %rd2623, %rd2622;
-
-$L__BB0_929:
- .pragma "nounroll";
- shl.b64 %rd1600, %rd2622, 2;
- add.s64 %rd1602, %rd1601, %rd1600;
- ld.global.nc.u32 %r5150, [%rd1602];
- mad.wide.u32 %rd1603, %r5150, %r1242, %rd2623;
- shr.u64 %rd2623, %rd1603, 32;
- add.s64 %rd1604, %rd1, %rd1600;
- st.local.u32 [%rd1604], %rd1603;
- add.s32 %r8474, %r8474, 1;
- cvt.s64.s32 %rd2622, %r8474;
- setp.ne.s32 %p800, %r8474, 6;
- @%p800 bra $L__BB0_929;
-
- st.local.u32 [%rd5], %rd2623;
- mov.u32 %r5151, 4;
- sub.s32 %r1246, %r5151, %r1243;
- mov.u32 %r5152, 6;
- sub.s32 %r5153, %r5152, %r1243;
- mul.wide.s32 %rd1605, %r5153, 4;
- add.s64 %rd1606, %rd1, %rd1605;
- ld.local.u32 %r8475, [%rd1606];
- ld.local.u32 %r8476, [%rd1606+-4];
- and.b32 %r1249, %r1241, 31;
- setp.eq.s32 %p801, %r1249, 0;
- @%p801 bra $L__BB0_932;
-
- mov.u32 %r5154, 32;
- sub.s32 %r5155, %r5154, %r1249;
- shr.u32 %r5156, %r8476, %r5155;
- shl.b32 %r5157, %r8475, %r1249;
- add.s32 %r8475, %r5156, %r5157;
- mul.wide.s32 %rd1607, %r1246, 4;
- add.s64 %rd1608, %rd1, %rd1607;
- ld.local.u32 %r5158, [%rd1608];
- shr.u32 %r5159, %r5158, %r5155;
- shl.b32 %r5160, %r8476, %r1249;
- add.s32 %r8476, %r5159, %r5160;
-
-$L__BB0_932:
- and.b32 %r5161, %r1240, -2147483648;
- shr.u32 %r5162, %r8476, 30;
- shl.b32 %r5163, %r8475, 2;
- or.b32 %r5164, %r5162, %r5163;
- shr.u32 %r5165, %r5164, 31;
- shr.u32 %r5166, %r8475, 30;
- add.s32 %r5167, %r5165, %r5166;
- neg.s32 %r5168, %r5167;
- setp.eq.s32 %p802, %r5161, 0;
- selp.b32 %r8477, %r5167, %r5168, %p802;
- setp.ne.s32 %p803, %r5165, 0;
- xor.b32 %r5169, %r5161, -2147483648;
- selp.b32 %r5170, %r5169, %r5161, %p803;
- selp.b32 %r5171, -1, 0, %p803;
- xor.b32 %r5172, %r5164, %r5171;
- shl.b32 %r5173, %r8476, 2;
- xor.b32 %r5174, %r5173, %r5171;
- cvt.u64.u32 %rd1609, %r5172;
- cvt.u64.u32 %rd1610, %r5174;
- bfi.b64 %rd1611, %rd1609, %rd1610, 32, 32;
- cvt.rn.f64.s64 %fd125, %rd1611;
- mul.f64 %fd126, %fd125, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3600, %fd126;
- setp.eq.s32 %p804, %r5170, 0;
- neg.f32 %f3601, %f3600;
- selp.f32 %f5510, %f3600, %f3601, %p804;
-
-$L__BB0_934:
- and.b32 %r1256, %r8477, 1;
- setp.eq.s32 %p805, %r1256, 0;
- selp.f32 %f1035, %f5510, 0f3F800000, %p805;
- mul.rn.f32 %f1036, %f5510, %f5510;
- mov.f32 %f5511, 0fB94D4153;
- @%p805 bra $L__BB0_936;
-
- mov.f32 %f3604, 0fBAB607ED;
- mov.f32 %f3605, 0f37CBAC00;
- fma.rn.f32 %f5511, %f3605, %f1036, %f3604;
-
-$L__BB0_936:
- selp.f32 %f3606, 0f3C0885E4, 0f3D2AAABB, %p805;
- fma.rn.f32 %f3607, %f5511, %f1036, %f3606;
- selp.f32 %f3608, 0fBE2AAAA8, 0fBEFFFFFF, %p805;
- fma.rn.f32 %f3609, %f3607, %f1036, %f3608;
- mov.f32 %f3610, 0f00000000;
- fma.rn.f32 %f3611, %f1036, %f1035, %f3610;
- fma.rn.f32 %f5512, %f3609, %f3611, %f1035;
- and.b32 %r5176, %r8477, 2;
- setp.eq.s32 %p807, %r5176, 0;
- @%p807 bra $L__BB0_938;
-
- mov.f32 %f3613, 0fBF800000;
- fma.rn.f32 %f5512, %f5512, %f3613, %f3610;
-
-$L__BB0_938:
- mul.f32 %f3614, %f846, 0f3F22F983;
- cvt.rni.s32.f32 %r8481, %f3614;
- cvt.rn.f32.s32 %f3615, %r8481;
- mov.f32 %f3616, 0fBFC90FDA;
- fma.rn.f32 %f3617, %f3615, %f3616, %f846;
- mov.f32 %f3618, 0fB3A22168;
- fma.rn.f32 %f3619, %f3615, %f3618, %f3617;
- mov.f32 %f3620, 0fA7C234C5;
- fma.rn.f32 %f5513, %f3615, %f3620, %f3619;
- abs.f32 %f1043, %f846;
- setp.ltu.f32 %p808, %f1043, 0f47CE4780;
- @%p808 bra $L__BB0_946;
-
- setp.eq.f32 %p809, %f1043, 0f7F800000;
- @%p809 bra $L__BB0_945;
- bra.uni $L__BB0_940;
-
-$L__BB0_945:
- mov.f32 %f3623, 0f00000000;
- mul.rn.f32 %f5513, %f846, %f3623;
- mov.u32 %r8481, 0;
- bra.uni $L__BB0_946;
-
-$L__BB0_940:
- mov.b32 %r1258, %f846;
- shr.u32 %r5178, %r1258, 23;
- and.b32 %r5179, %r5178, 255;
- add.s32 %r1259, %r5179, -128;
- shl.b32 %r5180, %r1258, 8;
- or.b32 %r1260, %r5180, -2147483648;
- shr.u32 %r1261, %r1259, 5;
- mov.u64 %rd2624, 0;
- mov.u32 %r8478, 0;
- mov.u64 %rd1615, __cudart_i2opi_f;
- mov.u64 %rd2625, %rd2624;
-
-$L__BB0_941:
- .pragma "nounroll";
- shl.b64 %rd1614, %rd2624, 2;
- add.s64 %rd1616, %rd1615, %rd1614;
- ld.global.nc.u32 %r5181, [%rd1616];
- mad.wide.u32 %rd1617, %r5181, %r1260, %rd2625;
- shr.u64 %rd2625, %rd1617, 32;
- add.s64 %rd1618, %rd1, %rd1614;
- st.local.u32 [%rd1618], %rd1617;
- add.s32 %r8478, %r8478, 1;
- cvt.s64.s32 %rd2624, %r8478;
- setp.ne.s32 %p810, %r8478, 6;
- @%p810 bra $L__BB0_941;
-
- st.local.u32 [%rd5], %rd2625;
- mov.u32 %r5182, 4;
- sub.s32 %r1264, %r5182, %r1261;
- mov.u32 %r5183, 6;
- sub.s32 %r5184, %r5183, %r1261;
- mul.wide.s32 %rd1619, %r5184, 4;
- add.s64 %rd1620, %rd1, %rd1619;
- ld.local.u32 %r8479, [%rd1620];
- ld.local.u32 %r8480, [%rd1620+-4];
- and.b32 %r1267, %r1259, 31;
- setp.eq.s32 %p811, %r1267, 0;
- @%p811 bra $L__BB0_944;
-
- mov.u32 %r5185, 32;
- sub.s32 %r5186, %r5185, %r1267;
- shr.u32 %r5187, %r8480, %r5186;
- shl.b32 %r5188, %r8479, %r1267;
- add.s32 %r8479, %r5187, %r5188;
- mul.wide.s32 %rd1621, %r1264, 4;
- add.s64 %rd1622, %rd1, %rd1621;
- ld.local.u32 %r5189, [%rd1622];
- shr.u32 %r5190, %r5189, %r5186;
- shl.b32 %r5191, %r8480, %r1267;
- add.s32 %r8480, %r5190, %r5191;
-
-$L__BB0_944:
- and.b32 %r5192, %r1258, -2147483648;
- shr.u32 %r5193, %r8480, 30;
- shl.b32 %r5194, %r8479, 2;
- or.b32 %r5195, %r5193, %r5194;
- shr.u32 %r5196, %r5195, 31;
- shr.u32 %r5197, %r8479, 30;
- add.s32 %r5198, %r5196, %r5197;
- neg.s32 %r5199, %r5198;
- setp.eq.s32 %p812, %r5192, 0;
- selp.b32 %r8481, %r5198, %r5199, %p812;
- setp.ne.s32 %p813, %r5196, 0;
- xor.b32 %r5200, %r5192, -2147483648;
- selp.b32 %r5201, %r5200, %r5192, %p813;
- selp.b32 %r5202, -1, 0, %p813;
- xor.b32 %r5203, %r5195, %r5202;
- shl.b32 %r5204, %r8480, 2;
- xor.b32 %r5205, %r5204, %r5202;
- cvt.u64.u32 %rd1623, %r5203;
- cvt.u64.u32 %rd1624, %r5205;
- bfi.b64 %rd1625, %rd1623, %rd1624, 32, 32;
- cvt.rn.f64.s64 %fd127, %rd1625;
- mul.f64 %fd128, %fd127, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3621, %fd128;
- setp.eq.s32 %p814, %r5201, 0;
- neg.f32 %f3622, %f3621;
- selp.f32 %f5513, %f3621, %f3622, %p814;
-
-$L__BB0_946:
- add.s32 %r1274, %r8481, 1;
- and.b32 %r1275, %r1274, 1;
- setp.eq.s32 %p815, %r1275, 0;
- selp.f32 %f1047, %f5513, 0f3F800000, %p815;
- mul.rn.f32 %f1048, %f5513, %f5513;
- mov.f32 %f5514, 0fB94D4153;
- @%p815 bra $L__BB0_948;
-
- mov.f32 %f3625, 0fBAB607ED;
- mov.f32 %f3626, 0f37CBAC00;
- fma.rn.f32 %f5514, %f3626, %f1048, %f3625;
-
-$L__BB0_948:
- selp.f32 %f3627, 0f3C0885E4, 0f3D2AAABB, %p815;
- fma.rn.f32 %f3628, %f5514, %f1048, %f3627;
- selp.f32 %f3629, 0fBE2AAAA8, 0fBEFFFFFF, %p815;
- fma.rn.f32 %f3630, %f3628, %f1048, %f3629;
- mov.f32 %f3631, 0f00000000;
- fma.rn.f32 %f3632, %f1048, %f1047, %f3631;
- fma.rn.f32 %f5515, %f3630, %f3632, %f1047;
- and.b32 %r5207, %r1274, 2;
- setp.eq.s32 %p817, %r5207, 0;
- @%p817 bra $L__BB0_950;
-
- mov.f32 %f3634, 0fBF800000;
- fma.rn.f32 %f5515, %f5515, %f3634, %f3631;
-
-$L__BB0_950:
- add.f32 %f5516, %f5512, %f5515;
- bra.uni $L__BB0_951;
-
-$L__BB0_530:
- mov.b32 %r678, %f5348;
- shr.u32 %r4124, %r678, 23;
- and.b32 %r4125, %r4124, 255;
- add.s32 %r679, %r4125, -128;
- shl.b32 %r4126, %r678, 8;
- or.b32 %r680, %r4126, -2147483648;
- shr.u32 %r681, %r679, 5;
- mov.u64 %rd2562, 0;
- mov.u32 %r8354, 0;
- mov.u64 %rd1152, __cudart_i2opi_f;
- mov.u64 %rd2563, %rd2562;
-
-$L__BB0_531:
- .pragma "nounroll";
- shl.b64 %rd1151, %rd2562, 2;
- add.s64 %rd1153, %rd1152, %rd1151;
- ld.global.nc.u32 %r4127, [%rd1153];
- mad.wide.u32 %rd1154, %r4127, %r680, %rd2563;
- shr.u64 %rd2563, %rd1154, 32;
- add.s64 %rd1155, %rd1, %rd1151;
- st.local.u32 [%rd1155], %rd1154;
- add.s32 %r8354, %r8354, 1;
- cvt.s64.s32 %rd2562, %r8354;
- setp.ne.s32 %p462, %r8354, 6;
- @%p462 bra $L__BB0_531;
-
- st.local.u32 [%rd5], %rd2563;
- mov.u32 %r4128, 4;
- sub.s32 %r684, %r4128, %r681;
- mov.u32 %r4129, 6;
- sub.s32 %r4130, %r4129, %r681;
- mul.wide.s32 %rd1156, %r4130, 4;
- add.s64 %rd1157, %rd1, %rd1156;
- ld.local.u32 %r8355, [%rd1157];
- ld.local.u32 %r8356, [%rd1157+-4];
- and.b32 %r687, %r679, 31;
- setp.eq.s32 %p463, %r687, 0;
- @%p463 bra $L__BB0_534;
-
- mov.u32 %r4131, 32;
- sub.s32 %r4132, %r4131, %r687;
- shr.u32 %r4133, %r8356, %r4132;
- shl.b32 %r4134, %r8355, %r687;
- add.s32 %r8355, %r4133, %r4134;
- mul.wide.s32 %rd1158, %r684, 4;
- add.s64 %rd1159, %rd1, %rd1158;
- ld.local.u32 %r4135, [%rd1159];
- shr.u32 %r4136, %r4135, %r4132;
- shl.b32 %r4137, %r8356, %r687;
- add.s32 %r8356, %r4136, %r4137;
-
-$L__BB0_534:
- and.b32 %r4138, %r678, -2147483648;
- shr.u32 %r4139, %r8356, 30;
- shl.b32 %r4140, %r8355, 2;
- or.b32 %r4141, %r4139, %r4140;
- shr.u32 %r4142, %r4141, 31;
- shr.u32 %r4143, %r8355, 30;
- add.s32 %r4144, %r4142, %r4143;
- neg.s32 %r4145, %r4144;
- setp.eq.s32 %p464, %r4138, 0;
- selp.b32 %r8357, %r4144, %r4145, %p464;
- setp.ne.s32 %p465, %r4142, 0;
- xor.b32 %r4146, %r4138, -2147483648;
- selp.b32 %r4147, %r4146, %r4138, %p465;
- selp.b32 %r4148, -1, 0, %p465;
- xor.b32 %r4149, %r4141, %r4148;
- shl.b32 %r4150, %r8356, 2;
- xor.b32 %r4151, %r4150, %r4148;
- cvt.u64.u32 %rd1160, %r4149;
- cvt.u64.u32 %rd1161, %r4151;
- bfi.b64 %rd1162, %rd1160, %rd1161, 32, 32;
- cvt.rn.f64.s64 %fd65, %rd1162;
- mul.f64 %fd66, %fd65, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2955, %fd66;
- setp.eq.s32 %p466, %r4147, 0;
- neg.f32 %f2956, %f2955;
- selp.f32 %f5369, %f2955, %f2956, %p466;
-
-$L__BB0_536:
- and.b32 %r694, %r8357, 1;
- setp.eq.s32 %p467, %r694, 0;
- selp.f32 %f577, %f5369, 0f3F800000, %p467;
- mul.rn.f32 %f578, %f5369, %f5369;
- mov.f32 %f5370, 0fB94D4153;
- @%p467 bra $L__BB0_538;
-
- mov.f32 %f2959, 0fBAB607ED;
- mov.f32 %f2960, 0f37CBAC00;
- fma.rn.f32 %f5370, %f2960, %f578, %f2959;
-
-$L__BB0_538:
- selp.f32 %f2961, 0f3C0885E4, 0f3D2AAABB, %p467;
- fma.rn.f32 %f2962, %f5370, %f578, %f2961;
- selp.f32 %f2963, 0fBE2AAAA8, 0fBEFFFFFF, %p467;
- fma.rn.f32 %f2964, %f2962, %f578, %f2963;
- mov.f32 %f2965, 0f00000000;
- fma.rn.f32 %f2966, %f578, %f577, %f2965;
- fma.rn.f32 %f5213, %f2964, %f2966, %f577;
- and.b32 %r4153, %r8357, 2;
- setp.eq.s32 %p469, %r4153, 0;
- @%p469 bra $L__BB0_540;
-
- mov.f32 %f2968, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f2968, %f2965;
-
-$L__BB0_540:
- setp.lt.s32 %p8, %r14, %r676;
- @%p459 bra $L__BB0_553;
-
- mul.f32 %f2969, %f5531, 0f3F22F983;
- cvt.rni.s32.f32 %r8361, %f2969;
- cvt.rn.f32.s32 %f2970, %r8361;
- mov.f32 %f2971, 0fBFC90FDA;
- fma.rn.f32 %f2972, %f2970, %f2971, %f5531;
- mov.f32 %f2973, 0fB3A22168;
- fma.rn.f32 %f2974, %f2970, %f2973, %f2972;
- mov.f32 %f2975, 0fA7C234C5;
- fma.rn.f32 %f5373, %f2970, %f2975, %f2974;
- abs.f32 %f586, %f5531;
- setp.ltu.f32 %p471, %f586, 0f47CE4780;
- @%p471 bra $L__BB0_549;
-
- setp.eq.f32 %p472, %f586, 0f7F800000;
- @%p472 bra $L__BB0_548;
- bra.uni $L__BB0_543;
-
-$L__BB0_548:
- mov.f32 %f2978, 0f00000000;
- mul.rn.f32 %f5373, %f5531, %f2978;
- mov.u32 %r8361, 0;
- bra.uni $L__BB0_549;
-
-$L__BB0_543:
- mov.b32 %r696, %f5531;
- shr.u32 %r4155, %r696, 23;
- and.b32 %r4156, %r4155, 255;
- add.s32 %r697, %r4156, -128;
- shl.b32 %r4157, %r696, 8;
- or.b32 %r698, %r4157, -2147483648;
- shr.u32 %r699, %r697, 5;
- mov.u64 %rd2564, 0;
- mov.u32 %r8358, 0;
- mov.u64 %rd1166, __cudart_i2opi_f;
- mov.u64 %rd2565, %rd2564;
-
-$L__BB0_544:
- .pragma "nounroll";
- shl.b64 %rd1165, %rd2564, 2;
- add.s64 %rd1167, %rd1166, %rd1165;
- ld.global.nc.u32 %r4158, [%rd1167];
- mad.wide.u32 %rd1168, %r4158, %r698, %rd2565;
- shr.u64 %rd2565, %rd1168, 32;
- add.s64 %rd1169, %rd1, %rd1165;
- st.local.u32 [%rd1169], %rd1168;
- add.s32 %r8358, %r8358, 1;
- cvt.s64.s32 %rd2564, %r8358;
- setp.ne.s32 %p473, %r8358, 6;
- @%p473 bra $L__BB0_544;
-
- st.local.u32 [%rd5], %rd2565;
- mov.u32 %r4159, 4;
- sub.s32 %r702, %r4159, %r699;
- mov.u32 %r4160, 6;
- sub.s32 %r4161, %r4160, %r699;
- mul.wide.s32 %rd1170, %r4161, 4;
- add.s64 %rd1171, %rd1, %rd1170;
- ld.local.u32 %r8359, [%rd1171];
- ld.local.u32 %r8360, [%rd1171+-4];
- and.b32 %r705, %r697, 31;
- setp.eq.s32 %p474, %r705, 0;
- @%p474 bra $L__BB0_547;
-
- mov.u32 %r4162, 32;
- sub.s32 %r4163, %r4162, %r705;
- shr.u32 %r4164, %r8360, %r4163;
- shl.b32 %r4165, %r8359, %r705;
- add.s32 %r8359, %r4164, %r4165;
- mul.wide.s32 %rd1172, %r702, 4;
- add.s64 %rd1173, %rd1, %rd1172;
- ld.local.u32 %r4166, [%rd1173];
- shr.u32 %r4167, %r4166, %r4163;
- shl.b32 %r4168, %r8360, %r705;
- add.s32 %r8360, %r4167, %r4168;
-
-$L__BB0_547:
- and.b32 %r4169, %r696, -2147483648;
- shr.u32 %r4170, %r8360, 30;
- shl.b32 %r4171, %r8359, 2;
- or.b32 %r4172, %r4170, %r4171;
- shr.u32 %r4173, %r4172, 31;
- shr.u32 %r4174, %r8359, 30;
- add.s32 %r4175, %r4173, %r4174;
- neg.s32 %r4176, %r4175;
- setp.eq.s32 %p475, %r4169, 0;
- selp.b32 %r8361, %r4175, %r4176, %p475;
- setp.ne.s32 %p476, %r4173, 0;
- xor.b32 %r4177, %r4169, -2147483648;
- selp.b32 %r4178, %r4177, %r4169, %p476;
- selp.b32 %r4179, -1, 0, %p476;
- xor.b32 %r4180, %r4172, %r4179;
- shl.b32 %r4181, %r8360, 2;
- xor.b32 %r4182, %r4181, %r4179;
- cvt.u64.u32 %rd1174, %r4180;
- cvt.u64.u32 %rd1175, %r4182;
- bfi.b64 %rd1176, %rd1174, %rd1175, 32, 32;
- cvt.rn.f64.s64 %fd67, %rd1176;
- mul.f64 %fd68, %fd67, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2976, %fd68;
- setp.eq.s32 %p477, %r4178, 0;
- neg.f32 %f2977, %f2976;
- selp.f32 %f5373, %f2976, %f2977, %p477;
-
-$L__BB0_549:
- add.s32 %r712, %r8361, 1;
- and.b32 %r713, %r712, 1;
- setp.eq.s32 %p478, %r713, 0;
- selp.f32 %f590, %f5373, 0f3F800000, %p478;
- mul.rn.f32 %f591, %f5373, %f5373;
- mov.f32 %f5374, 0fB94D4153;
- @%p478 bra $L__BB0_551;
-
- mov.f32 %f2980, 0fBAB607ED;
- mov.f32 %f2981, 0f37CBAC00;
- fma.rn.f32 %f5374, %f2981, %f591, %f2980;
-
-$L__BB0_551:
- selp.f32 %f2982, 0f3C0885E4, 0f3D2AAABB, %p478;
- fma.rn.f32 %f2983, %f5374, %f591, %f2982;
- selp.f32 %f2984, 0fBE2AAAA8, 0fBEFFFFFF, %p478;
- fma.rn.f32 %f2985, %f2983, %f591, %f2984;
- mov.f32 %f2986, 0f00000000;
- fma.rn.f32 %f2987, %f591, %f590, %f2986;
- fma.rn.f32 %f5215, %f2985, %f2987, %f590;
- and.b32 %r4184, %r712, 2;
- setp.eq.s32 %p480, %r4184, 0;
- @%p480 bra $L__BB0_553;
-
- mov.f32 %f2989, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f2989, %f2986;
-
-$L__BB0_553:
- selp.f32 %f598, %f5215, %f5216, %p8;
- selp.f32 %f599, %f5213, %f5214, %p8;
- @%p459 bra $L__BB0_555;
-
- add.f32 %f5523, %f599, %f598;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_555:
- @%p426 bra $L__BB0_584;
-
- shl.b32 %r4186, %r12, 5;
- mov.u32 %r4187, -32;
- sub.s32 %r714, %r4187, %r4186;
- setp.ge.s32 %p484, %r14, %r714;
- @%p484 bra $L__BB0_569;
-
- mul.f32 %f2992, %f5347, 0f3F22F983;
- cvt.rni.s32.f32 %r8365, %f2992;
- cvt.rn.f32.s32 %f2993, %r8365;
- mov.f32 %f2994, 0fBFC90FDA;
- fma.rn.f32 %f2995, %f2993, %f2994, %f5347;
- mov.f32 %f2996, 0fB3A22168;
- fma.rn.f32 %f2997, %f2993, %f2996, %f2995;
- mov.f32 %f2998, 0fA7C234C5;
- fma.rn.f32 %f5382, %f2993, %f2998, %f2997;
- abs.f32 %f607, %f5347;
- setp.ltu.f32 %p485, %f607, 0f47CE4780;
- @%p485 bra $L__BB0_565;
-
- setp.eq.f32 %p486, %f607, 0f7F800000;
- @%p486 bra $L__BB0_564;
- bra.uni $L__BB0_559;
-
-$L__BB0_564:
- mov.f32 %f3001, 0f00000000;
- mul.rn.f32 %f5382, %f5347, %f3001;
- mov.u32 %r8365, 0;
- bra.uni $L__BB0_565;
-
-$L__BB0_559:
- mov.b32 %r716, %f5347;
- shr.u32 %r4189, %r716, 23;
- and.b32 %r4190, %r4189, 255;
- add.s32 %r717, %r4190, -128;
- shl.b32 %r4191, %r716, 8;
- or.b32 %r718, %r4191, -2147483648;
- shr.u32 %r719, %r717, 5;
- mov.u64 %rd2566, 0;
- mov.u32 %r8362, 0;
- mov.u64 %rd1180, __cudart_i2opi_f;
- mov.u64 %rd2567, %rd2566;
-
-$L__BB0_560:
- .pragma "nounroll";
- shl.b64 %rd1179, %rd2566, 2;
- add.s64 %rd1181, %rd1180, %rd1179;
- ld.global.nc.u32 %r4192, [%rd1181];
- mad.wide.u32 %rd1182, %r4192, %r718, %rd2567;
- shr.u64 %rd2567, %rd1182, 32;
- add.s64 %rd1183, %rd1, %rd1179;
- st.local.u32 [%rd1183], %rd1182;
- add.s32 %r8362, %r8362, 1;
- cvt.s64.s32 %rd2566, %r8362;
- setp.ne.s32 %p487, %r8362, 6;
- @%p487 bra $L__BB0_560;
-
- st.local.u32 [%rd5], %rd2567;
- mov.u32 %r4193, 4;
- sub.s32 %r722, %r4193, %r719;
- mov.u32 %r4194, 6;
- sub.s32 %r4195, %r4194, %r719;
- mul.wide.s32 %rd1184, %r4195, 4;
- add.s64 %rd1185, %rd1, %rd1184;
- ld.local.u32 %r8363, [%rd1185];
- ld.local.u32 %r8364, [%rd1185+-4];
- and.b32 %r725, %r717, 31;
- setp.eq.s32 %p488, %r725, 0;
- @%p488 bra $L__BB0_563;
-
- mov.u32 %r4196, 32;
- sub.s32 %r4197, %r4196, %r725;
- shr.u32 %r4198, %r8364, %r4197;
- shl.b32 %r4199, %r8363, %r725;
- add.s32 %r8363, %r4198, %r4199;
- mul.wide.s32 %rd1186, %r722, 4;
- add.s64 %rd1187, %rd1, %rd1186;
- ld.local.u32 %r4200, [%rd1187];
- shr.u32 %r4201, %r4200, %r4197;
- shl.b32 %r4202, %r8364, %r725;
- add.s32 %r8364, %r4201, %r4202;
-
-$L__BB0_563:
- and.b32 %r4203, %r716, -2147483648;
- shr.u32 %r4204, %r8364, 30;
- shl.b32 %r4205, %r8363, 2;
- or.b32 %r4206, %r4204, %r4205;
- shr.u32 %r4207, %r4206, 31;
- shr.u32 %r4208, %r8363, 30;
- add.s32 %r4209, %r4207, %r4208;
- neg.s32 %r4210, %r4209;
- setp.eq.s32 %p489, %r4203, 0;
- selp.b32 %r8365, %r4209, %r4210, %p489;
- setp.ne.s32 %p490, %r4207, 0;
- xor.b32 %r4211, %r4203, -2147483648;
- selp.b32 %r4212, %r4211, %r4203, %p490;
- selp.b32 %r4213, -1, 0, %p490;
- xor.b32 %r4214, %r4206, %r4213;
- shl.b32 %r4215, %r8364, 2;
- xor.b32 %r4216, %r4215, %r4213;
- cvt.u64.u32 %rd1188, %r4214;
- cvt.u64.u32 %rd1189, %r4216;
- bfi.b64 %rd1190, %rd1188, %rd1189, 32, 32;
- cvt.rn.f64.s64 %fd69, %rd1190;
- mul.f64 %fd70, %fd69, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f2999, %fd70;
- setp.eq.s32 %p491, %r4212, 0;
- neg.f32 %f3000, %f2999;
- selp.f32 %f5382, %f2999, %f3000, %p491;
-
-$L__BB0_565:
- and.b32 %r732, %r8365, 1;
- setp.eq.s32 %p492, %r732, 0;
- selp.f32 %f611, %f5382, 0f3F800000, %p492;
- mul.rn.f32 %f612, %f5382, %f5382;
- mov.f32 %f5383, 0fB94D4153;
- @%p492 bra $L__BB0_567;
-
- mov.f32 %f3003, 0fBAB607ED;
- mov.f32 %f3004, 0f37CBAC00;
- fma.rn.f32 %f5383, %f3004, %f612, %f3003;
-
-$L__BB0_567:
- selp.f32 %f3005, 0f3C0885E4, 0f3D2AAABB, %p492;
- fma.rn.f32 %f3006, %f5383, %f612, %f3005;
- selp.f32 %f3007, 0fBE2AAAA8, 0fBEFFFFFF, %p492;
- fma.rn.f32 %f3008, %f3006, %f612, %f3007;
- mov.f32 %f3009, 0f00000000;
- fma.rn.f32 %f3010, %f612, %f611, %f3009;
- fma.rn.f32 %f5213, %f3008, %f3010, %f611;
- and.b32 %r4218, %r8365, 2;
- setp.eq.s32 %p494, %r4218, 0;
- @%p494 bra $L__BB0_569;
-
- mov.f32 %f3012, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3012, %f3009;
-
-$L__BB0_569:
- setp.lt.s32 %p9, %r14, %r714;
- @%p484 bra $L__BB0_582;
-
- mul.f32 %f3013, %f5339, 0f3F22F983;
- cvt.rni.s32.f32 %r8369, %f3013;
- cvt.rn.f32.s32 %f3014, %r8369;
- mov.f32 %f3015, 0fBFC90FDA;
- fma.rn.f32 %f3016, %f3014, %f3015, %f5339;
- mov.f32 %f3017, 0fB3A22168;
- fma.rn.f32 %f3018, %f3014, %f3017, %f3016;
- mov.f32 %f3019, 0fA7C234C5;
- fma.rn.f32 %f5386, %f3014, %f3019, %f3018;
- abs.f32 %f620, %f5339;
- setp.ltu.f32 %p496, %f620, 0f47CE4780;
- @%p496 bra $L__BB0_578;
-
- setp.eq.f32 %p497, %f620, 0f7F800000;
- @%p497 bra $L__BB0_577;
- bra.uni $L__BB0_572;
-
-$L__BB0_577:
- mov.f32 %f3022, 0f00000000;
- mul.rn.f32 %f5386, %f5339, %f3022;
- mov.u32 %r8369, 0;
- bra.uni $L__BB0_578;
-
-$L__BB0_572:
- mov.b32 %r734, %f5339;
- shr.u32 %r4220, %r734, 23;
- and.b32 %r4221, %r4220, 255;
- add.s32 %r735, %r4221, -128;
- shl.b32 %r4222, %r734, 8;
- or.b32 %r736, %r4222, -2147483648;
- shr.u32 %r737, %r735, 5;
- mov.u64 %rd2568, 0;
- mov.u32 %r8366, 0;
- mov.u64 %rd1194, __cudart_i2opi_f;
- mov.u64 %rd2569, %rd2568;
-
-$L__BB0_573:
- .pragma "nounroll";
- shl.b64 %rd1193, %rd2568, 2;
- add.s64 %rd1195, %rd1194, %rd1193;
- ld.global.nc.u32 %r4223, [%rd1195];
- mad.wide.u32 %rd1196, %r4223, %r736, %rd2569;
- shr.u64 %rd2569, %rd1196, 32;
- add.s64 %rd1197, %rd1, %rd1193;
- st.local.u32 [%rd1197], %rd1196;
- add.s32 %r8366, %r8366, 1;
- cvt.s64.s32 %rd2568, %r8366;
- setp.ne.s32 %p498, %r8366, 6;
- @%p498 bra $L__BB0_573;
-
- st.local.u32 [%rd5], %rd2569;
- mov.u32 %r4224, 4;
- sub.s32 %r740, %r4224, %r737;
- mov.u32 %r4225, 6;
- sub.s32 %r4226, %r4225, %r737;
- mul.wide.s32 %rd1198, %r4226, 4;
- add.s64 %rd1199, %rd1, %rd1198;
- ld.local.u32 %r8367, [%rd1199];
- ld.local.u32 %r8368, [%rd1199+-4];
- and.b32 %r743, %r735, 31;
- setp.eq.s32 %p499, %r743, 0;
- @%p499 bra $L__BB0_576;
-
- mov.u32 %r4227, 32;
- sub.s32 %r4228, %r4227, %r743;
- shr.u32 %r4229, %r8368, %r4228;
- shl.b32 %r4230, %r8367, %r743;
- add.s32 %r8367, %r4229, %r4230;
- mul.wide.s32 %rd1200, %r740, 4;
- add.s64 %rd1201, %rd1, %rd1200;
- ld.local.u32 %r4231, [%rd1201];
- shr.u32 %r4232, %r4231, %r4228;
- shl.b32 %r4233, %r8368, %r743;
- add.s32 %r8368, %r4232, %r4233;
-
-$L__BB0_576:
- and.b32 %r4234, %r734, -2147483648;
- shr.u32 %r4235, %r8368, 30;
- shl.b32 %r4236, %r8367, 2;
- or.b32 %r4237, %r4235, %r4236;
- shr.u32 %r4238, %r4237, 31;
- shr.u32 %r4239, %r8367, 30;
- add.s32 %r4240, %r4238, %r4239;
- neg.s32 %r4241, %r4240;
- setp.eq.s32 %p500, %r4234, 0;
- selp.b32 %r8369, %r4240, %r4241, %p500;
- setp.ne.s32 %p501, %r4238, 0;
- xor.b32 %r4242, %r4234, -2147483648;
- selp.b32 %r4243, %r4242, %r4234, %p501;
- selp.b32 %r4244, -1, 0, %p501;
- xor.b32 %r4245, %r4237, %r4244;
- shl.b32 %r4246, %r8368, 2;
- xor.b32 %r4247, %r4246, %r4244;
- cvt.u64.u32 %rd1202, %r4245;
- cvt.u64.u32 %rd1203, %r4247;
- bfi.b64 %rd1204, %rd1202, %rd1203, 32, 32;
- cvt.rn.f64.s64 %fd71, %rd1204;
- mul.f64 %fd72, %fd71, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3020, %fd72;
- setp.eq.s32 %p502, %r4243, 0;
- neg.f32 %f3021, %f3020;
- selp.f32 %f5386, %f3020, %f3021, %p502;
-
-$L__BB0_578:
- add.s32 %r750, %r8369, 1;
- and.b32 %r751, %r750, 1;
- setp.eq.s32 %p503, %r751, 0;
- selp.f32 %f624, %f5386, 0f3F800000, %p503;
- mul.rn.f32 %f625, %f5386, %f5386;
- mov.f32 %f5387, 0fB94D4153;
- @%p503 bra $L__BB0_580;
-
- mov.f32 %f3024, 0fBAB607ED;
- mov.f32 %f3025, 0f37CBAC00;
- fma.rn.f32 %f5387, %f3025, %f625, %f3024;
-
-$L__BB0_580:
- selp.f32 %f3026, 0f3C0885E4, 0f3D2AAABB, %p503;
- fma.rn.f32 %f3027, %f5387, %f625, %f3026;
- selp.f32 %f3028, 0fBE2AAAA8, 0fBEFFFFFF, %p503;
- fma.rn.f32 %f3029, %f3027, %f625, %f3028;
- mov.f32 %f3030, 0f00000000;
- fma.rn.f32 %f3031, %f625, %f624, %f3030;
- fma.rn.f32 %f5215, %f3029, %f3031, %f624;
- and.b32 %r4249, %r750, 2;
- setp.eq.s32 %p505, %r4249, 0;
- @%p505 bra $L__BB0_582;
-
- mov.f32 %f3033, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3033, %f3030;
-
-$L__BB0_582:
- selp.f32 %f632, %f5215, %f5216, %p9;
- selp.f32 %f633, %f5213, %f5214, %p9;
- @%p484 bra $L__BB0_584;
-
- add.f32 %f5522, %f633, %f632;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_584:
- @%p430 bra $L__BB0_613;
-
- shl.b32 %r4251, %r12, 5;
- neg.s32 %r752, %r4251;
- setp.ge.s32 %p509, %r14, %r752;
- @%p509 bra $L__BB0_598;
-
- mul.f32 %f3036, %f5346, 0f3F22F983;
- cvt.rni.s32.f32 %r8373, %f3036;
- cvt.rn.f32.s32 %f3037, %r8373;
- mov.f32 %f3038, 0fBFC90FDA;
- fma.rn.f32 %f3039, %f3037, %f3038, %f5346;
- mov.f32 %f3040, 0fB3A22168;
- fma.rn.f32 %f3041, %f3037, %f3040, %f3039;
- mov.f32 %f3042, 0fA7C234C5;
- fma.rn.f32 %f5395, %f3037, %f3042, %f3041;
- abs.f32 %f641, %f5346;
- setp.ltu.f32 %p510, %f641, 0f47CE4780;
- @%p510 bra $L__BB0_594;
-
- setp.eq.f32 %p511, %f641, 0f7F800000;
- @%p511 bra $L__BB0_593;
- bra.uni $L__BB0_588;
-
-$L__BB0_593:
- mov.f32 %f3045, 0f00000000;
- mul.rn.f32 %f5395, %f5346, %f3045;
- mov.u32 %r8373, 0;
- bra.uni $L__BB0_594;
-
-$L__BB0_588:
- mov.b32 %r754, %f5346;
- shr.u32 %r4253, %r754, 23;
- and.b32 %r4254, %r4253, 255;
- add.s32 %r755, %r4254, -128;
- shl.b32 %r4255, %r754, 8;
- or.b32 %r756, %r4255, -2147483648;
- shr.u32 %r757, %r755, 5;
- mov.u64 %rd2570, 0;
- mov.u32 %r8370, 0;
- mov.u64 %rd1208, __cudart_i2opi_f;
- mov.u64 %rd2571, %rd2570;
-
-$L__BB0_589:
- .pragma "nounroll";
- shl.b64 %rd1207, %rd2570, 2;
- add.s64 %rd1209, %rd1208, %rd1207;
- ld.global.nc.u32 %r4256, [%rd1209];
- mad.wide.u32 %rd1210, %r4256, %r756, %rd2571;
- shr.u64 %rd2571, %rd1210, 32;
- add.s64 %rd1211, %rd1, %rd1207;
- st.local.u32 [%rd1211], %rd1210;
- add.s32 %r8370, %r8370, 1;
- cvt.s64.s32 %rd2570, %r8370;
- setp.ne.s32 %p512, %r8370, 6;
- @%p512 bra $L__BB0_589;
-
- st.local.u32 [%rd5], %rd2571;
- mov.u32 %r4257, 4;
- sub.s32 %r760, %r4257, %r757;
- mov.u32 %r4258, 6;
- sub.s32 %r4259, %r4258, %r757;
- mul.wide.s32 %rd1212, %r4259, 4;
- add.s64 %rd1213, %rd1, %rd1212;
- ld.local.u32 %r8371, [%rd1213];
- ld.local.u32 %r8372, [%rd1213+-4];
- and.b32 %r763, %r755, 31;
- setp.eq.s32 %p513, %r763, 0;
- @%p513 bra $L__BB0_592;
-
- mov.u32 %r4260, 32;
- sub.s32 %r4261, %r4260, %r763;
- shr.u32 %r4262, %r8372, %r4261;
- shl.b32 %r4263, %r8371, %r763;
- add.s32 %r8371, %r4262, %r4263;
- mul.wide.s32 %rd1214, %r760, 4;
- add.s64 %rd1215, %rd1, %rd1214;
- ld.local.u32 %r4264, [%rd1215];
- shr.u32 %r4265, %r4264, %r4261;
- shl.b32 %r4266, %r8372, %r763;
- add.s32 %r8372, %r4265, %r4266;
-
-$L__BB0_592:
- and.b32 %r4267, %r754, -2147483648;
- shr.u32 %r4268, %r8372, 30;
- shl.b32 %r4269, %r8371, 2;
- or.b32 %r4270, %r4268, %r4269;
- shr.u32 %r4271, %r4270, 31;
- shr.u32 %r4272, %r8371, 30;
- add.s32 %r4273, %r4271, %r4272;
- neg.s32 %r4274, %r4273;
- setp.eq.s32 %p514, %r4267, 0;
- selp.b32 %r8373, %r4273, %r4274, %p514;
- setp.ne.s32 %p515, %r4271, 0;
- xor.b32 %r4275, %r4267, -2147483648;
- selp.b32 %r4276, %r4275, %r4267, %p515;
- selp.b32 %r4277, -1, 0, %p515;
- xor.b32 %r4278, %r4270, %r4277;
- shl.b32 %r4279, %r8372, 2;
- xor.b32 %r4280, %r4279, %r4277;
- cvt.u64.u32 %rd1216, %r4278;
- cvt.u64.u32 %rd1217, %r4280;
- bfi.b64 %rd1218, %rd1216, %rd1217, 32, 32;
- cvt.rn.f64.s64 %fd73, %rd1218;
- mul.f64 %fd74, %fd73, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3043, %fd74;
- setp.eq.s32 %p516, %r4276, 0;
- neg.f32 %f3044, %f3043;
- selp.f32 %f5395, %f3043, %f3044, %p516;
-
-$L__BB0_594:
- and.b32 %r770, %r8373, 1;
- setp.eq.s32 %p517, %r770, 0;
- selp.f32 %f645, %f5395, 0f3F800000, %p517;
- mul.rn.f32 %f646, %f5395, %f5395;
- mov.f32 %f5396, 0fB94D4153;
- @%p517 bra $L__BB0_596;
-
- mov.f32 %f3047, 0fBAB607ED;
- mov.f32 %f3048, 0f37CBAC00;
- fma.rn.f32 %f5396, %f3048, %f646, %f3047;
-
-$L__BB0_596:
- selp.f32 %f3049, 0f3C0885E4, 0f3D2AAABB, %p517;
- fma.rn.f32 %f3050, %f5396, %f646, %f3049;
- selp.f32 %f3051, 0fBE2AAAA8, 0fBEFFFFFF, %p517;
- fma.rn.f32 %f3052, %f3050, %f646, %f3051;
- mov.f32 %f3053, 0f00000000;
- fma.rn.f32 %f3054, %f646, %f645, %f3053;
- fma.rn.f32 %f5213, %f3052, %f3054, %f645;
- and.b32 %r4282, %r8373, 2;
- setp.eq.s32 %p519, %r4282, 0;
- @%p519 bra $L__BB0_598;
-
- mov.f32 %f3056, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3056, %f3053;
-
-$L__BB0_598:
- setp.lt.s32 %p10, %r14, %r752;
- @%p509 bra $L__BB0_611;
-
- mul.f32 %f3057, %f5338, 0f3F22F983;
- cvt.rni.s32.f32 %r8377, %f3057;
- cvt.rn.f32.s32 %f3058, %r8377;
- mov.f32 %f3059, 0fBFC90FDA;
- fma.rn.f32 %f3060, %f3058, %f3059, %f5338;
- mov.f32 %f3061, 0fB3A22168;
- fma.rn.f32 %f3062, %f3058, %f3061, %f3060;
- mov.f32 %f3063, 0fA7C234C5;
- fma.rn.f32 %f5399, %f3058, %f3063, %f3062;
- abs.f32 %f654, %f5338;
- setp.ltu.f32 %p521, %f654, 0f47CE4780;
- @%p521 bra $L__BB0_607;
-
- setp.eq.f32 %p522, %f654, 0f7F800000;
- @%p522 bra $L__BB0_606;
- bra.uni $L__BB0_601;
-
-$L__BB0_606:
- mov.f32 %f3066, 0f00000000;
- mul.rn.f32 %f5399, %f5338, %f3066;
- mov.u32 %r8377, 0;
- bra.uni $L__BB0_607;
-
-$L__BB0_601:
- mov.b32 %r772, %f5338;
- shr.u32 %r4284, %r772, 23;
- and.b32 %r4285, %r4284, 255;
- add.s32 %r773, %r4285, -128;
- shl.b32 %r4286, %r772, 8;
- or.b32 %r774, %r4286, -2147483648;
- shr.u32 %r775, %r773, 5;
- mov.u64 %rd2572, 0;
- mov.u32 %r8374, 0;
- mov.u64 %rd1222, __cudart_i2opi_f;
- mov.u64 %rd2573, %rd2572;
-
-$L__BB0_602:
- .pragma "nounroll";
- shl.b64 %rd1221, %rd2572, 2;
- add.s64 %rd1223, %rd1222, %rd1221;
- ld.global.nc.u32 %r4287, [%rd1223];
- mad.wide.u32 %rd1224, %r4287, %r774, %rd2573;
- shr.u64 %rd2573, %rd1224, 32;
- add.s64 %rd1225, %rd1, %rd1221;
- st.local.u32 [%rd1225], %rd1224;
- add.s32 %r8374, %r8374, 1;
- cvt.s64.s32 %rd2572, %r8374;
- setp.ne.s32 %p523, %r8374, 6;
- @%p523 bra $L__BB0_602;
-
- st.local.u32 [%rd5], %rd2573;
- mov.u32 %r4288, 4;
- sub.s32 %r778, %r4288, %r775;
- mov.u32 %r4289, 6;
- sub.s32 %r4290, %r4289, %r775;
- mul.wide.s32 %rd1226, %r4290, 4;
- add.s64 %rd1227, %rd1, %rd1226;
- ld.local.u32 %r8375, [%rd1227];
- ld.local.u32 %r8376, [%rd1227+-4];
- and.b32 %r781, %r773, 31;
- setp.eq.s32 %p524, %r781, 0;
- @%p524 bra $L__BB0_605;
-
- mov.u32 %r4291, 32;
- sub.s32 %r4292, %r4291, %r781;
- shr.u32 %r4293, %r8376, %r4292;
- shl.b32 %r4294, %r8375, %r781;
- add.s32 %r8375, %r4293, %r4294;
- mul.wide.s32 %rd1228, %r778, 4;
- add.s64 %rd1229, %rd1, %rd1228;
- ld.local.u32 %r4295, [%rd1229];
- shr.u32 %r4296, %r4295, %r4292;
- shl.b32 %r4297, %r8376, %r781;
- add.s32 %r8376, %r4296, %r4297;
-
-$L__BB0_605:
- and.b32 %r4298, %r772, -2147483648;
- shr.u32 %r4299, %r8376, 30;
- shl.b32 %r4300, %r8375, 2;
- or.b32 %r4301, %r4299, %r4300;
- shr.u32 %r4302, %r4301, 31;
- shr.u32 %r4303, %r8375, 30;
- add.s32 %r4304, %r4302, %r4303;
- neg.s32 %r4305, %r4304;
- setp.eq.s32 %p525, %r4298, 0;
- selp.b32 %r8377, %r4304, %r4305, %p525;
- setp.ne.s32 %p526, %r4302, 0;
- xor.b32 %r4306, %r4298, -2147483648;
- selp.b32 %r4307, %r4306, %r4298, %p526;
- selp.b32 %r4308, -1, 0, %p526;
- xor.b32 %r4309, %r4301, %r4308;
- shl.b32 %r4310, %r8376, 2;
- xor.b32 %r4311, %r4310, %r4308;
- cvt.u64.u32 %rd1230, %r4309;
- cvt.u64.u32 %rd1231, %r4311;
- bfi.b64 %rd1232, %rd1230, %rd1231, 32, 32;
- cvt.rn.f64.s64 %fd75, %rd1232;
- mul.f64 %fd76, %fd75, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3064, %fd76;
- setp.eq.s32 %p527, %r4307, 0;
- neg.f32 %f3065, %f3064;
- selp.f32 %f5399, %f3064, %f3065, %p527;
-
-$L__BB0_607:
- add.s32 %r788, %r8377, 1;
- and.b32 %r789, %r788, 1;
- setp.eq.s32 %p528, %r789, 0;
- selp.f32 %f658, %f5399, 0f3F800000, %p528;
- mul.rn.f32 %f659, %f5399, %f5399;
- mov.f32 %f5400, 0fB94D4153;
- @%p528 bra $L__BB0_609;
-
- mov.f32 %f3068, 0fBAB607ED;
- mov.f32 %f3069, 0f37CBAC00;
- fma.rn.f32 %f5400, %f3069, %f659, %f3068;
-
-$L__BB0_609:
- selp.f32 %f3070, 0f3C0885E4, 0f3D2AAABB, %p528;
- fma.rn.f32 %f3071, %f5400, %f659, %f3070;
- selp.f32 %f3072, 0fBE2AAAA8, 0fBEFFFFFF, %p528;
- fma.rn.f32 %f3073, %f3071, %f659, %f3072;
- mov.f32 %f3074, 0f00000000;
- fma.rn.f32 %f3075, %f659, %f658, %f3074;
- fma.rn.f32 %f5215, %f3073, %f3075, %f658;
- and.b32 %r4313, %r788, 2;
- setp.eq.s32 %p530, %r4313, 0;
- @%p530 bra $L__BB0_611;
-
- mov.f32 %f3077, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3077, %f3074;
-
-$L__BB0_611:
- selp.f32 %f666, %f5215, %f5216, %p10;
- selp.f32 %f667, %f5213, %f5214, %p10;
- @%p509 bra $L__BB0_613;
-
- add.f32 %f5521, %f667, %f666;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_613:
- @%p430 bra $L__BB0_642;
-
- shl.b32 %r4315, %r12, 5;
- mov.u32 %r4316, -32;
- sub.s32 %r790, %r4316, %r4315;
- setp.ge.s32 %p534, %r14, %r790;
- @%p534 bra $L__BB0_627;
-
- mul.f32 %f3080, %f5345, 0f3F22F983;
- cvt.rni.s32.f32 %r8381, %f3080;
- cvt.rn.f32.s32 %f3081, %r8381;
- mov.f32 %f3082, 0fBFC90FDA;
- fma.rn.f32 %f3083, %f3081, %f3082, %f5345;
- mov.f32 %f3084, 0fB3A22168;
- fma.rn.f32 %f3085, %f3081, %f3084, %f3083;
- mov.f32 %f3086, 0fA7C234C5;
- fma.rn.f32 %f5408, %f3081, %f3086, %f3085;
- abs.f32 %f675, %f5345;
- setp.ltu.f32 %p535, %f675, 0f47CE4780;
- @%p535 bra $L__BB0_623;
-
- setp.eq.f32 %p536, %f675, 0f7F800000;
- @%p536 bra $L__BB0_622;
- bra.uni $L__BB0_617;
-
-$L__BB0_622:
- mov.f32 %f3089, 0f00000000;
- mul.rn.f32 %f5408, %f5345, %f3089;
- mov.u32 %r8381, 0;
- bra.uni $L__BB0_623;
-
-$L__BB0_617:
- mov.b32 %r792, %f5345;
- shr.u32 %r4318, %r792, 23;
- and.b32 %r4319, %r4318, 255;
- add.s32 %r793, %r4319, -128;
- shl.b32 %r4320, %r792, 8;
- or.b32 %r794, %r4320, -2147483648;
- shr.u32 %r795, %r793, 5;
- mov.u64 %rd2574, 0;
- mov.u32 %r8378, 0;
- mov.u64 %rd1236, __cudart_i2opi_f;
- mov.u64 %rd2575, %rd2574;
-
-$L__BB0_618:
- .pragma "nounroll";
- shl.b64 %rd1235, %rd2574, 2;
- add.s64 %rd1237, %rd1236, %rd1235;
- ld.global.nc.u32 %r4321, [%rd1237];
- mad.wide.u32 %rd1238, %r4321, %r794, %rd2575;
- shr.u64 %rd2575, %rd1238, 32;
- add.s64 %rd1239, %rd1, %rd1235;
- st.local.u32 [%rd1239], %rd1238;
- add.s32 %r8378, %r8378, 1;
- cvt.s64.s32 %rd2574, %r8378;
- setp.ne.s32 %p537, %r8378, 6;
- @%p537 bra $L__BB0_618;
-
- st.local.u32 [%rd5], %rd2575;
- mov.u32 %r4322, 4;
- sub.s32 %r798, %r4322, %r795;
- mov.u32 %r4323, 6;
- sub.s32 %r4324, %r4323, %r795;
- mul.wide.s32 %rd1240, %r4324, 4;
- add.s64 %rd1241, %rd1, %rd1240;
- ld.local.u32 %r8379, [%rd1241];
- ld.local.u32 %r8380, [%rd1241+-4];
- and.b32 %r801, %r793, 31;
- setp.eq.s32 %p538, %r801, 0;
- @%p538 bra $L__BB0_621;
-
- mov.u32 %r4325, 32;
- sub.s32 %r4326, %r4325, %r801;
- shr.u32 %r4327, %r8380, %r4326;
- shl.b32 %r4328, %r8379, %r801;
- add.s32 %r8379, %r4327, %r4328;
- mul.wide.s32 %rd1242, %r798, 4;
- add.s64 %rd1243, %rd1, %rd1242;
- ld.local.u32 %r4329, [%rd1243];
- shr.u32 %r4330, %r4329, %r4326;
- shl.b32 %r4331, %r8380, %r801;
- add.s32 %r8380, %r4330, %r4331;
-
-$L__BB0_621:
- and.b32 %r4332, %r792, -2147483648;
- shr.u32 %r4333, %r8380, 30;
- shl.b32 %r4334, %r8379, 2;
- or.b32 %r4335, %r4333, %r4334;
- shr.u32 %r4336, %r4335, 31;
- shr.u32 %r4337, %r8379, 30;
- add.s32 %r4338, %r4336, %r4337;
- neg.s32 %r4339, %r4338;
- setp.eq.s32 %p539, %r4332, 0;
- selp.b32 %r8381, %r4338, %r4339, %p539;
- setp.ne.s32 %p540, %r4336, 0;
- xor.b32 %r4340, %r4332, -2147483648;
- selp.b32 %r4341, %r4340, %r4332, %p540;
- selp.b32 %r4342, -1, 0, %p540;
- xor.b32 %r4343, %r4335, %r4342;
- shl.b32 %r4344, %r8380, 2;
- xor.b32 %r4345, %r4344, %r4342;
- cvt.u64.u32 %rd1244, %r4343;
- cvt.u64.u32 %rd1245, %r4345;
- bfi.b64 %rd1246, %rd1244, %rd1245, 32, 32;
- cvt.rn.f64.s64 %fd77, %rd1246;
- mul.f64 %fd78, %fd77, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3087, %fd78;
- setp.eq.s32 %p541, %r4341, 0;
- neg.f32 %f3088, %f3087;
- selp.f32 %f5408, %f3087, %f3088, %p541;
-
-$L__BB0_623:
- and.b32 %r808, %r8381, 1;
- setp.eq.s32 %p542, %r808, 0;
- selp.f32 %f679, %f5408, 0f3F800000, %p542;
- mul.rn.f32 %f680, %f5408, %f5408;
- mov.f32 %f5409, 0fB94D4153;
- @%p542 bra $L__BB0_625;
-
- mov.f32 %f3091, 0fBAB607ED;
- mov.f32 %f3092, 0f37CBAC00;
- fma.rn.f32 %f5409, %f3092, %f680, %f3091;
-
-$L__BB0_625:
- selp.f32 %f3093, 0f3C0885E4, 0f3D2AAABB, %p542;
- fma.rn.f32 %f3094, %f5409, %f680, %f3093;
- selp.f32 %f3095, 0fBE2AAAA8, 0fBEFFFFFF, %p542;
- fma.rn.f32 %f3096, %f3094, %f680, %f3095;
- mov.f32 %f3097, 0f00000000;
- fma.rn.f32 %f3098, %f680, %f679, %f3097;
- fma.rn.f32 %f5213, %f3096, %f3098, %f679;
- and.b32 %r4347, %r8381, 2;
- setp.eq.s32 %p544, %r4347, 0;
- @%p544 bra $L__BB0_627;
-
- mov.f32 %f3100, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3100, %f3097;
-
-$L__BB0_627:
- setp.lt.s32 %p11, %r14, %r790;
- @%p534 bra $L__BB0_640;
-
- mul.f32 %f3101, %f5337, 0f3F22F983;
- cvt.rni.s32.f32 %r8385, %f3101;
- cvt.rn.f32.s32 %f3102, %r8385;
- mov.f32 %f3103, 0fBFC90FDA;
- fma.rn.f32 %f3104, %f3102, %f3103, %f5337;
- mov.f32 %f3105, 0fB3A22168;
- fma.rn.f32 %f3106, %f3102, %f3105, %f3104;
- mov.f32 %f3107, 0fA7C234C5;
- fma.rn.f32 %f5412, %f3102, %f3107, %f3106;
- abs.f32 %f688, %f5337;
- setp.ltu.f32 %p546, %f688, 0f47CE4780;
- @%p546 bra $L__BB0_636;
-
- setp.eq.f32 %p547, %f688, 0f7F800000;
- @%p547 bra $L__BB0_635;
- bra.uni $L__BB0_630;
-
-$L__BB0_635:
- mov.f32 %f3110, 0f00000000;
- mul.rn.f32 %f5412, %f5337, %f3110;
- mov.u32 %r8385, 0;
- bra.uni $L__BB0_636;
-
-$L__BB0_630:
- mov.b32 %r810, %f5337;
- shr.u32 %r4349, %r810, 23;
- and.b32 %r4350, %r4349, 255;
- add.s32 %r811, %r4350, -128;
- shl.b32 %r4351, %r810, 8;
- or.b32 %r812, %r4351, -2147483648;
- shr.u32 %r813, %r811, 5;
- mov.u64 %rd2576, 0;
- mov.u32 %r8382, 0;
- mov.u64 %rd1250, __cudart_i2opi_f;
- mov.u64 %rd2577, %rd2576;
-
-$L__BB0_631:
- .pragma "nounroll";
- shl.b64 %rd1249, %rd2576, 2;
- add.s64 %rd1251, %rd1250, %rd1249;
- ld.global.nc.u32 %r4352, [%rd1251];
- mad.wide.u32 %rd1252, %r4352, %r812, %rd2577;
- shr.u64 %rd2577, %rd1252, 32;
- add.s64 %rd1253, %rd1, %rd1249;
- st.local.u32 [%rd1253], %rd1252;
- add.s32 %r8382, %r8382, 1;
- cvt.s64.s32 %rd2576, %r8382;
- setp.ne.s32 %p548, %r8382, 6;
- @%p548 bra $L__BB0_631;
-
- st.local.u32 [%rd5], %rd2577;
- mov.u32 %r4353, 4;
- sub.s32 %r816, %r4353, %r813;
- mov.u32 %r4354, 6;
- sub.s32 %r4355, %r4354, %r813;
- mul.wide.s32 %rd1254, %r4355, 4;
- add.s64 %rd1255, %rd1, %rd1254;
- ld.local.u32 %r8383, [%rd1255];
- ld.local.u32 %r8384, [%rd1255+-4];
- and.b32 %r819, %r811, 31;
- setp.eq.s32 %p549, %r819, 0;
- @%p549 bra $L__BB0_634;
-
- mov.u32 %r4356, 32;
- sub.s32 %r4357, %r4356, %r819;
- shr.u32 %r4358, %r8384, %r4357;
- shl.b32 %r4359, %r8383, %r819;
- add.s32 %r8383, %r4358, %r4359;
- mul.wide.s32 %rd1256, %r816, 4;
- add.s64 %rd1257, %rd1, %rd1256;
- ld.local.u32 %r4360, [%rd1257];
- shr.u32 %r4361, %r4360, %r4357;
- shl.b32 %r4362, %r8384, %r819;
- add.s32 %r8384, %r4361, %r4362;
-
-$L__BB0_634:
- and.b32 %r4363, %r810, -2147483648;
- shr.u32 %r4364, %r8384, 30;
- shl.b32 %r4365, %r8383, 2;
- or.b32 %r4366, %r4364, %r4365;
- shr.u32 %r4367, %r4366, 31;
- shr.u32 %r4368, %r8383, 30;
- add.s32 %r4369, %r4367, %r4368;
- neg.s32 %r4370, %r4369;
- setp.eq.s32 %p550, %r4363, 0;
- selp.b32 %r8385, %r4369, %r4370, %p550;
- setp.ne.s32 %p551, %r4367, 0;
- xor.b32 %r4371, %r4363, -2147483648;
- selp.b32 %r4372, %r4371, %r4363, %p551;
- selp.b32 %r4373, -1, 0, %p551;
- xor.b32 %r4374, %r4366, %r4373;
- shl.b32 %r4375, %r8384, 2;
- xor.b32 %r4376, %r4375, %r4373;
- cvt.u64.u32 %rd1258, %r4374;
- cvt.u64.u32 %rd1259, %r4376;
- bfi.b64 %rd1260, %rd1258, %rd1259, 32, 32;
- cvt.rn.f64.s64 %fd79, %rd1260;
- mul.f64 %fd80, %fd79, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3108, %fd80;
- setp.eq.s32 %p552, %r4372, 0;
- neg.f32 %f3109, %f3108;
- selp.f32 %f5412, %f3108, %f3109, %p552;
-
-$L__BB0_636:
- add.s32 %r826, %r8385, 1;
- and.b32 %r827, %r826, 1;
- setp.eq.s32 %p553, %r827, 0;
- selp.f32 %f692, %f5412, 0f3F800000, %p553;
- mul.rn.f32 %f693, %f5412, %f5412;
- mov.f32 %f5413, 0fB94D4153;
- @%p553 bra $L__BB0_638;
-
- mov.f32 %f3112, 0fBAB607ED;
- mov.f32 %f3113, 0f37CBAC00;
- fma.rn.f32 %f5413, %f3113, %f693, %f3112;
-
-$L__BB0_638:
- selp.f32 %f3114, 0f3C0885E4, 0f3D2AAABB, %p553;
- fma.rn.f32 %f3115, %f5413, %f693, %f3114;
- selp.f32 %f3116, 0fBE2AAAA8, 0fBEFFFFFF, %p553;
- fma.rn.f32 %f3117, %f3115, %f693, %f3116;
- mov.f32 %f3118, 0f00000000;
- fma.rn.f32 %f3119, %f693, %f692, %f3118;
- fma.rn.f32 %f5215, %f3117, %f3119, %f692;
- and.b32 %r4378, %r826, 2;
- setp.eq.s32 %p555, %r4378, 0;
- @%p555 bra $L__BB0_640;
-
- mov.f32 %f3121, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3121, %f3118;
-
-$L__BB0_640:
- selp.f32 %f700, %f5215, %f5216, %p11;
- selp.f32 %f701, %f5213, %f5214, %p11;
- @%p534 bra $L__BB0_642;
-
- add.f32 %f5520, %f701, %f700;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_642:
- @%p434 bra $L__BB0_671;
-
- shl.b32 %r4380, %r12, 5;
- neg.s32 %r828, %r4380;
- setp.ge.s32 %p559, %r14, %r828;
- @%p559 bra $L__BB0_656;
-
- mul.f32 %f3124, %f5344, 0f3F22F983;
- cvt.rni.s32.f32 %r8389, %f3124;
- cvt.rn.f32.s32 %f3125, %r8389;
- mov.f32 %f3126, 0fBFC90FDA;
- fma.rn.f32 %f3127, %f3125, %f3126, %f5344;
- mov.f32 %f3128, 0fB3A22168;
- fma.rn.f32 %f3129, %f3125, %f3128, %f3127;
- mov.f32 %f3130, 0fA7C234C5;
- fma.rn.f32 %f5421, %f3125, %f3130, %f3129;
- abs.f32 %f709, %f5344;
- setp.ltu.f32 %p560, %f709, 0f47CE4780;
- @%p560 bra $L__BB0_652;
-
- setp.eq.f32 %p561, %f709, 0f7F800000;
- @%p561 bra $L__BB0_651;
- bra.uni $L__BB0_646;
-
-$L__BB0_651:
- mov.f32 %f3133, 0f00000000;
- mul.rn.f32 %f5421, %f5344, %f3133;
- mov.u32 %r8389, 0;
- bra.uni $L__BB0_652;
-
-$L__BB0_646:
- mov.b32 %r830, %f5344;
- shr.u32 %r4382, %r830, 23;
- and.b32 %r4383, %r4382, 255;
- add.s32 %r831, %r4383, -128;
- shl.b32 %r4384, %r830, 8;
- or.b32 %r832, %r4384, -2147483648;
- shr.u32 %r833, %r831, 5;
- mov.u64 %rd2578, 0;
- mov.u32 %r8386, 0;
- mov.u64 %rd1264, __cudart_i2opi_f;
- mov.u64 %rd2579, %rd2578;
-
-$L__BB0_647:
- .pragma "nounroll";
- shl.b64 %rd1263, %rd2578, 2;
- add.s64 %rd1265, %rd1264, %rd1263;
- ld.global.nc.u32 %r4385, [%rd1265];
- mad.wide.u32 %rd1266, %r4385, %r832, %rd2579;
- shr.u64 %rd2579, %rd1266, 32;
- add.s64 %rd1267, %rd1, %rd1263;
- st.local.u32 [%rd1267], %rd1266;
- add.s32 %r8386, %r8386, 1;
- cvt.s64.s32 %rd2578, %r8386;
- setp.ne.s32 %p562, %r8386, 6;
- @%p562 bra $L__BB0_647;
-
- st.local.u32 [%rd5], %rd2579;
- mov.u32 %r4386, 4;
- sub.s32 %r836, %r4386, %r833;
- mov.u32 %r4387, 6;
- sub.s32 %r4388, %r4387, %r833;
- mul.wide.s32 %rd1268, %r4388, 4;
- add.s64 %rd1269, %rd1, %rd1268;
- ld.local.u32 %r8387, [%rd1269];
- ld.local.u32 %r8388, [%rd1269+-4];
- and.b32 %r839, %r831, 31;
- setp.eq.s32 %p563, %r839, 0;
- @%p563 bra $L__BB0_650;
-
- mov.u32 %r4389, 32;
- sub.s32 %r4390, %r4389, %r839;
- shr.u32 %r4391, %r8388, %r4390;
- shl.b32 %r4392, %r8387, %r839;
- add.s32 %r8387, %r4391, %r4392;
- mul.wide.s32 %rd1270, %r836, 4;
- add.s64 %rd1271, %rd1, %rd1270;
- ld.local.u32 %r4393, [%rd1271];
- shr.u32 %r4394, %r4393, %r4390;
- shl.b32 %r4395, %r8388, %r839;
- add.s32 %r8388, %r4394, %r4395;
-
-$L__BB0_650:
- and.b32 %r4396, %r830, -2147483648;
- shr.u32 %r4397, %r8388, 30;
- shl.b32 %r4398, %r8387, 2;
- or.b32 %r4399, %r4397, %r4398;
- shr.u32 %r4400, %r4399, 31;
- shr.u32 %r4401, %r8387, 30;
- add.s32 %r4402, %r4400, %r4401;
- neg.s32 %r4403, %r4402;
- setp.eq.s32 %p564, %r4396, 0;
- selp.b32 %r8389, %r4402, %r4403, %p564;
- setp.ne.s32 %p565, %r4400, 0;
- xor.b32 %r4404, %r4396, -2147483648;
- selp.b32 %r4405, %r4404, %r4396, %p565;
- selp.b32 %r4406, -1, 0, %p565;
- xor.b32 %r4407, %r4399, %r4406;
- shl.b32 %r4408, %r8388, 2;
- xor.b32 %r4409, %r4408, %r4406;
- cvt.u64.u32 %rd1272, %r4407;
- cvt.u64.u32 %rd1273, %r4409;
- bfi.b64 %rd1274, %rd1272, %rd1273, 32, 32;
- cvt.rn.f64.s64 %fd81, %rd1274;
- mul.f64 %fd82, %fd81, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3131, %fd82;
- setp.eq.s32 %p566, %r4405, 0;
- neg.f32 %f3132, %f3131;
- selp.f32 %f5421, %f3131, %f3132, %p566;
-
-$L__BB0_652:
- and.b32 %r846, %r8389, 1;
- setp.eq.s32 %p567, %r846, 0;
- selp.f32 %f713, %f5421, 0f3F800000, %p567;
- mul.rn.f32 %f714, %f5421, %f5421;
- mov.f32 %f5422, 0fB94D4153;
- @%p567 bra $L__BB0_654;
-
- mov.f32 %f3135, 0fBAB607ED;
- mov.f32 %f3136, 0f37CBAC00;
- fma.rn.f32 %f5422, %f3136, %f714, %f3135;
-
-$L__BB0_654:
- selp.f32 %f3137, 0f3C0885E4, 0f3D2AAABB, %p567;
- fma.rn.f32 %f3138, %f5422, %f714, %f3137;
- selp.f32 %f3139, 0fBE2AAAA8, 0fBEFFFFFF, %p567;
- fma.rn.f32 %f3140, %f3138, %f714, %f3139;
- mov.f32 %f3141, 0f00000000;
- fma.rn.f32 %f3142, %f714, %f713, %f3141;
- fma.rn.f32 %f5213, %f3140, %f3142, %f713;
- and.b32 %r4411, %r8389, 2;
- setp.eq.s32 %p569, %r4411, 0;
- @%p569 bra $L__BB0_656;
-
- mov.f32 %f3144, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3144, %f3141;
-
-$L__BB0_656:
- setp.lt.s32 %p12, %r14, %r828;
- @%p559 bra $L__BB0_669;
-
- mul.f32 %f3145, %f5336, 0f3F22F983;
- cvt.rni.s32.f32 %r8393, %f3145;
- cvt.rn.f32.s32 %f3146, %r8393;
- mov.f32 %f3147, 0fBFC90FDA;
- fma.rn.f32 %f3148, %f3146, %f3147, %f5336;
- mov.f32 %f3149, 0fB3A22168;
- fma.rn.f32 %f3150, %f3146, %f3149, %f3148;
- mov.f32 %f3151, 0fA7C234C5;
- fma.rn.f32 %f5425, %f3146, %f3151, %f3150;
- abs.f32 %f722, %f5336;
- setp.ltu.f32 %p571, %f722, 0f47CE4780;
- @%p571 bra $L__BB0_665;
-
- setp.eq.f32 %p572, %f722, 0f7F800000;
- @%p572 bra $L__BB0_664;
- bra.uni $L__BB0_659;
-
-$L__BB0_664:
- mov.f32 %f3154, 0f00000000;
- mul.rn.f32 %f5425, %f5336, %f3154;
- mov.u32 %r8393, 0;
- bra.uni $L__BB0_665;
-
-$L__BB0_659:
- mov.b32 %r848, %f5336;
- shr.u32 %r4413, %r848, 23;
- and.b32 %r4414, %r4413, 255;
- add.s32 %r849, %r4414, -128;
- shl.b32 %r4415, %r848, 8;
- or.b32 %r850, %r4415, -2147483648;
- shr.u32 %r851, %r849, 5;
- mov.u64 %rd2580, 0;
- mov.u32 %r8390, 0;
- mov.u64 %rd1278, __cudart_i2opi_f;
- mov.u64 %rd2581, %rd2580;
-
-$L__BB0_660:
- .pragma "nounroll";
- shl.b64 %rd1277, %rd2580, 2;
- add.s64 %rd1279, %rd1278, %rd1277;
- ld.global.nc.u32 %r4416, [%rd1279];
- mad.wide.u32 %rd1280, %r4416, %r850, %rd2581;
- shr.u64 %rd2581, %rd1280, 32;
- add.s64 %rd1281, %rd1, %rd1277;
- st.local.u32 [%rd1281], %rd1280;
- add.s32 %r8390, %r8390, 1;
- cvt.s64.s32 %rd2580, %r8390;
- setp.ne.s32 %p573, %r8390, 6;
- @%p573 bra $L__BB0_660;
-
- st.local.u32 [%rd5], %rd2581;
- mov.u32 %r4417, 4;
- sub.s32 %r854, %r4417, %r851;
- mov.u32 %r4418, 6;
- sub.s32 %r4419, %r4418, %r851;
- mul.wide.s32 %rd1282, %r4419, 4;
- add.s64 %rd1283, %rd1, %rd1282;
- ld.local.u32 %r8391, [%rd1283];
- ld.local.u32 %r8392, [%rd1283+-4];
- and.b32 %r857, %r849, 31;
- setp.eq.s32 %p574, %r857, 0;
- @%p574 bra $L__BB0_663;
-
- mov.u32 %r4420, 32;
- sub.s32 %r4421, %r4420, %r857;
- shr.u32 %r4422, %r8392, %r4421;
- shl.b32 %r4423, %r8391, %r857;
- add.s32 %r8391, %r4422, %r4423;
- mul.wide.s32 %rd1284, %r854, 4;
- add.s64 %rd1285, %rd1, %rd1284;
- ld.local.u32 %r4424, [%rd1285];
- shr.u32 %r4425, %r4424, %r4421;
- shl.b32 %r4426, %r8392, %r857;
- add.s32 %r8392, %r4425, %r4426;
-
-$L__BB0_663:
- and.b32 %r4427, %r848, -2147483648;
- shr.u32 %r4428, %r8392, 30;
- shl.b32 %r4429, %r8391, 2;
- or.b32 %r4430, %r4428, %r4429;
- shr.u32 %r4431, %r4430, 31;
- shr.u32 %r4432, %r8391, 30;
- add.s32 %r4433, %r4431, %r4432;
- neg.s32 %r4434, %r4433;
- setp.eq.s32 %p575, %r4427, 0;
- selp.b32 %r8393, %r4433, %r4434, %p575;
- setp.ne.s32 %p576, %r4431, 0;
- xor.b32 %r4435, %r4427, -2147483648;
- selp.b32 %r4436, %r4435, %r4427, %p576;
- selp.b32 %r4437, -1, 0, %p576;
- xor.b32 %r4438, %r4430, %r4437;
- shl.b32 %r4439, %r8392, 2;
- xor.b32 %r4440, %r4439, %r4437;
- cvt.u64.u32 %rd1286, %r4438;
- cvt.u64.u32 %rd1287, %r4440;
- bfi.b64 %rd1288, %rd1286, %rd1287, 32, 32;
- cvt.rn.f64.s64 %fd83, %rd1288;
- mul.f64 %fd84, %fd83, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3152, %fd84;
- setp.eq.s32 %p577, %r4436, 0;
- neg.f32 %f3153, %f3152;
- selp.f32 %f5425, %f3152, %f3153, %p577;
-
-$L__BB0_665:
- add.s32 %r864, %r8393, 1;
- and.b32 %r865, %r864, 1;
- setp.eq.s32 %p578, %r865, 0;
- selp.f32 %f726, %f5425, 0f3F800000, %p578;
- mul.rn.f32 %f727, %f5425, %f5425;
- mov.f32 %f5426, 0fB94D4153;
- @%p578 bra $L__BB0_667;
-
- mov.f32 %f3156, 0fBAB607ED;
- mov.f32 %f3157, 0f37CBAC00;
- fma.rn.f32 %f5426, %f3157, %f727, %f3156;
-
-$L__BB0_667:
- selp.f32 %f3158, 0f3C0885E4, 0f3D2AAABB, %p578;
- fma.rn.f32 %f3159, %f5426, %f727, %f3158;
- selp.f32 %f3160, 0fBE2AAAA8, 0fBEFFFFFF, %p578;
- fma.rn.f32 %f3161, %f3159, %f727, %f3160;
- mov.f32 %f3162, 0f00000000;
- fma.rn.f32 %f3163, %f727, %f726, %f3162;
- fma.rn.f32 %f5215, %f3161, %f3163, %f726;
- and.b32 %r4442, %r864, 2;
- setp.eq.s32 %p580, %r4442, 0;
- @%p580 bra $L__BB0_669;
-
- mov.f32 %f3165, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3165, %f3162;
-
-$L__BB0_669:
- selp.f32 %f734, %f5215, %f5216, %p12;
- selp.f32 %f735, %f5213, %f5214, %p12;
- @%p559 bra $L__BB0_671;
-
- add.f32 %f5519, %f735, %f734;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_671:
- @%p434 bra $L__BB0_700;
-
- shl.b32 %r4444, %r12, 5;
- mov.u32 %r4445, -32;
- sub.s32 %r866, %r4445, %r4444;
- setp.ge.s32 %p584, %r14, %r866;
- @%p584 bra $L__BB0_685;
-
- mul.f32 %f3168, %f5343, 0f3F22F983;
- cvt.rni.s32.f32 %r8397, %f3168;
- cvt.rn.f32.s32 %f3169, %r8397;
- mov.f32 %f3170, 0fBFC90FDA;
- fma.rn.f32 %f3171, %f3169, %f3170, %f5343;
- mov.f32 %f3172, 0fB3A22168;
- fma.rn.f32 %f3173, %f3169, %f3172, %f3171;
- mov.f32 %f3174, 0fA7C234C5;
- fma.rn.f32 %f5434, %f3169, %f3174, %f3173;
- abs.f32 %f743, %f5343;
- setp.ltu.f32 %p585, %f743, 0f47CE4780;
- @%p585 bra $L__BB0_681;
-
- setp.eq.f32 %p586, %f743, 0f7F800000;
- @%p586 bra $L__BB0_680;
- bra.uni $L__BB0_675;
-
-$L__BB0_680:
- mov.f32 %f3177, 0f00000000;
- mul.rn.f32 %f5434, %f5343, %f3177;
- mov.u32 %r8397, 0;
- bra.uni $L__BB0_681;
-
-$L__BB0_675:
- mov.b32 %r868, %f5343;
- shr.u32 %r4447, %r868, 23;
- and.b32 %r4448, %r4447, 255;
- add.s32 %r869, %r4448, -128;
- shl.b32 %r4449, %r868, 8;
- or.b32 %r870, %r4449, -2147483648;
- shr.u32 %r871, %r869, 5;
- mov.u64 %rd2582, 0;
- mov.u32 %r8394, 0;
- mov.u64 %rd1292, __cudart_i2opi_f;
- mov.u64 %rd2583, %rd2582;
-
-$L__BB0_676:
- .pragma "nounroll";
- shl.b64 %rd1291, %rd2582, 2;
- add.s64 %rd1293, %rd1292, %rd1291;
- ld.global.nc.u32 %r4450, [%rd1293];
- mad.wide.u32 %rd1294, %r4450, %r870, %rd2583;
- shr.u64 %rd2583, %rd1294, 32;
- add.s64 %rd1295, %rd1, %rd1291;
- st.local.u32 [%rd1295], %rd1294;
- add.s32 %r8394, %r8394, 1;
- cvt.s64.s32 %rd2582, %r8394;
- setp.ne.s32 %p587, %r8394, 6;
- @%p587 bra $L__BB0_676;
-
- st.local.u32 [%rd5], %rd2583;
- mov.u32 %r4451, 4;
- sub.s32 %r874, %r4451, %r871;
- mov.u32 %r4452, 6;
- sub.s32 %r4453, %r4452, %r871;
- mul.wide.s32 %rd1296, %r4453, 4;
- add.s64 %rd1297, %rd1, %rd1296;
- ld.local.u32 %r8395, [%rd1297];
- ld.local.u32 %r8396, [%rd1297+-4];
- and.b32 %r877, %r869, 31;
- setp.eq.s32 %p588, %r877, 0;
- @%p588 bra $L__BB0_679;
-
- mov.u32 %r4454, 32;
- sub.s32 %r4455, %r4454, %r877;
- shr.u32 %r4456, %r8396, %r4455;
- shl.b32 %r4457, %r8395, %r877;
- add.s32 %r8395, %r4456, %r4457;
- mul.wide.s32 %rd1298, %r874, 4;
- add.s64 %rd1299, %rd1, %rd1298;
- ld.local.u32 %r4458, [%rd1299];
- shr.u32 %r4459, %r4458, %r4455;
- shl.b32 %r4460, %r8396, %r877;
- add.s32 %r8396, %r4459, %r4460;
-
-$L__BB0_679:
- and.b32 %r4461, %r868, -2147483648;
- shr.u32 %r4462, %r8396, 30;
- shl.b32 %r4463, %r8395, 2;
- or.b32 %r4464, %r4462, %r4463;
- shr.u32 %r4465, %r4464, 31;
- shr.u32 %r4466, %r8395, 30;
- add.s32 %r4467, %r4465, %r4466;
- neg.s32 %r4468, %r4467;
- setp.eq.s32 %p589, %r4461, 0;
- selp.b32 %r8397, %r4467, %r4468, %p589;
- setp.ne.s32 %p590, %r4465, 0;
- xor.b32 %r4469, %r4461, -2147483648;
- selp.b32 %r4470, %r4469, %r4461, %p590;
- selp.b32 %r4471, -1, 0, %p590;
- xor.b32 %r4472, %r4464, %r4471;
- shl.b32 %r4473, %r8396, 2;
- xor.b32 %r4474, %r4473, %r4471;
- cvt.u64.u32 %rd1300, %r4472;
- cvt.u64.u32 %rd1301, %r4474;
- bfi.b64 %rd1302, %rd1300, %rd1301, 32, 32;
- cvt.rn.f64.s64 %fd85, %rd1302;
- mul.f64 %fd86, %fd85, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3175, %fd86;
- setp.eq.s32 %p591, %r4470, 0;
- neg.f32 %f3176, %f3175;
- selp.f32 %f5434, %f3175, %f3176, %p591;
-
-$L__BB0_681:
- and.b32 %r884, %r8397, 1;
- setp.eq.s32 %p592, %r884, 0;
- selp.f32 %f747, %f5434, 0f3F800000, %p592;
- mul.rn.f32 %f748, %f5434, %f5434;
- mov.f32 %f5435, 0fB94D4153;
- @%p592 bra $L__BB0_683;
-
- mov.f32 %f3179, 0fBAB607ED;
- mov.f32 %f3180, 0f37CBAC00;
- fma.rn.f32 %f5435, %f3180, %f748, %f3179;
-
-$L__BB0_683:
- selp.f32 %f3181, 0f3C0885E4, 0f3D2AAABB, %p592;
- fma.rn.f32 %f3182, %f5435, %f748, %f3181;
- selp.f32 %f3183, 0fBE2AAAA8, 0fBEFFFFFF, %p592;
- fma.rn.f32 %f3184, %f3182, %f748, %f3183;
- mov.f32 %f3185, 0f00000000;
- fma.rn.f32 %f3186, %f748, %f747, %f3185;
- fma.rn.f32 %f5213, %f3184, %f3186, %f747;
- and.b32 %r4476, %r8397, 2;
- setp.eq.s32 %p594, %r4476, 0;
- @%p594 bra $L__BB0_685;
-
- mov.f32 %f3188, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3188, %f3185;
-
-$L__BB0_685:
- setp.lt.s32 %p13, %r14, %r866;
- @%p584 bra $L__BB0_698;
-
- mul.f32 %f3189, %f5335, 0f3F22F983;
- cvt.rni.s32.f32 %r8401, %f3189;
- cvt.rn.f32.s32 %f3190, %r8401;
- mov.f32 %f3191, 0fBFC90FDA;
- fma.rn.f32 %f3192, %f3190, %f3191, %f5335;
- mov.f32 %f3193, 0fB3A22168;
- fma.rn.f32 %f3194, %f3190, %f3193, %f3192;
- mov.f32 %f3195, 0fA7C234C5;
- fma.rn.f32 %f5438, %f3190, %f3195, %f3194;
- abs.f32 %f756, %f5335;
- setp.ltu.f32 %p596, %f756, 0f47CE4780;
- @%p596 bra $L__BB0_694;
-
- setp.eq.f32 %p597, %f756, 0f7F800000;
- @%p597 bra $L__BB0_693;
- bra.uni $L__BB0_688;
-
-$L__BB0_693:
- mov.f32 %f3198, 0f00000000;
- mul.rn.f32 %f5438, %f5335, %f3198;
- mov.u32 %r8401, 0;
- bra.uni $L__BB0_694;
-
-$L__BB0_688:
- mov.b32 %r886, %f5335;
- shr.u32 %r4478, %r886, 23;
- and.b32 %r4479, %r4478, 255;
- add.s32 %r887, %r4479, -128;
- shl.b32 %r4480, %r886, 8;
- or.b32 %r888, %r4480, -2147483648;
- shr.u32 %r889, %r887, 5;
- mov.u64 %rd2584, 0;
- mov.u32 %r8398, 0;
- mov.u64 %rd1306, __cudart_i2opi_f;
- mov.u64 %rd2585, %rd2584;
-
-$L__BB0_689:
- .pragma "nounroll";
- shl.b64 %rd1305, %rd2584, 2;
- add.s64 %rd1307, %rd1306, %rd1305;
- ld.global.nc.u32 %r4481, [%rd1307];
- mad.wide.u32 %rd1308, %r4481, %r888, %rd2585;
- shr.u64 %rd2585, %rd1308, 32;
- add.s64 %rd1309, %rd1, %rd1305;
- st.local.u32 [%rd1309], %rd1308;
- add.s32 %r8398, %r8398, 1;
- cvt.s64.s32 %rd2584, %r8398;
- setp.ne.s32 %p598, %r8398, 6;
- @%p598 bra $L__BB0_689;
-
- st.local.u32 [%rd5], %rd2585;
- mov.u32 %r4482, 4;
- sub.s32 %r892, %r4482, %r889;
- mov.u32 %r4483, 6;
- sub.s32 %r4484, %r4483, %r889;
- mul.wide.s32 %rd1310, %r4484, 4;
- add.s64 %rd1311, %rd1, %rd1310;
- ld.local.u32 %r8399, [%rd1311];
- ld.local.u32 %r8400, [%rd1311+-4];
- and.b32 %r895, %r887, 31;
- setp.eq.s32 %p599, %r895, 0;
- @%p599 bra $L__BB0_692;
-
- mov.u32 %r4485, 32;
- sub.s32 %r4486, %r4485, %r895;
- shr.u32 %r4487, %r8400, %r4486;
- shl.b32 %r4488, %r8399, %r895;
- add.s32 %r8399, %r4487, %r4488;
- mul.wide.s32 %rd1312, %r892, 4;
- add.s64 %rd1313, %rd1, %rd1312;
- ld.local.u32 %r4489, [%rd1313];
- shr.u32 %r4490, %r4489, %r4486;
- shl.b32 %r4491, %r8400, %r895;
- add.s32 %r8400, %r4490, %r4491;
-
-$L__BB0_692:
- and.b32 %r4492, %r886, -2147483648;
- shr.u32 %r4493, %r8400, 30;
- shl.b32 %r4494, %r8399, 2;
- or.b32 %r4495, %r4493, %r4494;
- shr.u32 %r4496, %r4495, 31;
- shr.u32 %r4497, %r8399, 30;
- add.s32 %r4498, %r4496, %r4497;
- neg.s32 %r4499, %r4498;
- setp.eq.s32 %p600, %r4492, 0;
- selp.b32 %r8401, %r4498, %r4499, %p600;
- setp.ne.s32 %p601, %r4496, 0;
- xor.b32 %r4500, %r4492, -2147483648;
- selp.b32 %r4501, %r4500, %r4492, %p601;
- selp.b32 %r4502, -1, 0, %p601;
- xor.b32 %r4503, %r4495, %r4502;
- shl.b32 %r4504, %r8400, 2;
- xor.b32 %r4505, %r4504, %r4502;
- cvt.u64.u32 %rd1314, %r4503;
- cvt.u64.u32 %rd1315, %r4505;
- bfi.b64 %rd1316, %rd1314, %rd1315, 32, 32;
- cvt.rn.f64.s64 %fd87, %rd1316;
- mul.f64 %fd88, %fd87, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3196, %fd88;
- setp.eq.s32 %p602, %r4501, 0;
- neg.f32 %f3197, %f3196;
- selp.f32 %f5438, %f3196, %f3197, %p602;
-
-$L__BB0_694:
- add.s32 %r902, %r8401, 1;
- and.b32 %r903, %r902, 1;
- setp.eq.s32 %p603, %r903, 0;
- selp.f32 %f760, %f5438, 0f3F800000, %p603;
- mul.rn.f32 %f761, %f5438, %f5438;
- mov.f32 %f5439, 0fB94D4153;
- @%p603 bra $L__BB0_696;
-
- mov.f32 %f3200, 0fBAB607ED;
- mov.f32 %f3201, 0f37CBAC00;
- fma.rn.f32 %f5439, %f3201, %f761, %f3200;
-
-$L__BB0_696:
- selp.f32 %f3202, 0f3C0885E4, 0f3D2AAABB, %p603;
- fma.rn.f32 %f3203, %f5439, %f761, %f3202;
- selp.f32 %f3204, 0fBE2AAAA8, 0fBEFFFFFF, %p603;
- fma.rn.f32 %f3205, %f3203, %f761, %f3204;
- mov.f32 %f3206, 0f00000000;
- fma.rn.f32 %f3207, %f761, %f760, %f3206;
- fma.rn.f32 %f5215, %f3205, %f3207, %f760;
- and.b32 %r4507, %r902, 2;
- setp.eq.s32 %p605, %r4507, 0;
- @%p605 bra $L__BB0_698;
-
- mov.f32 %f3209, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3209, %f3206;
-
-$L__BB0_698:
- selp.f32 %f768, %f5215, %f5216, %p13;
- selp.f32 %f769, %f5213, %f5214, %p13;
- @%p584 bra $L__BB0_700;
-
- add.f32 %f5518, %f769, %f768;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_700:
- @%p438 bra $L__BB0_729;
-
- shl.b32 %r4509, %r12, 5;
- neg.s32 %r904, %r4509;
- setp.ge.s32 %p609, %r14, %r904;
- @%p609 bra $L__BB0_714;
-
- mul.f32 %f3212, %f5342, 0f3F22F983;
- cvt.rni.s32.f32 %r8405, %f3212;
- cvt.rn.f32.s32 %f3213, %r8405;
- mov.f32 %f3214, 0fBFC90FDA;
- fma.rn.f32 %f3215, %f3213, %f3214, %f5342;
- mov.f32 %f3216, 0fB3A22168;
- fma.rn.f32 %f3217, %f3213, %f3216, %f3215;
- mov.f32 %f3218, 0fA7C234C5;
- fma.rn.f32 %f5447, %f3213, %f3218, %f3217;
- abs.f32 %f777, %f5342;
- setp.ltu.f32 %p610, %f777, 0f47CE4780;
- @%p610 bra $L__BB0_710;
-
- setp.eq.f32 %p611, %f777, 0f7F800000;
- @%p611 bra $L__BB0_709;
- bra.uni $L__BB0_704;
-
-$L__BB0_709:
- mov.f32 %f3221, 0f00000000;
- mul.rn.f32 %f5447, %f5342, %f3221;
- mov.u32 %r8405, 0;
- bra.uni $L__BB0_710;
-
-$L__BB0_704:
- mov.b32 %r906, %f5342;
- shr.u32 %r4511, %r906, 23;
- and.b32 %r4512, %r4511, 255;
- add.s32 %r907, %r4512, -128;
- shl.b32 %r4513, %r906, 8;
- or.b32 %r908, %r4513, -2147483648;
- shr.u32 %r909, %r907, 5;
- mov.u64 %rd2586, 0;
- mov.u32 %r8402, 0;
- mov.u64 %rd1320, __cudart_i2opi_f;
- mov.u64 %rd2587, %rd2586;
-
-$L__BB0_705:
- .pragma "nounroll";
- shl.b64 %rd1319, %rd2586, 2;
- add.s64 %rd1321, %rd1320, %rd1319;
- ld.global.nc.u32 %r4514, [%rd1321];
- mad.wide.u32 %rd1322, %r4514, %r908, %rd2587;
- shr.u64 %rd2587, %rd1322, 32;
- add.s64 %rd1323, %rd1, %rd1319;
- st.local.u32 [%rd1323], %rd1322;
- add.s32 %r8402, %r8402, 1;
- cvt.s64.s32 %rd2586, %r8402;
- setp.ne.s32 %p612, %r8402, 6;
- @%p612 bra $L__BB0_705;
-
- st.local.u32 [%rd5], %rd2587;
- mov.u32 %r4515, 4;
- sub.s32 %r912, %r4515, %r909;
- mov.u32 %r4516, 6;
- sub.s32 %r4517, %r4516, %r909;
- mul.wide.s32 %rd1324, %r4517, 4;
- add.s64 %rd1325, %rd1, %rd1324;
- ld.local.u32 %r8403, [%rd1325];
- ld.local.u32 %r8404, [%rd1325+-4];
- and.b32 %r915, %r907, 31;
- setp.eq.s32 %p613, %r915, 0;
- @%p613 bra $L__BB0_708;
-
- mov.u32 %r4518, 32;
- sub.s32 %r4519, %r4518, %r915;
- shr.u32 %r4520, %r8404, %r4519;
- shl.b32 %r4521, %r8403, %r915;
- add.s32 %r8403, %r4520, %r4521;
- mul.wide.s32 %rd1326, %r912, 4;
- add.s64 %rd1327, %rd1, %rd1326;
- ld.local.u32 %r4522, [%rd1327];
- shr.u32 %r4523, %r4522, %r4519;
- shl.b32 %r4524, %r8404, %r915;
- add.s32 %r8404, %r4523, %r4524;
-
-$L__BB0_708:
- and.b32 %r4525, %r906, -2147483648;
- shr.u32 %r4526, %r8404, 30;
- shl.b32 %r4527, %r8403, 2;
- or.b32 %r4528, %r4526, %r4527;
- shr.u32 %r4529, %r4528, 31;
- shr.u32 %r4530, %r8403, 30;
- add.s32 %r4531, %r4529, %r4530;
- neg.s32 %r4532, %r4531;
- setp.eq.s32 %p614, %r4525, 0;
- selp.b32 %r8405, %r4531, %r4532, %p614;
- setp.ne.s32 %p615, %r4529, 0;
- xor.b32 %r4533, %r4525, -2147483648;
- selp.b32 %r4534, %r4533, %r4525, %p615;
- selp.b32 %r4535, -1, 0, %p615;
- xor.b32 %r4536, %r4528, %r4535;
- shl.b32 %r4537, %r8404, 2;
- xor.b32 %r4538, %r4537, %r4535;
- cvt.u64.u32 %rd1328, %r4536;
- cvt.u64.u32 %rd1329, %r4538;
- bfi.b64 %rd1330, %rd1328, %rd1329, 32, 32;
- cvt.rn.f64.s64 %fd89, %rd1330;
- mul.f64 %fd90, %fd89, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3219, %fd90;
- setp.eq.s32 %p616, %r4534, 0;
- neg.f32 %f3220, %f3219;
- selp.f32 %f5447, %f3219, %f3220, %p616;
-
-$L__BB0_710:
- and.b32 %r922, %r8405, 1;
- setp.eq.s32 %p617, %r922, 0;
- selp.f32 %f781, %f5447, 0f3F800000, %p617;
- mul.rn.f32 %f782, %f5447, %f5447;
- mov.f32 %f5448, 0fB94D4153;
- @%p617 bra $L__BB0_712;
-
- mov.f32 %f3223, 0fBAB607ED;
- mov.f32 %f3224, 0f37CBAC00;
- fma.rn.f32 %f5448, %f3224, %f782, %f3223;
-
-$L__BB0_712:
- selp.f32 %f3225, 0f3C0885E4, 0f3D2AAABB, %p617;
- fma.rn.f32 %f3226, %f5448, %f782, %f3225;
- selp.f32 %f3227, 0fBE2AAAA8, 0fBEFFFFFF, %p617;
- fma.rn.f32 %f3228, %f3226, %f782, %f3227;
- mov.f32 %f3229, 0f00000000;
- fma.rn.f32 %f3230, %f782, %f781, %f3229;
- fma.rn.f32 %f5213, %f3228, %f3230, %f781;
- and.b32 %r4540, %r8405, 2;
- setp.eq.s32 %p619, %r4540, 0;
- @%p619 bra $L__BB0_714;
-
- mov.f32 %f3232, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3232, %f3229;
-
-$L__BB0_714:
- setp.lt.s32 %p14, %r14, %r904;
- @%p609 bra $L__BB0_727;
-
- mul.f32 %f3233, %f5334, 0f3F22F983;
- cvt.rni.s32.f32 %r8409, %f3233;
- cvt.rn.f32.s32 %f3234, %r8409;
- mov.f32 %f3235, 0fBFC90FDA;
- fma.rn.f32 %f3236, %f3234, %f3235, %f5334;
- mov.f32 %f3237, 0fB3A22168;
- fma.rn.f32 %f3238, %f3234, %f3237, %f3236;
- mov.f32 %f3239, 0fA7C234C5;
- fma.rn.f32 %f5451, %f3234, %f3239, %f3238;
- abs.f32 %f790, %f5334;
- setp.ltu.f32 %p621, %f790, 0f47CE4780;
- @%p621 bra $L__BB0_723;
-
- setp.eq.f32 %p622, %f790, 0f7F800000;
- @%p622 bra $L__BB0_722;
- bra.uni $L__BB0_717;
-
-$L__BB0_722:
- mov.f32 %f3242, 0f00000000;
- mul.rn.f32 %f5451, %f5334, %f3242;
- mov.u32 %r8409, 0;
- bra.uni $L__BB0_723;
-
-$L__BB0_717:
- mov.b32 %r924, %f5334;
- shr.u32 %r4542, %r924, 23;
- and.b32 %r4543, %r4542, 255;
- add.s32 %r925, %r4543, -128;
- shl.b32 %r4544, %r924, 8;
- or.b32 %r926, %r4544, -2147483648;
- shr.u32 %r927, %r925, 5;
- mov.u64 %rd2588, 0;
- mov.u32 %r8406, 0;
- mov.u64 %rd1334, __cudart_i2opi_f;
- mov.u64 %rd2589, %rd2588;
-
-$L__BB0_718:
- .pragma "nounroll";
- shl.b64 %rd1333, %rd2588, 2;
- add.s64 %rd1335, %rd1334, %rd1333;
- ld.global.nc.u32 %r4545, [%rd1335];
- mad.wide.u32 %rd1336, %r4545, %r926, %rd2589;
- shr.u64 %rd2589, %rd1336, 32;
- add.s64 %rd1337, %rd1, %rd1333;
- st.local.u32 [%rd1337], %rd1336;
- add.s32 %r8406, %r8406, 1;
- cvt.s64.s32 %rd2588, %r8406;
- setp.ne.s32 %p623, %r8406, 6;
- @%p623 bra $L__BB0_718;
-
- st.local.u32 [%rd5], %rd2589;
- mov.u32 %r4546, 4;
- sub.s32 %r930, %r4546, %r927;
- mov.u32 %r4547, 6;
- sub.s32 %r4548, %r4547, %r927;
- mul.wide.s32 %rd1338, %r4548, 4;
- add.s64 %rd1339, %rd1, %rd1338;
- ld.local.u32 %r8407, [%rd1339];
- ld.local.u32 %r8408, [%rd1339+-4];
- and.b32 %r933, %r925, 31;
- setp.eq.s32 %p624, %r933, 0;
- @%p624 bra $L__BB0_721;
-
- mov.u32 %r4549, 32;
- sub.s32 %r4550, %r4549, %r933;
- shr.u32 %r4551, %r8408, %r4550;
- shl.b32 %r4552, %r8407, %r933;
- add.s32 %r8407, %r4551, %r4552;
- mul.wide.s32 %rd1340, %r930, 4;
- add.s64 %rd1341, %rd1, %rd1340;
- ld.local.u32 %r4553, [%rd1341];
- shr.u32 %r4554, %r4553, %r4550;
- shl.b32 %r4555, %r8408, %r933;
- add.s32 %r8408, %r4554, %r4555;
-
-$L__BB0_721:
- and.b32 %r4556, %r924, -2147483648;
- shr.u32 %r4557, %r8408, 30;
- shl.b32 %r4558, %r8407, 2;
- or.b32 %r4559, %r4557, %r4558;
- shr.u32 %r4560, %r4559, 31;
- shr.u32 %r4561, %r8407, 30;
- add.s32 %r4562, %r4560, %r4561;
- neg.s32 %r4563, %r4562;
- setp.eq.s32 %p625, %r4556, 0;
- selp.b32 %r8409, %r4562, %r4563, %p625;
- setp.ne.s32 %p626, %r4560, 0;
- xor.b32 %r4564, %r4556, -2147483648;
- selp.b32 %r4565, %r4564, %r4556, %p626;
- selp.b32 %r4566, -1, 0, %p626;
- xor.b32 %r4567, %r4559, %r4566;
- shl.b32 %r4568, %r8408, 2;
- xor.b32 %r4569, %r4568, %r4566;
- cvt.u64.u32 %rd1342, %r4567;
- cvt.u64.u32 %rd1343, %r4569;
- bfi.b64 %rd1344, %rd1342, %rd1343, 32, 32;
- cvt.rn.f64.s64 %fd91, %rd1344;
- mul.f64 %fd92, %fd91, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3240, %fd92;
- setp.eq.s32 %p627, %r4565, 0;
- neg.f32 %f3241, %f3240;
- selp.f32 %f5451, %f3240, %f3241, %p627;
-
-$L__BB0_723:
- add.s32 %r940, %r8409, 1;
- and.b32 %r941, %r940, 1;
- setp.eq.s32 %p628, %r941, 0;
- selp.f32 %f794, %f5451, 0f3F800000, %p628;
- mul.rn.f32 %f795, %f5451, %f5451;
- mov.f32 %f5452, 0fB94D4153;
- @%p628 bra $L__BB0_725;
-
- mov.f32 %f3244, 0fBAB607ED;
- mov.f32 %f3245, 0f37CBAC00;
- fma.rn.f32 %f5452, %f3245, %f795, %f3244;
-
-$L__BB0_725:
- selp.f32 %f3246, 0f3C0885E4, 0f3D2AAABB, %p628;
- fma.rn.f32 %f3247, %f5452, %f795, %f3246;
- selp.f32 %f3248, 0fBE2AAAA8, 0fBEFFFFFF, %p628;
- fma.rn.f32 %f3249, %f3247, %f795, %f3248;
- mov.f32 %f3250, 0f00000000;
- fma.rn.f32 %f3251, %f795, %f794, %f3250;
- fma.rn.f32 %f5215, %f3249, %f3251, %f794;
- and.b32 %r4571, %r940, 2;
- setp.eq.s32 %p630, %r4571, 0;
- @%p630 bra $L__BB0_727;
-
- mov.f32 %f3253, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3253, %f3250;
-
-$L__BB0_727:
- selp.f32 %f802, %f5215, %f5216, %p14;
- selp.f32 %f803, %f5213, %f5214, %p14;
- @%p609 bra $L__BB0_729;
-
- add.f32 %f5517, %f803, %f802;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_729:
- @%p438 bra $L__BB0_951;
-
- shl.b32 %r4573, %r12, 5;
- mov.u32 %r4574, -32;
- sub.s32 %r942, %r4574, %r4573;
- setp.ge.s32 %p634, %r14, %r942;
- @%p634 bra $L__BB0_743;
-
- mul.f32 %f3256, %f5341, 0f3F22F983;
- cvt.rni.s32.f32 %r8413, %f3256;
- cvt.rn.f32.s32 %f3257, %r8413;
- mov.f32 %f3258, 0fBFC90FDA;
- fma.rn.f32 %f3259, %f3257, %f3258, %f5341;
- mov.f32 %f3260, 0fB3A22168;
- fma.rn.f32 %f3261, %f3257, %f3260, %f3259;
- mov.f32 %f3262, 0fA7C234C5;
- fma.rn.f32 %f5460, %f3257, %f3262, %f3261;
- abs.f32 %f811, %f5341;
- setp.ltu.f32 %p635, %f811, 0f47CE4780;
- @%p635 bra $L__BB0_739;
-
- setp.eq.f32 %p636, %f811, 0f7F800000;
- @%p636 bra $L__BB0_738;
- bra.uni $L__BB0_733;
-
-$L__BB0_738:
- mov.f32 %f3265, 0f00000000;
- mul.rn.f32 %f5460, %f5341, %f3265;
- mov.u32 %r8413, 0;
- bra.uni $L__BB0_739;
-
-$L__BB0_733:
- mov.b32 %r944, %f5341;
- shr.u32 %r4576, %r944, 23;
- and.b32 %r4577, %r4576, 255;
- add.s32 %r945, %r4577, -128;
- shl.b32 %r4578, %r944, 8;
- or.b32 %r946, %r4578, -2147483648;
- shr.u32 %r947, %r945, 5;
- mov.u64 %rd2590, 0;
- mov.u32 %r8410, 0;
- mov.u64 %rd1348, __cudart_i2opi_f;
- mov.u64 %rd2591, %rd2590;
-
-$L__BB0_734:
- .pragma "nounroll";
- shl.b64 %rd1347, %rd2590, 2;
- add.s64 %rd1349, %rd1348, %rd1347;
- ld.global.nc.u32 %r4579, [%rd1349];
- mad.wide.u32 %rd1350, %r4579, %r946, %rd2591;
- shr.u64 %rd2591, %rd1350, 32;
- add.s64 %rd1351, %rd1, %rd1347;
- st.local.u32 [%rd1351], %rd1350;
- add.s32 %r8410, %r8410, 1;
- cvt.s64.s32 %rd2590, %r8410;
- setp.ne.s32 %p637, %r8410, 6;
- @%p637 bra $L__BB0_734;
-
- st.local.u32 [%rd5], %rd2591;
- mov.u32 %r4580, 4;
- sub.s32 %r950, %r4580, %r947;
- mov.u32 %r4581, 6;
- sub.s32 %r4582, %r4581, %r947;
- mul.wide.s32 %rd1352, %r4582, 4;
- add.s64 %rd1353, %rd1, %rd1352;
- ld.local.u32 %r8411, [%rd1353];
- ld.local.u32 %r8412, [%rd1353+-4];
- and.b32 %r953, %r945, 31;
- setp.eq.s32 %p638, %r953, 0;
- @%p638 bra $L__BB0_737;
-
- mov.u32 %r4583, 32;
- sub.s32 %r4584, %r4583, %r953;
- shr.u32 %r4585, %r8412, %r4584;
- shl.b32 %r4586, %r8411, %r953;
- add.s32 %r8411, %r4585, %r4586;
- mul.wide.s32 %rd1354, %r950, 4;
- add.s64 %rd1355, %rd1, %rd1354;
- ld.local.u32 %r4587, [%rd1355];
- shr.u32 %r4588, %r4587, %r4584;
- shl.b32 %r4589, %r8412, %r953;
- add.s32 %r8412, %r4588, %r4589;
-
-$L__BB0_737:
- and.b32 %r4590, %r944, -2147483648;
- shr.u32 %r4591, %r8412, 30;
- shl.b32 %r4592, %r8411, 2;
- or.b32 %r4593, %r4591, %r4592;
- shr.u32 %r4594, %r4593, 31;
- shr.u32 %r4595, %r8411, 30;
- add.s32 %r4596, %r4594, %r4595;
- neg.s32 %r4597, %r4596;
- setp.eq.s32 %p639, %r4590, 0;
- selp.b32 %r8413, %r4596, %r4597, %p639;
- setp.ne.s32 %p640, %r4594, 0;
- xor.b32 %r4598, %r4590, -2147483648;
- selp.b32 %r4599, %r4598, %r4590, %p640;
- selp.b32 %r4600, -1, 0, %p640;
- xor.b32 %r4601, %r4593, %r4600;
- shl.b32 %r4602, %r8412, 2;
- xor.b32 %r4603, %r4602, %r4600;
- cvt.u64.u32 %rd1356, %r4601;
- cvt.u64.u32 %rd1357, %r4603;
- bfi.b64 %rd1358, %rd1356, %rd1357, 32, 32;
- cvt.rn.f64.s64 %fd93, %rd1358;
- mul.f64 %fd94, %fd93, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3263, %fd94;
- setp.eq.s32 %p641, %r4599, 0;
- neg.f32 %f3264, %f3263;
- selp.f32 %f5460, %f3263, %f3264, %p641;
-
-$L__BB0_739:
- and.b32 %r960, %r8413, 1;
- setp.eq.s32 %p642, %r960, 0;
- selp.f32 %f815, %f5460, 0f3F800000, %p642;
- mul.rn.f32 %f816, %f5460, %f5460;
- mov.f32 %f5461, 0fB94D4153;
- @%p642 bra $L__BB0_741;
-
- mov.f32 %f3267, 0fBAB607ED;
- mov.f32 %f3268, 0f37CBAC00;
- fma.rn.f32 %f5461, %f3268, %f816, %f3267;
-
-$L__BB0_741:
- selp.f32 %f3269, 0f3C0885E4, 0f3D2AAABB, %p642;
- fma.rn.f32 %f3270, %f5461, %f816, %f3269;
- selp.f32 %f3271, 0fBE2AAAA8, 0fBEFFFFFF, %p642;
- fma.rn.f32 %f3272, %f3270, %f816, %f3271;
- mov.f32 %f3273, 0f00000000;
- fma.rn.f32 %f3274, %f816, %f815, %f3273;
- fma.rn.f32 %f5213, %f3272, %f3274, %f815;
- and.b32 %r4605, %r8413, 2;
- setp.eq.s32 %p644, %r4605, 0;
- @%p644 bra $L__BB0_743;
-
- mov.f32 %f3276, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3276, %f3273;
-
-$L__BB0_743:
- setp.lt.s32 %p15, %r14, %r942;
- @%p634 bra $L__BB0_756;
-
- mul.f32 %f3277, %f5333, 0f3F22F983;
- cvt.rni.s32.f32 %r8417, %f3277;
- cvt.rn.f32.s32 %f3278, %r8417;
- mov.f32 %f3279, 0fBFC90FDA;
- fma.rn.f32 %f3280, %f3278, %f3279, %f5333;
- mov.f32 %f3281, 0fB3A22168;
- fma.rn.f32 %f3282, %f3278, %f3281, %f3280;
- mov.f32 %f3283, 0fA7C234C5;
- fma.rn.f32 %f5464, %f3278, %f3283, %f3282;
- abs.f32 %f824, %f5333;
- setp.ltu.f32 %p646, %f824, 0f47CE4780;
- @%p646 bra $L__BB0_752;
-
- setp.eq.f32 %p647, %f824, 0f7F800000;
- @%p647 bra $L__BB0_751;
- bra.uni $L__BB0_746;
-
-$L__BB0_751:
- mov.f32 %f3286, 0f00000000;
- mul.rn.f32 %f5464, %f5333, %f3286;
- mov.u32 %r8417, 0;
- bra.uni $L__BB0_752;
-
-$L__BB0_746:
- mov.b32 %r962, %f5333;
- shr.u32 %r4607, %r962, 23;
- and.b32 %r4608, %r4607, 255;
- add.s32 %r963, %r4608, -128;
- shl.b32 %r4609, %r962, 8;
- or.b32 %r964, %r4609, -2147483648;
- shr.u32 %r965, %r963, 5;
- mov.u64 %rd2592, 0;
- mov.u32 %r8414, 0;
- mov.u64 %rd1362, __cudart_i2opi_f;
- mov.u64 %rd2593, %rd2592;
-
-$L__BB0_747:
- .pragma "nounroll";
- shl.b64 %rd1361, %rd2592, 2;
- add.s64 %rd1363, %rd1362, %rd1361;
- ld.global.nc.u32 %r4610, [%rd1363];
- mad.wide.u32 %rd1364, %r4610, %r964, %rd2593;
- shr.u64 %rd2593, %rd1364, 32;
- add.s64 %rd1365, %rd1, %rd1361;
- st.local.u32 [%rd1365], %rd1364;
- add.s32 %r8414, %r8414, 1;
- cvt.s64.s32 %rd2592, %r8414;
- setp.ne.s32 %p648, %r8414, 6;
- @%p648 bra $L__BB0_747;
-
- st.local.u32 [%rd5], %rd2593;
- mov.u32 %r4611, 4;
- sub.s32 %r968, %r4611, %r965;
- mov.u32 %r4612, 6;
- sub.s32 %r4613, %r4612, %r965;
- mul.wide.s32 %rd1366, %r4613, 4;
- add.s64 %rd1367, %rd1, %rd1366;
- ld.local.u32 %r8415, [%rd1367];
- ld.local.u32 %r8416, [%rd1367+-4];
- and.b32 %r971, %r963, 31;
- setp.eq.s32 %p649, %r971, 0;
- @%p649 bra $L__BB0_750;
-
- mov.u32 %r4614, 32;
- sub.s32 %r4615, %r4614, %r971;
- shr.u32 %r4616, %r8416, %r4615;
- shl.b32 %r4617, %r8415, %r971;
- add.s32 %r8415, %r4616, %r4617;
- mul.wide.s32 %rd1368, %r968, 4;
- add.s64 %rd1369, %rd1, %rd1368;
- ld.local.u32 %r4618, [%rd1369];
- shr.u32 %r4619, %r4618, %r4615;
- shl.b32 %r4620, %r8416, %r971;
- add.s32 %r8416, %r4619, %r4620;
-
-$L__BB0_750:
- and.b32 %r4621, %r962, -2147483648;
- shr.u32 %r4622, %r8416, 30;
- shl.b32 %r4623, %r8415, 2;
- or.b32 %r4624, %r4622, %r4623;
- shr.u32 %r4625, %r4624, 31;
- shr.u32 %r4626, %r8415, 30;
- add.s32 %r4627, %r4625, %r4626;
- neg.s32 %r4628, %r4627;
- setp.eq.s32 %p650, %r4621, 0;
- selp.b32 %r8417, %r4627, %r4628, %p650;
- setp.ne.s32 %p651, %r4625, 0;
- xor.b32 %r4629, %r4621, -2147483648;
- selp.b32 %r4630, %r4629, %r4621, %p651;
- selp.b32 %r4631, -1, 0, %p651;
- xor.b32 %r4632, %r4624, %r4631;
- shl.b32 %r4633, %r8416, 2;
- xor.b32 %r4634, %r4633, %r4631;
- cvt.u64.u32 %rd1370, %r4632;
- cvt.u64.u32 %rd1371, %r4634;
- bfi.b64 %rd1372, %rd1370, %rd1371, 32, 32;
- cvt.rn.f64.s64 %fd95, %rd1372;
- mul.f64 %fd96, %fd95, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3284, %fd96;
- setp.eq.s32 %p652, %r4630, 0;
- neg.f32 %f3285, %f3284;
- selp.f32 %f5464, %f3284, %f3285, %p652;
-
-$L__BB0_752:
- add.s32 %r978, %r8417, 1;
- and.b32 %r979, %r978, 1;
- setp.eq.s32 %p653, %r979, 0;
- selp.f32 %f828, %f5464, 0f3F800000, %p653;
- mul.rn.f32 %f829, %f5464, %f5464;
- mov.f32 %f5465, 0fB94D4153;
- @%p653 bra $L__BB0_754;
-
- mov.f32 %f3288, 0fBAB607ED;
- mov.f32 %f3289, 0f37CBAC00;
- fma.rn.f32 %f5465, %f3289, %f829, %f3288;
-
-$L__BB0_754:
- selp.f32 %f3290, 0f3C0885E4, 0f3D2AAABB, %p653;
- fma.rn.f32 %f3291, %f5465, %f829, %f3290;
- selp.f32 %f3292, 0fBE2AAAA8, 0fBEFFFFFF, %p653;
- fma.rn.f32 %f3293, %f3291, %f829, %f3292;
- mov.f32 %f3294, 0f00000000;
- fma.rn.f32 %f3295, %f829, %f828, %f3294;
- fma.rn.f32 %f5215, %f3293, %f3295, %f828;
- and.b32 %r4636, %r978, 2;
- setp.eq.s32 %p655, %r4636, 0;
- @%p655 bra $L__BB0_756;
-
- mov.f32 %f3297, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3297, %f3294;
-
-$L__BB0_756:
- selp.f32 %f836, %f5215, %f5216, %p15;
- selp.f32 %f837, %f5213, %f5214, %p15;
- @%p634 bra $L__BB0_951;
-
- add.f32 %f5516, %f837, %f836;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_951:
- @%p32 bra $L__BB0_953;
-
- shl.b32 %r5208, %r12, 2;
- mov.u32 %r5209, -8;
- sub.s32 %r5210, %r5209, %r5208;
- add.s32 %r5211, %r13, -12;
- setp.lt.s32 %p819, %r5211, %r5210;
- @%p819 bra $L__BB0_1233;
- bra.uni $L__BB0_953;
-
-$L__BB0_1233:
- mov.u32 %r5938, %ctaid.x;
- shl.b32 %r5939, %r12, 5;
- add.s32 %r5940, %r5939, %r1;
- mul.hi.s32 %r5941, %r5940, -1840700269;
- add.s32 %r5942, %r5941, %r5940;
- shr.u32 %r5943, %r5942, 31;
- shr.s32 %r5944, %r5942, 2;
- add.s32 %r5945, %r5944, %r5943;
- mul.lo.s32 %r5946, %r5945, %r2615;
- mul.lo.s32 %r5947, %r5945, 7;
- sub.s32 %r5948, %r5940, %r5947;
- mul.lo.s32 %r5949, %r5948, %r2616;
- add.s32 %r5950, %r13, 3;
- mad.lo.s32 %r5951, %r2614, %r5938, %r2612;
- mad.lo.s32 %r5952, %r5950, %r2613, %r5951;
- add.s32 %r5953, %r5952, %r5946;
- add.s32 %r5954, %r5953, %r5949;
- mul.wide.s32 %rd1874, %r5954, 4;
- add.s64 %rd1875, %rd3, %rd1874;
- ld.global.f32 %f1382, [%rd1875];
- add.s32 %r5955, %r5940, 32;
- mul.hi.s32 %r5956, %r5955, -1840700269;
- add.s32 %r5957, %r5956, %r5955;
- shr.u32 %r5958, %r5957, 31;
- shr.s32 %r5959, %r5957, 2;
- add.s32 %r5960, %r5959, %r5958;
- mul.lo.s32 %r5961, %r5960, %r2615;
- mul.lo.s32 %r5962, %r5960, 7;
- sub.s32 %r5963, %r5955, %r5962;
- mul.lo.s32 %r5964, %r5963, %r2616;
- add.s32 %r5965, %r5952, %r5961;
- add.s32 %r5966, %r5965, %r5964;
- mul.wide.s32 %rd1876, %r5966, 4;
- add.s64 %rd1877, %rd3, %rd1876;
- ld.global.f32 %f1383, [%rd1877];
- add.s32 %r5967, %r5952, %r2613;
- add.s32 %r5968, %r5967, %r5946;
- add.s32 %r5969, %r5968, %r5949;
- mul.wide.s32 %rd1878, %r5969, 4;
- add.s64 %rd1879, %rd3, %rd1878;
- ld.global.f32 %f1384, [%rd1879];
- add.s32 %r5970, %r5967, %r5961;
- add.s32 %r5971, %r5970, %r5964;
- mul.wide.s32 %rd1880, %r5971, 4;
- add.s64 %rd1881, %rd3, %rd1880;
- ld.global.f32 %f1385, [%rd1881];
- add.s32 %r5972, %r5951, %r2612;
- mad.lo.s32 %r5973, %r13, %r2613, %r5972;
- add.s32 %r5974, %r5973, %r5946;
- add.s32 %r5975, %r5974, %r5949;
- mul.wide.s32 %rd1882, %r5975, 4;
- add.s64 %rd1883, %rd3, %rd1882;
- ld.global.f32 %f1386, [%rd1883];
- add.s32 %r5976, %r5973, %r5961;
- add.s32 %r5977, %r5976, %r5964;
- mul.wide.s32 %rd1884, %r5977, 4;
- add.s64 %rd1885, %rd3, %rd1884;
- ld.global.f32 %f1387, [%rd1885];
- add.s32 %r5978, %r5973, %r2613;
- add.s32 %r5979, %r5978, %r5946;
- add.s32 %r5980, %r5979, %r5949;
- mul.wide.s32 %rd1886, %r5980, 4;
- add.s64 %rd1887, %rd3, %rd1886;
- ld.global.f32 %f1388, [%rd1887];
- add.s32 %r5981, %r5978, %r5961;
- add.s32 %r5982, %r5981, %r5964;
- mul.wide.s32 %rd1888, %r5982, 4;
- add.s64 %rd1889, %rd3, %rd1888;
- ld.global.f32 %f1389, [%rd1889];
- mul.hi.s32 %r5983, %r5940, 954437177;
- shr.u32 %r5984, %r5983, 31;
- shr.s32 %r5985, %r5983, 1;
- add.s32 %r5986, %r5985, %r5984;
- mul.lo.s32 %r5987, %r5986, %r2605;
- mul.lo.s32 %r5988, %r5986, 9;
- sub.s32 %r5989, %r5940, %r5988;
- mul.lo.s32 %r5990, %r5989, %r2606;
- add.s32 %r5991, %r13, 2;
- shl.b32 %r5992, %r2602, 1;
- mad.lo.s32 %r5993, %r2604, %r5938, %r5992;
- mad.lo.s32 %r5994, %r5991, %r2603, %r5993;
- add.s32 %r5995, %r5994, %r5987;
- add.s32 %r5996, %r5995, %r5990;
- mul.wide.s32 %rd1890, %r5996, 4;
- add.s64 %rd1891, %rd2, %rd1890;
- ld.global.f32 %f1390, [%rd1891];
- mul.hi.s32 %r5997, %r5955, 954437177;
- shr.u32 %r5998, %r5997, 31;
- shr.s32 %r5999, %r5997, 1;
- add.s32 %r6000, %r5999, %r5998;
- mul.lo.s32 %r6001, %r6000, %r2605;
- mul.lo.s32 %r6002, %r6000, 9;
- sub.s32 %r6003, %r5955, %r6002;
- mul.lo.s32 %r6004, %r6003, %r2606;
- add.s32 %r6005, %r5994, %r6001;
- add.s32 %r6006, %r6005, %r6004;
- mul.wide.s32 %rd1892, %r6006, 4;
- add.s64 %rd1893, %rd2, %rd1892;
- ld.global.f32 %f1391, [%rd1893];
- add.s32 %r6007, %r5993, %r2602;
- mad.lo.s32 %r6008, %r13, %r2603, %r6007;
- add.s32 %r6009, %r6008, %r5987;
- add.s32 %r6010, %r6009, %r5990;
- mul.wide.s32 %rd1894, %r6010, 4;
- add.s64 %rd1895, %rd2, %rd1894;
- ld.global.f32 %f1392, [%rd1895];
- add.s32 %r6011, %r6008, %r6001;
- add.s32 %r6012, %r6011, %r6004;
- mul.wide.s32 %rd1896, %r6012, 4;
- add.s64 %rd1897, %rd2, %rd1896;
- ld.global.f32 %f1393, [%rd1897];
- mul.wide.s32 %rd1898, %r2603, 4;
- add.s64 %rd1899, %rd1895, %rd1898;
- ld.global.f32 %f1394, [%rd1899];
- add.s64 %rd1900, %rd1897, %rd1898;
- ld.global.f32 %f1395, [%rd1900];
- add.s64 %rd1901, %rd1899, %rd1898;
- ld.global.f32 %f1396, [%rd1901];
- add.s64 %rd1902, %rd1900, %rd1898;
- ld.global.f32 %f1397, [%rd1902];
- mul.f32 %f3987, %f1390, 0f3F22F983;
- cvt.rni.s32.f32 %r8549, %f3987;
- cvt.rn.f32.s32 %f3988, %r8549;
- mov.f32 %f3989, 0fBFC90FDA;
- fma.rn.f32 %f3990, %f3988, %f3989, %f1390;
- mov.f32 %f3991, 0fB3A22168;
- fma.rn.f32 %f3992, %f3988, %f3991, %f3990;
- mov.f32 %f3993, 0fA7C234C5;
- fma.rn.f32 %f5659, %f3988, %f3993, %f3992;
- abs.f32 %f1399, %f1390;
- setp.ltu.f32 %p1052, %f1399, 0f47CE4780;
- @%p1052 bra $L__BB0_1241;
-
- setp.eq.f32 %p1053, %f1399, 0f7F800000;
- @%p1053 bra $L__BB0_1240;
- bra.uni $L__BB0_1235;
-
-$L__BB0_1240:
- mov.f32 %f3996, 0f00000000;
- mul.rn.f32 %f5659, %f1390, %f3996;
- mov.u32 %r8549, 0;
- bra.uni $L__BB0_1241;
-
-$L__BB0_953:
- mov.u32 %r1276, %ctaid.x;
- mul.lo.s32 %r1277, %r2614, %r1276;
- add.s32 %r5212, %r13, -15;
- mov.u32 %r5213, -8;
- sub.s32 %r1278, %r5213, %r12;
- setp.ge.s32 %p820, %r5212, %r1278;
- add.s32 %r5214, %r13, 3;
- add.s32 %r5215, %r2612, %r1277;
- mad.lo.s32 %r1279, %r5214, %r2613, %r5215;
- @%p820 bra $L__BB0_956;
-
- shl.b32 %r1280, %r12, 5;
- neg.s32 %r5216, %r1280;
- setp.ge.s32 %p821, %r14, %r5216;
- @%p821 bra $L__BB0_956;
-
- add.s32 %r5217, %r1280, %r1;
- mul.hi.s32 %r5218, %r5217, -1840700269;
- add.s32 %r5219, %r5218, %r5217;
- shr.u32 %r5220, %r5219, 31;
- shr.s32 %r5221, %r5219, 2;
- add.s32 %r5222, %r5221, %r5220;
- mul.lo.s32 %r5223, %r5222, 7;
- sub.s32 %r5224, %r5217, %r5223;
- mad.lo.s32 %r5225, %r5222, %r2615, %r1279;
- mad.lo.s32 %r5226, %r5224, %r2616, %r5225;
- mul.wide.s32 %rd1626, %r5226, 4;
- add.s64 %rd1627, %rd3, %rd1626;
- ld.global.f32 %f5531, [%rd1627];
-
-$L__BB0_956:
- @%p820 bra $L__BB0_959;
-
- shl.b32 %r1281, %r12, 5;
- mov.u32 %r5228, -32;
- sub.s32 %r5229, %r5228, %r1281;
- setp.ge.s32 %p823, %r14, %r5229;
- @%p823 bra $L__BB0_959;
-
- add.s32 %r5230, %r1281, %r1;
- add.s32 %r5231, %r5230, 32;
- mul.hi.s32 %r5232, %r5231, -1840700269;
- add.s32 %r5233, %r5232, %r5231;
- shr.u32 %r5234, %r5233, 31;
- shr.s32 %r5235, %r5233, 2;
- add.s32 %r5236, %r5235, %r5234;
- mul.lo.s32 %r5237, %r5236, 7;
- sub.s32 %r5238, %r5231, %r5237;
- mad.lo.s32 %r5239, %r5236, %r2615, %r1279;
- mad.lo.s32 %r5240, %r5238, %r2616, %r5239;
- mul.wide.s32 %rd1628, %r5240, 4;
- add.s64 %rd1629, %rd3, %rd1628;
- ld.global.f32 %f5339, [%rd1629];
-
-$L__BB0_959:
- mov.u32 %r5242, -9;
- sub.s32 %r1282, %r5242, %r12;
- setp.ge.s32 %p824, %r5212, %r1282;
- add.s32 %r1283, %r1279, %r2613;
- @%p824 bra $L__BB0_962;
-
- shl.b32 %r1284, %r12, 5;
- neg.s32 %r5243, %r1284;
- setp.ge.s32 %p825, %r14, %r5243;
- @%p825 bra $L__BB0_962;
-
- add.s32 %r5244, %r1284, %r1;
- mul.hi.s32 %r5245, %r5244, -1840700269;
- add.s32 %r5246, %r5245, %r5244;
- shr.u32 %r5247, %r5246, 31;
- shr.s32 %r5248, %r5246, 2;
- add.s32 %r5249, %r5248, %r5247;
- mul.lo.s32 %r5250, %r5249, 7;
- sub.s32 %r5251, %r5244, %r5250;
- mad.lo.s32 %r5252, %r5249, %r2615, %r1283;
- mad.lo.s32 %r5253, %r5251, %r2616, %r5252;
- mul.wide.s32 %rd1630, %r5253, 4;
- add.s64 %rd1631, %rd3, %rd1630;
- ld.global.f32 %f5338, [%rd1631];
-
-$L__BB0_962:
- @%p824 bra $L__BB0_965;
-
- shl.b32 %r1285, %r12, 5;
- mov.u32 %r5255, -32;
- sub.s32 %r5256, %r5255, %r1285;
- setp.ge.s32 %p827, %r14, %r5256;
- @%p827 bra $L__BB0_965;
-
- add.s32 %r5257, %r1285, %r1;
- add.s32 %r5258, %r5257, 32;
- mul.hi.s32 %r5259, %r5258, -1840700269;
- add.s32 %r5260, %r5259, %r5258;
- shr.u32 %r5261, %r5260, 31;
- shr.s32 %r5262, %r5260, 2;
- add.s32 %r5263, %r5262, %r5261;
- mul.lo.s32 %r5264, %r5263, 7;
- sub.s32 %r5265, %r5258, %r5264;
- mad.lo.s32 %r5266, %r5263, %r2615, %r1283;
- mad.lo.s32 %r5267, %r5265, %r2616, %r5266;
- mul.wide.s32 %rd1632, %r5267, 4;
- add.s64 %rd1633, %rd3, %rd1632;
- ld.global.f32 %f5337, [%rd1633];
-
-$L__BB0_965:
- mov.u32 %r5269, -10;
- sub.s32 %r1286, %r5269, %r12;
- setp.ge.s32 %p828, %r5212, %r1286;
- shl.b32 %r5270, %r2612, 1;
- add.s32 %r5271, %r5270, %r1277;
- mad.lo.s32 %r1287, %r13, %r2613, %r5271;
- @%p828 bra $L__BB0_968;
-
- shl.b32 %r1288, %r12, 5;
- neg.s32 %r5272, %r1288;
- setp.ge.s32 %p829, %r14, %r5272;
- @%p829 bra $L__BB0_968;
-
- add.s32 %r5273, %r1288, %r1;
- mul.hi.s32 %r5274, %r5273, -1840700269;
- add.s32 %r5275, %r5274, %r5273;
- shr.u32 %r5276, %r5275, 31;
- shr.s32 %r5277, %r5275, 2;
- add.s32 %r5278, %r5277, %r5276;
- mul.lo.s32 %r5279, %r5278, 7;
- sub.s32 %r5280, %r5273, %r5279;
- mad.lo.s32 %r5281, %r5278, %r2615, %r1287;
- mad.lo.s32 %r5282, %r5280, %r2616, %r5281;
- mul.wide.s32 %rd1634, %r5282, 4;
- add.s64 %rd1635, %rd3, %rd1634;
- ld.global.f32 %f5336, [%rd1635];
-
-$L__BB0_968:
- @%p828 bra $L__BB0_971;
-
- shl.b32 %r1289, %r12, 5;
- mov.u32 %r5284, -32;
- sub.s32 %r5285, %r5284, %r1289;
- setp.ge.s32 %p831, %r14, %r5285;
- @%p831 bra $L__BB0_971;
-
- add.s32 %r5286, %r1289, %r1;
- add.s32 %r5287, %r5286, 32;
- mul.hi.s32 %r5288, %r5287, -1840700269;
- add.s32 %r5289, %r5288, %r5287;
- shr.u32 %r5290, %r5289, 31;
- shr.s32 %r5291, %r5289, 2;
- add.s32 %r5292, %r5291, %r5290;
- mul.lo.s32 %r5293, %r5292, 7;
- sub.s32 %r5294, %r5287, %r5293;
- mad.lo.s32 %r5295, %r5292, %r2615, %r1287;
- mad.lo.s32 %r5296, %r5294, %r2616, %r5295;
- mul.wide.s32 %rd1636, %r5296, 4;
- add.s64 %rd1637, %rd3, %rd1636;
- ld.global.f32 %f5335, [%rd1637];
-
-$L__BB0_971:
- mov.u32 %r5298, -11;
- sub.s32 %r1290, %r5298, %r12;
- setp.ge.s32 %p832, %r5212, %r1290;
- add.s32 %r1291, %r1287, %r2613;
- @%p832 bra $L__BB0_974;
-
- shl.b32 %r1292, %r12, 5;
- neg.s32 %r5299, %r1292;
- setp.ge.s32 %p833, %r14, %r5299;
- @%p833 bra $L__BB0_974;
-
- add.s32 %r5300, %r1292, %r1;
- mul.hi.s32 %r5301, %r5300, -1840700269;
- add.s32 %r5302, %r5301, %r5300;
- shr.u32 %r5303, %r5302, 31;
- shr.s32 %r5304, %r5302, 2;
- add.s32 %r5305, %r5304, %r5303;
- mul.lo.s32 %r5306, %r5305, 7;
- sub.s32 %r5307, %r5300, %r5306;
- mad.lo.s32 %r5308, %r5305, %r2615, %r1291;
- mad.lo.s32 %r5309, %r5307, %r2616, %r5308;
- mul.wide.s32 %rd1638, %r5309, 4;
- add.s64 %rd1639, %rd3, %rd1638;
- ld.global.f32 %f5334, [%rd1639];
-
-$L__BB0_974:
- @%p832 bra $L__BB0_977;
-
- shl.b32 %r1293, %r12, 5;
- mov.u32 %r5311, -32;
- sub.s32 %r5312, %r5311, %r1293;
- setp.ge.s32 %p835, %r14, %r5312;
- @%p835 bra $L__BB0_977;
-
- add.s32 %r5313, %r1293, %r1;
- add.s32 %r5314, %r5313, 32;
- mul.hi.s32 %r5315, %r5314, -1840700269;
- add.s32 %r5316, %r5315, %r5314;
- shr.u32 %r5317, %r5316, 31;
- shr.s32 %r5318, %r5316, 2;
- add.s32 %r5319, %r5318, %r5317;
- mul.lo.s32 %r5320, %r5319, 7;
- sub.s32 %r5321, %r5314, %r5320;
- mad.lo.s32 %r5322, %r5319, %r2615, %r1291;
- mad.lo.s32 %r5323, %r5321, %r2616, %r5322;
- mul.wide.s32 %rd1640, %r5323, 4;
- add.s64 %rd1641, %rd3, %rd1640;
- ld.global.f32 %f5333, [%rd1641];
-
-$L__BB0_977:
- add.s32 %r5325, %r13, 2;
- mul.lo.s32 %r1294, %r5325, %r2603;
- shl.b32 %r5326, %r2602, 1;
- mad.lo.s32 %r1295, %r2604, %r1276, %r5326;
- add.s32 %r1296, %r1295, %r1294;
- @%p820 bra $L__BB0_980;
-
- shl.b32 %r1297, %r12, 5;
- neg.s32 %r5327, %r1297;
- setp.ge.s32 %p837, %r14, %r5327;
- @%p837 bra $L__BB0_980;
-
- add.s32 %r5328, %r1297, %r1;
- mul.hi.s32 %r5329, %r5328, 954437177;
- shr.u32 %r5330, %r5329, 31;
- shr.s32 %r5331, %r5329, 1;
- add.s32 %r5332, %r5331, %r5330;
- mul.lo.s32 %r5333, %r5332, 9;
- sub.s32 %r5334, %r5328, %r5333;
- mad.lo.s32 %r5335, %r5332, %r2605, %r1296;
- mad.lo.s32 %r5336, %r5334, %r2606, %r5335;
- mul.wide.s32 %rd1642, %r5336, 4;
- add.s64 %rd1643, %rd2, %rd1642;
- ld.global.f32 %f5348, [%rd1643];
-
-$L__BB0_980:
- @%p820 bra $L__BB0_983;
-
- shl.b32 %r1298, %r12, 5;
- mov.u32 %r5338, -32;
- sub.s32 %r5339, %r5338, %r1298;
- setp.ge.s32 %p839, %r14, %r5339;
- @%p839 bra $L__BB0_983;
-
- add.s32 %r5340, %r1298, %r1;
- add.s32 %r5341, %r5340, 32;
- mul.hi.s32 %r5342, %r5341, 954437177;
- shr.u32 %r5343, %r5342, 31;
- shr.s32 %r5344, %r5342, 1;
- add.s32 %r5345, %r5344, %r5343;
- mul.lo.s32 %r5346, %r5345, 9;
- sub.s32 %r5347, %r5341, %r5346;
- mad.lo.s32 %r5348, %r5345, %r2605, %r1296;
- mad.lo.s32 %r5349, %r5347, %r2606, %r5348;
- mul.wide.s32 %rd1644, %r5349, 4;
- add.s64 %rd1645, %rd2, %rd1644;
- ld.global.f32 %f5347, [%rd1645];
-
-$L__BB0_983:
- add.s32 %r1299, %r1295, %r2602;
- mad.lo.s32 %r1300, %r13, %r2603, %r1299;
- @%p824 bra $L__BB0_986;
-
- shl.b32 %r1301, %r12, 5;
- neg.s32 %r5351, %r1301;
- setp.ge.s32 %p841, %r14, %r5351;
- @%p841 bra $L__BB0_986;
-
- add.s32 %r5352, %r1301, %r1;
- mul.hi.s32 %r5353, %r5352, 954437177;
- shr.u32 %r5354, %r5353, 31;
- shr.s32 %r5355, %r5353, 1;
- add.s32 %r5356, %r5355, %r5354;
- mul.lo.s32 %r5357, %r5356, 9;
- sub.s32 %r5358, %r5352, %r5357;
- mad.lo.s32 %r5359, %r5356, %r2605, %r1300;
- mad.lo.s32 %r5360, %r5358, %r2606, %r5359;
- mul.wide.s32 %rd1646, %r5360, 4;
- add.s64 %rd1647, %rd2, %rd1646;
- ld.global.f32 %f5346, [%rd1647];
-
-$L__BB0_986:
- @%p824 bra $L__BB0_989;
-
- shl.b32 %r1302, %r12, 5;
- mov.u32 %r5362, -32;
- sub.s32 %r5363, %r5362, %r1302;
- setp.ge.s32 %p843, %r14, %r5363;
- @%p843 bra $L__BB0_989;
-
- add.s32 %r5364, %r1302, %r1;
- add.s32 %r5365, %r5364, 32;
- mul.hi.s32 %r5366, %r5365, 954437177;
- shr.u32 %r5367, %r5366, 31;
- shr.s32 %r5368, %r5366, 1;
- add.s32 %r5369, %r5368, %r5367;
- mul.lo.s32 %r5370, %r5369, 9;
- sub.s32 %r5371, %r5365, %r5370;
- mad.lo.s32 %r5372, %r5369, %r2605, %r1300;
- mad.lo.s32 %r5373, %r5371, %r2606, %r5372;
- mul.wide.s32 %rd1648, %r5373, 4;
- add.s64 %rd1649, %rd2, %rd1648;
- ld.global.f32 %f5345, [%rd1649];
-
-$L__BB0_989:
- add.s32 %r1303, %r1300, %r2603;
- @%p828 bra $L__BB0_992;
-
- shl.b32 %r1304, %r12, 5;
- neg.s32 %r5375, %r1304;
- setp.ge.s32 %p845, %r14, %r5375;
- @%p845 bra $L__BB0_992;
-
- add.s32 %r5376, %r1304, %r1;
- mul.hi.s32 %r5377, %r5376, 954437177;
- shr.u32 %r5378, %r5377, 31;
- shr.s32 %r5379, %r5377, 1;
- add.s32 %r5380, %r5379, %r5378;
- mul.lo.s32 %r5381, %r5380, 9;
- sub.s32 %r5382, %r5376, %r5381;
- mad.lo.s32 %r5383, %r5380, %r2605, %r1303;
- mad.lo.s32 %r5384, %r5382, %r2606, %r5383;
- mul.wide.s32 %rd1650, %r5384, 4;
- add.s64 %rd1651, %rd2, %rd1650;
- ld.global.f32 %f5344, [%rd1651];
-
-$L__BB0_992:
- @%p828 bra $L__BB0_995;
-
- shl.b32 %r1305, %r12, 5;
- mov.u32 %r5386, -32;
- sub.s32 %r5387, %r5386, %r1305;
- setp.ge.s32 %p847, %r14, %r5387;
- @%p847 bra $L__BB0_995;
-
- add.s32 %r5388, %r1305, %r1;
- add.s32 %r5389, %r5388, 32;
- mul.hi.s32 %r5390, %r5389, 954437177;
- shr.u32 %r5391, %r5390, 31;
- shr.s32 %r5392, %r5390, 1;
- add.s32 %r5393, %r5392, %r5391;
- mul.lo.s32 %r5394, %r5393, 9;
- sub.s32 %r5395, %r5389, %r5394;
- mad.lo.s32 %r5396, %r5393, %r2605, %r1303;
- mad.lo.s32 %r5397, %r5395, %r2606, %r5396;
- mul.wide.s32 %rd1652, %r5397, 4;
- add.s64 %rd1653, %rd2, %rd1652;
- ld.global.f32 %f5343, [%rd1653];
-
-$L__BB0_995:
- add.s32 %r1306, %r1299, %r1294;
- @%p832 bra $L__BB0_998;
-
- shl.b32 %r1307, %r12, 5;
- neg.s32 %r5399, %r1307;
- setp.ge.s32 %p849, %r14, %r5399;
- @%p849 bra $L__BB0_998;
-
- add.s32 %r5400, %r1307, %r1;
- mul.hi.s32 %r5401, %r5400, 954437177;
- shr.u32 %r5402, %r5401, 31;
- shr.s32 %r5403, %r5401, 1;
- add.s32 %r5404, %r5403, %r5402;
- mul.lo.s32 %r5405, %r5404, 9;
- sub.s32 %r5406, %r5400, %r5405;
- mad.lo.s32 %r5407, %r5404, %r2605, %r1306;
- mad.lo.s32 %r5408, %r5406, %r2606, %r5407;
- mul.wide.s32 %rd1654, %r5408, 4;
- add.s64 %rd1655, %rd2, %rd1654;
- ld.global.f32 %f5342, [%rd1655];
-
-$L__BB0_998:
- @%p832 bra $L__BB0_1001;
-
- shl.b32 %r1308, %r12, 5;
- mov.u32 %r5410, -32;
- sub.s32 %r5411, %r5410, %r1308;
- setp.ge.s32 %p851, %r14, %r5411;
- @%p851 bra $L__BB0_1001;
-
- add.s32 %r5412, %r1308, %r1;
- add.s32 %r5413, %r5412, 32;
- mul.hi.s32 %r5414, %r5413, 954437177;
- shr.u32 %r5415, %r5414, 31;
- shr.s32 %r5416, %r5414, 1;
- add.s32 %r5417, %r5416, %r5415;
- mul.lo.s32 %r5418, %r5417, 9;
- sub.s32 %r5419, %r5413, %r5418;
- mad.lo.s32 %r5420, %r5417, %r2605, %r1306;
- mad.lo.s32 %r5421, %r5419, %r2606, %r5420;
- mul.wide.s32 %rd1656, %r5421, 4;
- add.s64 %rd1657, %rd2, %rd1656;
- ld.global.f32 %f5341, [%rd1657];
-
-$L__BB0_1001:
- @%p820 bra $L__BB0_1030;
-
- shl.b32 %r5423, %r12, 5;
- neg.s32 %r1309, %r5423;
- setp.ge.s32 %p853, %r14, %r1309;
- @%p853 bra $L__BB0_1015;
-
- mul.f32 %f3636, %f5348, 0f3F22F983;
- cvt.rni.s32.f32 %r8485, %f3636;
- cvt.rn.f32.s32 %f3637, %r8485;
- mov.f32 %f3638, 0fBFC90FDA;
- fma.rn.f32 %f3639, %f3637, %f3638, %f5348;
- mov.f32 %f3640, 0fB3A22168;
- fma.rn.f32 %f3641, %f3637, %f3640, %f3639;
- mov.f32 %f3642, 0fA7C234C5;
- fma.rn.f32 %f5560, %f3637, %f3642, %f3641;
- abs.f32 %f1116, %f5348;
- setp.ltu.f32 %p854, %f1116, 0f47CE4780;
- @%p854 bra $L__BB0_1011;
-
- setp.eq.f32 %p855, %f1116, 0f7F800000;
- @%p855 bra $L__BB0_1010;
- bra.uni $L__BB0_1005;
-
-$L__BB0_1010:
- mov.f32 %f3645, 0f00000000;
- mul.rn.f32 %f5560, %f5348, %f3645;
- mov.u32 %r8485, 0;
- bra.uni $L__BB0_1011;
-
-$L__BB0_1235:
- mov.b32 %r1614, %f1390;
- shr.u32 %r6014, %r1614, 23;
- and.b32 %r6015, %r6014, 255;
- add.s32 %r1615, %r6015, -128;
- shl.b32 %r6016, %r1614, 8;
- or.b32 %r1616, %r6016, -2147483648;
- shr.u32 %r1617, %r1615, 5;
- mov.u64 %rd2660, 0;
- mov.u32 %r8546, 0;
- mov.u64 %rd1906, __cudart_i2opi_f;
- mov.u64 %rd2661, %rd2660;
-
-$L__BB0_1236:
- .pragma "nounroll";
- shl.b64 %rd1905, %rd2660, 2;
- add.s64 %rd1907, %rd1906, %rd1905;
- ld.global.nc.u32 %r6017, [%rd1907];
- mad.wide.u32 %rd1908, %r6017, %r1616, %rd2661;
- shr.u64 %rd2661, %rd1908, 32;
- add.s64 %rd1909, %rd1, %rd1905;
- st.local.u32 [%rd1909], %rd1908;
- add.s32 %r8546, %r8546, 1;
- cvt.s64.s32 %rd2660, %r8546;
- setp.ne.s32 %p1054, %r8546, 6;
- @%p1054 bra $L__BB0_1236;
-
- st.local.u32 [%rd5], %rd2661;
- mov.u32 %r6018, 4;
- sub.s32 %r1620, %r6018, %r1617;
- mov.u32 %r6019, 6;
- sub.s32 %r6020, %r6019, %r1617;
- mul.wide.s32 %rd1910, %r6020, 4;
- add.s64 %rd1911, %rd1, %rd1910;
- ld.local.u32 %r8547, [%rd1911];
- ld.local.u32 %r8548, [%rd1911+-4];
- and.b32 %r1623, %r1615, 31;
- setp.eq.s32 %p1055, %r1623, 0;
- @%p1055 bra $L__BB0_1239;
-
- mov.u32 %r6021, 32;
- sub.s32 %r6022, %r6021, %r1623;
- shr.u32 %r6023, %r8548, %r6022;
- shl.b32 %r6024, %r8547, %r1623;
- add.s32 %r8547, %r6023, %r6024;
- mul.wide.s32 %rd1912, %r1620, 4;
- add.s64 %rd1913, %rd1, %rd1912;
- ld.local.u32 %r6025, [%rd1913];
- shr.u32 %r6026, %r6025, %r6022;
- shl.b32 %r6027, %r8548, %r1623;
- add.s32 %r8548, %r6026, %r6027;
-
-$L__BB0_1239:
- and.b32 %r6028, %r1614, -2147483648;
- shr.u32 %r6029, %r8548, 30;
- shl.b32 %r6030, %r8547, 2;
- or.b32 %r6031, %r6029, %r6030;
- shr.u32 %r6032, %r6031, 31;
- shr.u32 %r6033, %r8547, 30;
- add.s32 %r6034, %r6032, %r6033;
- neg.s32 %r6035, %r6034;
- setp.eq.s32 %p1056, %r6028, 0;
- selp.b32 %r8549, %r6034, %r6035, %p1056;
- setp.ne.s32 %p1057, %r6032, 0;
- xor.b32 %r6036, %r6028, -2147483648;
- selp.b32 %r6037, %r6036, %r6028, %p1057;
- selp.b32 %r6038, -1, 0, %p1057;
- xor.b32 %r6039, %r6031, %r6038;
- shl.b32 %r6040, %r8548, 2;
- xor.b32 %r6041, %r6040, %r6038;
- cvt.u64.u32 %rd1914, %r6039;
- cvt.u64.u32 %rd1915, %r6041;
- bfi.b64 %rd1916, %rd1914, %rd1915, 32, 32;
- cvt.rn.f64.s64 %fd161, %rd1916;
- mul.f64 %fd162, %fd161, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3994, %fd162;
- setp.eq.s32 %p1058, %r6037, 0;
- neg.f32 %f3995, %f3994;
- selp.f32 %f5659, %f3994, %f3995, %p1058;
-
-$L__BB0_1241:
- and.b32 %r1630, %r8549, 1;
- setp.eq.s32 %p1059, %r1630, 0;
- selp.f32 %f1403, %f5659, 0f3F800000, %p1059;
- mul.rn.f32 %f1404, %f5659, %f5659;
- mov.f32 %f5660, 0fB94D4153;
- @%p1059 bra $L__BB0_1243;
-
- mov.f32 %f3998, 0fBAB607ED;
- mov.f32 %f3999, 0f37CBAC00;
- fma.rn.f32 %f5660, %f3999, %f1404, %f3998;
-
-$L__BB0_1243:
- selp.f32 %f4000, 0f3C0885E4, 0f3D2AAABB, %p1059;
- fma.rn.f32 %f4001, %f5660, %f1404, %f4000;
- selp.f32 %f4002, 0fBE2AAAA8, 0fBEFFFFFF, %p1059;
- fma.rn.f32 %f4003, %f4001, %f1404, %f4002;
- mov.f32 %f4004, 0f00000000;
- fma.rn.f32 %f4005, %f1404, %f1403, %f4004;
- fma.rn.f32 %f5661, %f4003, %f4005, %f1403;
- and.b32 %r6043, %r8549, 2;
- setp.eq.s32 %p1061, %r6043, 0;
- @%p1061 bra $L__BB0_1245;
-
- mov.f32 %f4007, 0fBF800000;
- fma.rn.f32 %f5661, %f5661, %f4007, %f4004;
-
-$L__BB0_1245:
- mul.f32 %f4008, %f1382, 0f3F22F983;
- cvt.rni.s32.f32 %r8553, %f4008;
- cvt.rn.f32.s32 %f4009, %r8553;
- mov.f32 %f4010, 0fBFC90FDA;
- fma.rn.f32 %f4011, %f4009, %f4010, %f1382;
- mov.f32 %f4012, 0fB3A22168;
- fma.rn.f32 %f4013, %f4009, %f4012, %f4011;
- mov.f32 %f4014, 0fA7C234C5;
- fma.rn.f32 %f5662, %f4009, %f4014, %f4013;
- abs.f32 %f1411, %f1382;
- setp.ltu.f32 %p1062, %f1411, 0f47CE4780;
- @%p1062 bra $L__BB0_1253;
-
- setp.eq.f32 %p1063, %f1411, 0f7F800000;
- @%p1063 bra $L__BB0_1252;
- bra.uni $L__BB0_1247;
-
-$L__BB0_1252:
- mov.f32 %f4017, 0f00000000;
- mul.rn.f32 %f5662, %f1382, %f4017;
- mov.u32 %r8553, 0;
- bra.uni $L__BB0_1253;
-
-$L__BB0_1247:
- mov.b32 %r1632, %f1382;
- shr.u32 %r6045, %r1632, 23;
- and.b32 %r6046, %r6045, 255;
- add.s32 %r1633, %r6046, -128;
- shl.b32 %r6047, %r1632, 8;
- or.b32 %r1634, %r6047, -2147483648;
- shr.u32 %r1635, %r1633, 5;
- mov.u64 %rd2662, 0;
- mov.u32 %r8550, 0;
- mov.u64 %rd1920, __cudart_i2opi_f;
- mov.u64 %rd2663, %rd2662;
-
-$L__BB0_1248:
- .pragma "nounroll";
- shl.b64 %rd1919, %rd2662, 2;
- add.s64 %rd1921, %rd1920, %rd1919;
- ld.global.nc.u32 %r6048, [%rd1921];
- mad.wide.u32 %rd1922, %r6048, %r1634, %rd2663;
- shr.u64 %rd2663, %rd1922, 32;
- add.s64 %rd1923, %rd1, %rd1919;
- st.local.u32 [%rd1923], %rd1922;
- add.s32 %r8550, %r8550, 1;
- cvt.s64.s32 %rd2662, %r8550;
- setp.ne.s32 %p1064, %r8550, 6;
- @%p1064 bra $L__BB0_1248;
-
- st.local.u32 [%rd5], %rd2663;
- mov.u32 %r6049, 4;
- sub.s32 %r1638, %r6049, %r1635;
- mov.u32 %r6050, 6;
- sub.s32 %r6051, %r6050, %r1635;
- mul.wide.s32 %rd1924, %r6051, 4;
- add.s64 %rd1925, %rd1, %rd1924;
- ld.local.u32 %r8551, [%rd1925];
- ld.local.u32 %r8552, [%rd1925+-4];
- and.b32 %r1641, %r1633, 31;
- setp.eq.s32 %p1065, %r1641, 0;
- @%p1065 bra $L__BB0_1251;
-
- mov.u32 %r6052, 32;
- sub.s32 %r6053, %r6052, %r1641;
- shr.u32 %r6054, %r8552, %r6053;
- shl.b32 %r6055, %r8551, %r1641;
- add.s32 %r8551, %r6054, %r6055;
- mul.wide.s32 %rd1926, %r1638, 4;
- add.s64 %rd1927, %rd1, %rd1926;
- ld.local.u32 %r6056, [%rd1927];
- shr.u32 %r6057, %r6056, %r6053;
- shl.b32 %r6058, %r8552, %r1641;
- add.s32 %r8552, %r6057, %r6058;
-
-$L__BB0_1251:
- and.b32 %r6059, %r1632, -2147483648;
- shr.u32 %r6060, %r8552, 30;
- shl.b32 %r6061, %r8551, 2;
- or.b32 %r6062, %r6060, %r6061;
- shr.u32 %r6063, %r6062, 31;
- shr.u32 %r6064, %r8551, 30;
- add.s32 %r6065, %r6063, %r6064;
- neg.s32 %r6066, %r6065;
- setp.eq.s32 %p1066, %r6059, 0;
- selp.b32 %r8553, %r6065, %r6066, %p1066;
- setp.ne.s32 %p1067, %r6063, 0;
- xor.b32 %r6067, %r6059, -2147483648;
- selp.b32 %r6068, %r6067, %r6059, %p1067;
- selp.b32 %r6069, -1, 0, %p1067;
- xor.b32 %r6070, %r6062, %r6069;
- shl.b32 %r6071, %r8552, 2;
- xor.b32 %r6072, %r6071, %r6069;
- cvt.u64.u32 %rd1928, %r6070;
- cvt.u64.u32 %rd1929, %r6072;
- bfi.b64 %rd1930, %rd1928, %rd1929, 32, 32;
- cvt.rn.f64.s64 %fd163, %rd1930;
- mul.f64 %fd164, %fd163, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4015, %fd164;
- setp.eq.s32 %p1068, %r6068, 0;
- neg.f32 %f4016, %f4015;
- selp.f32 %f5662, %f4015, %f4016, %p1068;
-
-$L__BB0_1253:
- add.s32 %r1648, %r8553, 1;
- and.b32 %r1649, %r1648, 1;
- setp.eq.s32 %p1069, %r1649, 0;
- selp.f32 %f1415, %f5662, 0f3F800000, %p1069;
- mul.rn.f32 %f1416, %f5662, %f5662;
- mov.f32 %f5663, 0fB94D4153;
- @%p1069 bra $L__BB0_1255;
-
- mov.f32 %f4019, 0fBAB607ED;
- mov.f32 %f4020, 0f37CBAC00;
- fma.rn.f32 %f5663, %f4020, %f1416, %f4019;
-
-$L__BB0_1255:
- selp.f32 %f4021, 0f3C0885E4, 0f3D2AAABB, %p1069;
- fma.rn.f32 %f4022, %f5663, %f1416, %f4021;
- selp.f32 %f4023, 0fBE2AAAA8, 0fBEFFFFFF, %p1069;
- fma.rn.f32 %f4024, %f4022, %f1416, %f4023;
- mov.f32 %f4025, 0f00000000;
- fma.rn.f32 %f4026, %f1416, %f1415, %f4025;
- fma.rn.f32 %f5664, %f4024, %f4026, %f1415;
- and.b32 %r6074, %r1648, 2;
- setp.eq.s32 %p1071, %r6074, 0;
- @%p1071 bra $L__BB0_1257;
-
- mov.f32 %f4028, 0fBF800000;
- fma.rn.f32 %f5664, %f5664, %f4028, %f4025;
-
-$L__BB0_1257:
- add.f32 %f5714, %f5661, %f5664;
- mul.f32 %f4029, %f1391, 0f3F22F983;
- cvt.rni.s32.f32 %r8557, %f4029;
- cvt.rn.f32.s32 %f4030, %r8557;
- mov.f32 %f4031, 0fBFC90FDA;
- fma.rn.f32 %f4032, %f4030, %f4031, %f1391;
- mov.f32 %f4033, 0fB3A22168;
- fma.rn.f32 %f4034, %f4030, %f4033, %f4032;
- mov.f32 %f4035, 0fA7C234C5;
- fma.rn.f32 %f5665, %f4030, %f4035, %f4034;
- abs.f32 %f1424, %f1391;
- setp.ltu.f32 %p1072, %f1424, 0f47CE4780;
- @%p1072 bra $L__BB0_1265;
-
- setp.eq.f32 %p1073, %f1424, 0f7F800000;
- @%p1073 bra $L__BB0_1264;
- bra.uni $L__BB0_1259;
-
-$L__BB0_1264:
- mov.f32 %f4038, 0f00000000;
- mul.rn.f32 %f5665, %f1391, %f4038;
- mov.u32 %r8557, 0;
- bra.uni $L__BB0_1265;
-
-$L__BB0_1259:
- mov.b32 %r1651, %f1391;
- shr.u32 %r6076, %r1651, 23;
- and.b32 %r6077, %r6076, 255;
- add.s32 %r1652, %r6077, -128;
- shl.b32 %r6078, %r1651, 8;
- or.b32 %r1653, %r6078, -2147483648;
- shr.u32 %r1654, %r1652, 5;
- mov.u64 %rd2664, 0;
- mov.u32 %r8554, 0;
- mov.u64 %rd1934, __cudart_i2opi_f;
- mov.u64 %rd2665, %rd2664;
-
-$L__BB0_1260:
- .pragma "nounroll";
- shl.b64 %rd1933, %rd2664, 2;
- add.s64 %rd1935, %rd1934, %rd1933;
- ld.global.nc.u32 %r6079, [%rd1935];
- mad.wide.u32 %rd1936, %r6079, %r1653, %rd2665;
- shr.u64 %rd2665, %rd1936, 32;
- add.s64 %rd1937, %rd1, %rd1933;
- st.local.u32 [%rd1937], %rd1936;
- add.s32 %r8554, %r8554, 1;
- cvt.s64.s32 %rd2664, %r8554;
- setp.ne.s32 %p1074, %r8554, 6;
- @%p1074 bra $L__BB0_1260;
-
- st.local.u32 [%rd5], %rd2665;
- mov.u32 %r6080, 4;
- sub.s32 %r1657, %r6080, %r1654;
- mov.u32 %r6081, 6;
- sub.s32 %r6082, %r6081, %r1654;
- mul.wide.s32 %rd1938, %r6082, 4;
- add.s64 %rd1939, %rd1, %rd1938;
- ld.local.u32 %r8555, [%rd1939];
- ld.local.u32 %r8556, [%rd1939+-4];
- and.b32 %r1660, %r1652, 31;
- setp.eq.s32 %p1075, %r1660, 0;
- @%p1075 bra $L__BB0_1263;
-
- mov.u32 %r6083, 32;
- sub.s32 %r6084, %r6083, %r1660;
- shr.u32 %r6085, %r8556, %r6084;
- shl.b32 %r6086, %r8555, %r1660;
- add.s32 %r8555, %r6085, %r6086;
- mul.wide.s32 %rd1940, %r1657, 4;
- add.s64 %rd1941, %rd1, %rd1940;
- ld.local.u32 %r6087, [%rd1941];
- shr.u32 %r6088, %r6087, %r6084;
- shl.b32 %r6089, %r8556, %r1660;
- add.s32 %r8556, %r6088, %r6089;
-
-$L__BB0_1263:
- and.b32 %r6090, %r1651, -2147483648;
- shr.u32 %r6091, %r8556, 30;
- shl.b32 %r6092, %r8555, 2;
- or.b32 %r6093, %r6091, %r6092;
- shr.u32 %r6094, %r6093, 31;
- shr.u32 %r6095, %r8555, 30;
- add.s32 %r6096, %r6094, %r6095;
- neg.s32 %r6097, %r6096;
- setp.eq.s32 %p1076, %r6090, 0;
- selp.b32 %r8557, %r6096, %r6097, %p1076;
- setp.ne.s32 %p1077, %r6094, 0;
- xor.b32 %r6098, %r6090, -2147483648;
- selp.b32 %r6099, %r6098, %r6090, %p1077;
- selp.b32 %r6100, -1, 0, %p1077;
- xor.b32 %r6101, %r6093, %r6100;
- shl.b32 %r6102, %r8556, 2;
- xor.b32 %r6103, %r6102, %r6100;
- cvt.u64.u32 %rd1942, %r6101;
- cvt.u64.u32 %rd1943, %r6103;
- bfi.b64 %rd1944, %rd1942, %rd1943, 32, 32;
- cvt.rn.f64.s64 %fd165, %rd1944;
- mul.f64 %fd166, %fd165, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4036, %fd166;
- setp.eq.s32 %p1078, %r6099, 0;
- neg.f32 %f4037, %f4036;
- selp.f32 %f5665, %f4036, %f4037, %p1078;
-
-$L__BB0_1265:
- and.b32 %r1667, %r8557, 1;
- setp.eq.s32 %p1079, %r1667, 0;
- selp.f32 %f1428, %f5665, 0f3F800000, %p1079;
- mul.rn.f32 %f1429, %f5665, %f5665;
- mov.f32 %f5666, 0fB94D4153;
- @%p1079 bra $L__BB0_1267;
-
- mov.f32 %f4040, 0fBAB607ED;
- mov.f32 %f4041, 0f37CBAC00;
- fma.rn.f32 %f5666, %f4041, %f1429, %f4040;
-
-$L__BB0_1267:
- selp.f32 %f4042, 0f3C0885E4, 0f3D2AAABB, %p1079;
- fma.rn.f32 %f4043, %f5666, %f1429, %f4042;
- selp.f32 %f4044, 0fBE2AAAA8, 0fBEFFFFFF, %p1079;
- fma.rn.f32 %f4045, %f4043, %f1429, %f4044;
- mov.f32 %f4046, 0f00000000;
- fma.rn.f32 %f4047, %f1429, %f1428, %f4046;
- fma.rn.f32 %f5667, %f4045, %f4047, %f1428;
- and.b32 %r6105, %r8557, 2;
- setp.eq.s32 %p1081, %r6105, 0;
- @%p1081 bra $L__BB0_1269;
-
- mov.f32 %f4049, 0fBF800000;
- fma.rn.f32 %f5667, %f5667, %f4049, %f4046;
-
-$L__BB0_1269:
- mul.f32 %f4050, %f1383, 0f3F22F983;
- cvt.rni.s32.f32 %r8561, %f4050;
- cvt.rn.f32.s32 %f4051, %r8561;
- mov.f32 %f4052, 0fBFC90FDA;
- fma.rn.f32 %f4053, %f4051, %f4052, %f1383;
- mov.f32 %f4054, 0fB3A22168;
- fma.rn.f32 %f4055, %f4051, %f4054, %f4053;
- mov.f32 %f4056, 0fA7C234C5;
- fma.rn.f32 %f5668, %f4051, %f4056, %f4055;
- abs.f32 %f1436, %f1383;
- setp.ltu.f32 %p1082, %f1436, 0f47CE4780;
- @%p1082 bra $L__BB0_1277;
-
- setp.eq.f32 %p1083, %f1436, 0f7F800000;
- @%p1083 bra $L__BB0_1276;
- bra.uni $L__BB0_1271;
-
-$L__BB0_1276:
- mov.f32 %f4059, 0f00000000;
- mul.rn.f32 %f5668, %f1383, %f4059;
- mov.u32 %r8561, 0;
- bra.uni $L__BB0_1277;
-
-$L__BB0_1271:
- mov.b32 %r1669, %f1383;
- shr.u32 %r6107, %r1669, 23;
- and.b32 %r6108, %r6107, 255;
- add.s32 %r1670, %r6108, -128;
- shl.b32 %r6109, %r1669, 8;
- or.b32 %r1671, %r6109, -2147483648;
- shr.u32 %r1672, %r1670, 5;
- mov.u64 %rd2666, 0;
- mov.u32 %r8558, 0;
- mov.u64 %rd1948, __cudart_i2opi_f;
- mov.u64 %rd2667, %rd2666;
-
-$L__BB0_1272:
- .pragma "nounroll";
- shl.b64 %rd1947, %rd2666, 2;
- add.s64 %rd1949, %rd1948, %rd1947;
- ld.global.nc.u32 %r6110, [%rd1949];
- mad.wide.u32 %rd1950, %r6110, %r1671, %rd2667;
- shr.u64 %rd2667, %rd1950, 32;
- add.s64 %rd1951, %rd1, %rd1947;
- st.local.u32 [%rd1951], %rd1950;
- add.s32 %r8558, %r8558, 1;
- cvt.s64.s32 %rd2666, %r8558;
- setp.ne.s32 %p1084, %r8558, 6;
- @%p1084 bra $L__BB0_1272;
-
- st.local.u32 [%rd5], %rd2667;
- mov.u32 %r6111, 4;
- sub.s32 %r1675, %r6111, %r1672;
- mov.u32 %r6112, 6;
- sub.s32 %r6113, %r6112, %r1672;
- mul.wide.s32 %rd1952, %r6113, 4;
- add.s64 %rd1953, %rd1, %rd1952;
- ld.local.u32 %r8559, [%rd1953];
- ld.local.u32 %r8560, [%rd1953+-4];
- and.b32 %r1678, %r1670, 31;
- setp.eq.s32 %p1085, %r1678, 0;
- @%p1085 bra $L__BB0_1275;
-
- mov.u32 %r6114, 32;
- sub.s32 %r6115, %r6114, %r1678;
- shr.u32 %r6116, %r8560, %r6115;
- shl.b32 %r6117, %r8559, %r1678;
- add.s32 %r8559, %r6116, %r6117;
- mul.wide.s32 %rd1954, %r1675, 4;
- add.s64 %rd1955, %rd1, %rd1954;
- ld.local.u32 %r6118, [%rd1955];
- shr.u32 %r6119, %r6118, %r6115;
- shl.b32 %r6120, %r8560, %r1678;
- add.s32 %r8560, %r6119, %r6120;
-
-$L__BB0_1275:
- and.b32 %r6121, %r1669, -2147483648;
- shr.u32 %r6122, %r8560, 30;
- shl.b32 %r6123, %r8559, 2;
- or.b32 %r6124, %r6122, %r6123;
- shr.u32 %r6125, %r6124, 31;
- shr.u32 %r6126, %r8559, 30;
- add.s32 %r6127, %r6125, %r6126;
- neg.s32 %r6128, %r6127;
- setp.eq.s32 %p1086, %r6121, 0;
- selp.b32 %r8561, %r6127, %r6128, %p1086;
- setp.ne.s32 %p1087, %r6125, 0;
- xor.b32 %r6129, %r6121, -2147483648;
- selp.b32 %r6130, %r6129, %r6121, %p1087;
- selp.b32 %r6131, -1, 0, %p1087;
- xor.b32 %r6132, %r6124, %r6131;
- shl.b32 %r6133, %r8560, 2;
- xor.b32 %r6134, %r6133, %r6131;
- cvt.u64.u32 %rd1956, %r6132;
- cvt.u64.u32 %rd1957, %r6134;
- bfi.b64 %rd1958, %rd1956, %rd1957, 32, 32;
- cvt.rn.f64.s64 %fd167, %rd1958;
- mul.f64 %fd168, %fd167, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4057, %fd168;
- setp.eq.s32 %p1088, %r6130, 0;
- neg.f32 %f4058, %f4057;
- selp.f32 %f5668, %f4057, %f4058, %p1088;
-
-$L__BB0_1277:
- add.s32 %r1685, %r8561, 1;
- and.b32 %r1686, %r1685, 1;
- setp.eq.s32 %p1089, %r1686, 0;
- selp.f32 %f1440, %f5668, 0f3F800000, %p1089;
- mul.rn.f32 %f1441, %f5668, %f5668;
- mov.f32 %f5669, 0fB94D4153;
- @%p1089 bra $L__BB0_1279;
-
- mov.f32 %f4061, 0fBAB607ED;
- mov.f32 %f4062, 0f37CBAC00;
- fma.rn.f32 %f5669, %f4062, %f1441, %f4061;
-
-$L__BB0_1279:
- selp.f32 %f4063, 0f3C0885E4, 0f3D2AAABB, %p1089;
- fma.rn.f32 %f4064, %f5669, %f1441, %f4063;
- selp.f32 %f4065, 0fBE2AAAA8, 0fBEFFFFFF, %p1089;
- fma.rn.f32 %f4066, %f4064, %f1441, %f4065;
- mov.f32 %f4067, 0f00000000;
- fma.rn.f32 %f4068, %f1441, %f1440, %f4067;
- fma.rn.f32 %f5670, %f4066, %f4068, %f1440;
- and.b32 %r6136, %r1685, 2;
- setp.eq.s32 %p1091, %r6136, 0;
- @%p1091 bra $L__BB0_1281;
-
- mov.f32 %f4070, 0fBF800000;
- fma.rn.f32 %f5670, %f5670, %f4070, %f4067;
-
-$L__BB0_1281:
- add.f32 %f5713, %f5667, %f5670;
- mul.f32 %f4071, %f1392, 0f3F22F983;
- cvt.rni.s32.f32 %r8565, %f4071;
- cvt.rn.f32.s32 %f4072, %r8565;
- mov.f32 %f4073, 0fBFC90FDA;
- fma.rn.f32 %f4074, %f4072, %f4073, %f1392;
- mov.f32 %f4075, 0fB3A22168;
- fma.rn.f32 %f4076, %f4072, %f4075, %f4074;
- mov.f32 %f4077, 0fA7C234C5;
- fma.rn.f32 %f5671, %f4072, %f4077, %f4076;
- abs.f32 %f1449, %f1392;
- setp.ltu.f32 %p1092, %f1449, 0f47CE4780;
- @%p1092 bra $L__BB0_1289;
-
- setp.eq.f32 %p1093, %f1449, 0f7F800000;
- @%p1093 bra $L__BB0_1288;
- bra.uni $L__BB0_1283;
-
-$L__BB0_1288:
- mov.f32 %f4080, 0f00000000;
- mul.rn.f32 %f5671, %f1392, %f4080;
- mov.u32 %r8565, 0;
- bra.uni $L__BB0_1289;
-
-$L__BB0_1283:
- mov.b32 %r1688, %f1392;
- shr.u32 %r6138, %r1688, 23;
- and.b32 %r6139, %r6138, 255;
- add.s32 %r1689, %r6139, -128;
- shl.b32 %r6140, %r1688, 8;
- or.b32 %r1690, %r6140, -2147483648;
- shr.u32 %r1691, %r1689, 5;
- mov.u64 %rd2668, 0;
- mov.u32 %r8562, 0;
- mov.u64 %rd1962, __cudart_i2opi_f;
- mov.u64 %rd2669, %rd2668;
-
-$L__BB0_1284:
- .pragma "nounroll";
- shl.b64 %rd1961, %rd2668, 2;
- add.s64 %rd1963, %rd1962, %rd1961;
- ld.global.nc.u32 %r6141, [%rd1963];
- mad.wide.u32 %rd1964, %r6141, %r1690, %rd2669;
- shr.u64 %rd2669, %rd1964, 32;
- add.s64 %rd1965, %rd1, %rd1961;
- st.local.u32 [%rd1965], %rd1964;
- add.s32 %r8562, %r8562, 1;
- cvt.s64.s32 %rd2668, %r8562;
- setp.ne.s32 %p1094, %r8562, 6;
- @%p1094 bra $L__BB0_1284;
-
- st.local.u32 [%rd5], %rd2669;
- mov.u32 %r6142, 4;
- sub.s32 %r1694, %r6142, %r1691;
- mov.u32 %r6143, 6;
- sub.s32 %r6144, %r6143, %r1691;
- mul.wide.s32 %rd1966, %r6144, 4;
- add.s64 %rd1967, %rd1, %rd1966;
- ld.local.u32 %r8563, [%rd1967];
- ld.local.u32 %r8564, [%rd1967+-4];
- and.b32 %r1697, %r1689, 31;
- setp.eq.s32 %p1095, %r1697, 0;
- @%p1095 bra $L__BB0_1287;
-
- mov.u32 %r6145, 32;
- sub.s32 %r6146, %r6145, %r1697;
- shr.u32 %r6147, %r8564, %r6146;
- shl.b32 %r6148, %r8563, %r1697;
- add.s32 %r8563, %r6147, %r6148;
- mul.wide.s32 %rd1968, %r1694, 4;
- add.s64 %rd1969, %rd1, %rd1968;
- ld.local.u32 %r6149, [%rd1969];
- shr.u32 %r6150, %r6149, %r6146;
- shl.b32 %r6151, %r8564, %r1697;
- add.s32 %r8564, %r6150, %r6151;
-
-$L__BB0_1287:
- and.b32 %r6152, %r1688, -2147483648;
- shr.u32 %r6153, %r8564, 30;
- shl.b32 %r6154, %r8563, 2;
- or.b32 %r6155, %r6153, %r6154;
- shr.u32 %r6156, %r6155, 31;
- shr.u32 %r6157, %r8563, 30;
- add.s32 %r6158, %r6156, %r6157;
- neg.s32 %r6159, %r6158;
- setp.eq.s32 %p1096, %r6152, 0;
- selp.b32 %r8565, %r6158, %r6159, %p1096;
- setp.ne.s32 %p1097, %r6156, 0;
- xor.b32 %r6160, %r6152, -2147483648;
- selp.b32 %r6161, %r6160, %r6152, %p1097;
- selp.b32 %r6162, -1, 0, %p1097;
- xor.b32 %r6163, %r6155, %r6162;
- shl.b32 %r6164, %r8564, 2;
- xor.b32 %r6165, %r6164, %r6162;
- cvt.u64.u32 %rd1970, %r6163;
- cvt.u64.u32 %rd1971, %r6165;
- bfi.b64 %rd1972, %rd1970, %rd1971, 32, 32;
- cvt.rn.f64.s64 %fd169, %rd1972;
- mul.f64 %fd170, %fd169, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4078, %fd170;
- setp.eq.s32 %p1098, %r6161, 0;
- neg.f32 %f4079, %f4078;
- selp.f32 %f5671, %f4078, %f4079, %p1098;
-
-$L__BB0_1289:
- and.b32 %r1704, %r8565, 1;
- setp.eq.s32 %p1099, %r1704, 0;
- selp.f32 %f1453, %f5671, 0f3F800000, %p1099;
- mul.rn.f32 %f1454, %f5671, %f5671;
- mov.f32 %f5672, 0fB94D4153;
- @%p1099 bra $L__BB0_1291;
-
- mov.f32 %f4082, 0fBAB607ED;
- mov.f32 %f4083, 0f37CBAC00;
- fma.rn.f32 %f5672, %f4083, %f1454, %f4082;
-
-$L__BB0_1291:
- selp.f32 %f4084, 0f3C0885E4, 0f3D2AAABB, %p1099;
- fma.rn.f32 %f4085, %f5672, %f1454, %f4084;
- selp.f32 %f4086, 0fBE2AAAA8, 0fBEFFFFFF, %p1099;
- fma.rn.f32 %f4087, %f4085, %f1454, %f4086;
- mov.f32 %f4088, 0f00000000;
- fma.rn.f32 %f4089, %f1454, %f1453, %f4088;
- fma.rn.f32 %f5673, %f4087, %f4089, %f1453;
- and.b32 %r6167, %r8565, 2;
- setp.eq.s32 %p1101, %r6167, 0;
- @%p1101 bra $L__BB0_1293;
-
- mov.f32 %f4091, 0fBF800000;
- fma.rn.f32 %f5673, %f5673, %f4091, %f4088;
-
-$L__BB0_1293:
- mul.f32 %f4092, %f1384, 0f3F22F983;
- cvt.rni.s32.f32 %r8569, %f4092;
- cvt.rn.f32.s32 %f4093, %r8569;
- mov.f32 %f4094, 0fBFC90FDA;
- fma.rn.f32 %f4095, %f4093, %f4094, %f1384;
- mov.f32 %f4096, 0fB3A22168;
- fma.rn.f32 %f4097, %f4093, %f4096, %f4095;
- mov.f32 %f4098, 0fA7C234C5;
- fma.rn.f32 %f5674, %f4093, %f4098, %f4097;
- abs.f32 %f1461, %f1384;
- setp.ltu.f32 %p1102, %f1461, 0f47CE4780;
- @%p1102 bra $L__BB0_1301;
-
- setp.eq.f32 %p1103, %f1461, 0f7F800000;
- @%p1103 bra $L__BB0_1300;
- bra.uni $L__BB0_1295;
-
-$L__BB0_1300:
- mov.f32 %f4101, 0f00000000;
- mul.rn.f32 %f5674, %f1384, %f4101;
- mov.u32 %r8569, 0;
- bra.uni $L__BB0_1301;
-
-$L__BB0_1295:
- mov.b32 %r1706, %f1384;
- shr.u32 %r6169, %r1706, 23;
- and.b32 %r6170, %r6169, 255;
- add.s32 %r1707, %r6170, -128;
- shl.b32 %r6171, %r1706, 8;
- or.b32 %r1708, %r6171, -2147483648;
- shr.u32 %r1709, %r1707, 5;
- mov.u64 %rd2670, 0;
- mov.u32 %r8566, 0;
- mov.u64 %rd1976, __cudart_i2opi_f;
- mov.u64 %rd2671, %rd2670;
-
-$L__BB0_1296:
- .pragma "nounroll";
- shl.b64 %rd1975, %rd2670, 2;
- add.s64 %rd1977, %rd1976, %rd1975;
- ld.global.nc.u32 %r6172, [%rd1977];
- mad.wide.u32 %rd1978, %r6172, %r1708, %rd2671;
- shr.u64 %rd2671, %rd1978, 32;
- add.s64 %rd1979, %rd1, %rd1975;
- st.local.u32 [%rd1979], %rd1978;
- add.s32 %r8566, %r8566, 1;
- cvt.s64.s32 %rd2670, %r8566;
- setp.ne.s32 %p1104, %r8566, 6;
- @%p1104 bra $L__BB0_1296;
-
- st.local.u32 [%rd5], %rd2671;
- mov.u32 %r6173, 4;
- sub.s32 %r1712, %r6173, %r1709;
- mov.u32 %r6174, 6;
- sub.s32 %r6175, %r6174, %r1709;
- mul.wide.s32 %rd1980, %r6175, 4;
- add.s64 %rd1981, %rd1, %rd1980;
- ld.local.u32 %r8567, [%rd1981];
- ld.local.u32 %r8568, [%rd1981+-4];
- and.b32 %r1715, %r1707, 31;
- setp.eq.s32 %p1105, %r1715, 0;
- @%p1105 bra $L__BB0_1299;
-
- mov.u32 %r6176, 32;
- sub.s32 %r6177, %r6176, %r1715;
- shr.u32 %r6178, %r8568, %r6177;
- shl.b32 %r6179, %r8567, %r1715;
- add.s32 %r8567, %r6178, %r6179;
- mul.wide.s32 %rd1982, %r1712, 4;
- add.s64 %rd1983, %rd1, %rd1982;
- ld.local.u32 %r6180, [%rd1983];
- shr.u32 %r6181, %r6180, %r6177;
- shl.b32 %r6182, %r8568, %r1715;
- add.s32 %r8568, %r6181, %r6182;
-
-$L__BB0_1299:
- and.b32 %r6183, %r1706, -2147483648;
- shr.u32 %r6184, %r8568, 30;
- shl.b32 %r6185, %r8567, 2;
- or.b32 %r6186, %r6184, %r6185;
- shr.u32 %r6187, %r6186, 31;
- shr.u32 %r6188, %r8567, 30;
- add.s32 %r6189, %r6187, %r6188;
- neg.s32 %r6190, %r6189;
- setp.eq.s32 %p1106, %r6183, 0;
- selp.b32 %r8569, %r6189, %r6190, %p1106;
- setp.ne.s32 %p1107, %r6187, 0;
- xor.b32 %r6191, %r6183, -2147483648;
- selp.b32 %r6192, %r6191, %r6183, %p1107;
- selp.b32 %r6193, -1, 0, %p1107;
- xor.b32 %r6194, %r6186, %r6193;
- shl.b32 %r6195, %r8568, 2;
- xor.b32 %r6196, %r6195, %r6193;
- cvt.u64.u32 %rd1984, %r6194;
- cvt.u64.u32 %rd1985, %r6196;
- bfi.b64 %rd1986, %rd1984, %rd1985, 32, 32;
- cvt.rn.f64.s64 %fd171, %rd1986;
- mul.f64 %fd172, %fd171, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4099, %fd172;
- setp.eq.s32 %p1108, %r6192, 0;
- neg.f32 %f4100, %f4099;
- selp.f32 %f5674, %f4099, %f4100, %p1108;
-
-$L__BB0_1301:
- add.s32 %r1722, %r8569, 1;
- and.b32 %r1723, %r1722, 1;
- setp.eq.s32 %p1109, %r1723, 0;
- selp.f32 %f1465, %f5674, 0f3F800000, %p1109;
- mul.rn.f32 %f1466, %f5674, %f5674;
- mov.f32 %f5675, 0fB94D4153;
- @%p1109 bra $L__BB0_1303;
-
- mov.f32 %f4103, 0fBAB607ED;
- mov.f32 %f4104, 0f37CBAC00;
- fma.rn.f32 %f5675, %f4104, %f1466, %f4103;
-
-$L__BB0_1303:
- selp.f32 %f4105, 0f3C0885E4, 0f3D2AAABB, %p1109;
- fma.rn.f32 %f4106, %f5675, %f1466, %f4105;
- selp.f32 %f4107, 0fBE2AAAA8, 0fBEFFFFFF, %p1109;
- fma.rn.f32 %f4108, %f4106, %f1466, %f4107;
- mov.f32 %f4109, 0f00000000;
- fma.rn.f32 %f4110, %f1466, %f1465, %f4109;
- fma.rn.f32 %f5676, %f4108, %f4110, %f1465;
- and.b32 %r6198, %r1722, 2;
- setp.eq.s32 %p1111, %r6198, 0;
- @%p1111 bra $L__BB0_1305;
-
- mov.f32 %f4112, 0fBF800000;
- fma.rn.f32 %f5676, %f5676, %f4112, %f4109;
-
-$L__BB0_1305:
- add.f32 %f5712, %f5673, %f5676;
- mul.f32 %f4113, %f1393, 0f3F22F983;
- cvt.rni.s32.f32 %r8573, %f4113;
- cvt.rn.f32.s32 %f4114, %r8573;
- mov.f32 %f4115, 0fBFC90FDA;
- fma.rn.f32 %f4116, %f4114, %f4115, %f1393;
- mov.f32 %f4117, 0fB3A22168;
- fma.rn.f32 %f4118, %f4114, %f4117, %f4116;
- mov.f32 %f4119, 0fA7C234C5;
- fma.rn.f32 %f5677, %f4114, %f4119, %f4118;
- abs.f32 %f1474, %f1393;
- setp.ltu.f32 %p1112, %f1474, 0f47CE4780;
- @%p1112 bra $L__BB0_1313;
-
- setp.eq.f32 %p1113, %f1474, 0f7F800000;
- @%p1113 bra $L__BB0_1312;
- bra.uni $L__BB0_1307;
-
-$L__BB0_1312:
- mov.f32 %f4122, 0f00000000;
- mul.rn.f32 %f5677, %f1393, %f4122;
- mov.u32 %r8573, 0;
- bra.uni $L__BB0_1313;
-
-$L__BB0_1307:
- mov.b32 %r1725, %f1393;
- shr.u32 %r6200, %r1725, 23;
- and.b32 %r6201, %r6200, 255;
- add.s32 %r1726, %r6201, -128;
- shl.b32 %r6202, %r1725, 8;
- or.b32 %r1727, %r6202, -2147483648;
- shr.u32 %r1728, %r1726, 5;
- mov.u64 %rd2672, 0;
- mov.u32 %r8570, 0;
- mov.u64 %rd1990, __cudart_i2opi_f;
- mov.u64 %rd2673, %rd2672;
-
-$L__BB0_1308:
- .pragma "nounroll";
- shl.b64 %rd1989, %rd2672, 2;
- add.s64 %rd1991, %rd1990, %rd1989;
- ld.global.nc.u32 %r6203, [%rd1991];
- mad.wide.u32 %rd1992, %r6203, %r1727, %rd2673;
- shr.u64 %rd2673, %rd1992, 32;
- add.s64 %rd1993, %rd1, %rd1989;
- st.local.u32 [%rd1993], %rd1992;
- add.s32 %r8570, %r8570, 1;
- cvt.s64.s32 %rd2672, %r8570;
- setp.ne.s32 %p1114, %r8570, 6;
- @%p1114 bra $L__BB0_1308;
-
- st.local.u32 [%rd5], %rd2673;
- mov.u32 %r6204, 4;
- sub.s32 %r1731, %r6204, %r1728;
- mov.u32 %r6205, 6;
- sub.s32 %r6206, %r6205, %r1728;
- mul.wide.s32 %rd1994, %r6206, 4;
- add.s64 %rd1995, %rd1, %rd1994;
- ld.local.u32 %r8571, [%rd1995];
- ld.local.u32 %r8572, [%rd1995+-4];
- and.b32 %r1734, %r1726, 31;
- setp.eq.s32 %p1115, %r1734, 0;
- @%p1115 bra $L__BB0_1311;
-
- mov.u32 %r6207, 32;
- sub.s32 %r6208, %r6207, %r1734;
- shr.u32 %r6209, %r8572, %r6208;
- shl.b32 %r6210, %r8571, %r1734;
- add.s32 %r8571, %r6209, %r6210;
- mul.wide.s32 %rd1996, %r1731, 4;
- add.s64 %rd1997, %rd1, %rd1996;
- ld.local.u32 %r6211, [%rd1997];
- shr.u32 %r6212, %r6211, %r6208;
- shl.b32 %r6213, %r8572, %r1734;
- add.s32 %r8572, %r6212, %r6213;
-
-$L__BB0_1311:
- and.b32 %r6214, %r1725, -2147483648;
- shr.u32 %r6215, %r8572, 30;
- shl.b32 %r6216, %r8571, 2;
- or.b32 %r6217, %r6215, %r6216;
- shr.u32 %r6218, %r6217, 31;
- shr.u32 %r6219, %r8571, 30;
- add.s32 %r6220, %r6218, %r6219;
- neg.s32 %r6221, %r6220;
- setp.eq.s32 %p1116, %r6214, 0;
- selp.b32 %r8573, %r6220, %r6221, %p1116;
- setp.ne.s32 %p1117, %r6218, 0;
- xor.b32 %r6222, %r6214, -2147483648;
- selp.b32 %r6223, %r6222, %r6214, %p1117;
- selp.b32 %r6224, -1, 0, %p1117;
- xor.b32 %r6225, %r6217, %r6224;
- shl.b32 %r6226, %r8572, 2;
- xor.b32 %r6227, %r6226, %r6224;
- cvt.u64.u32 %rd1998, %r6225;
- cvt.u64.u32 %rd1999, %r6227;
- bfi.b64 %rd2000, %rd1998, %rd1999, 32, 32;
- cvt.rn.f64.s64 %fd173, %rd2000;
- mul.f64 %fd174, %fd173, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4120, %fd174;
- setp.eq.s32 %p1118, %r6223, 0;
- neg.f32 %f4121, %f4120;
- selp.f32 %f5677, %f4120, %f4121, %p1118;
-
-$L__BB0_1313:
- and.b32 %r1741, %r8573, 1;
- setp.eq.s32 %p1119, %r1741, 0;
- selp.f32 %f1478, %f5677, 0f3F800000, %p1119;
- mul.rn.f32 %f1479, %f5677, %f5677;
- mov.f32 %f5678, 0fB94D4153;
- @%p1119 bra $L__BB0_1315;
-
- mov.f32 %f4124, 0fBAB607ED;
- mov.f32 %f4125, 0f37CBAC00;
- fma.rn.f32 %f5678, %f4125, %f1479, %f4124;
-
-$L__BB0_1315:
- selp.f32 %f4126, 0f3C0885E4, 0f3D2AAABB, %p1119;
- fma.rn.f32 %f4127, %f5678, %f1479, %f4126;
- selp.f32 %f4128, 0fBE2AAAA8, 0fBEFFFFFF, %p1119;
- fma.rn.f32 %f4129, %f4127, %f1479, %f4128;
- mov.f32 %f4130, 0f00000000;
- fma.rn.f32 %f4131, %f1479, %f1478, %f4130;
- fma.rn.f32 %f5679, %f4129, %f4131, %f1478;
- and.b32 %r6229, %r8573, 2;
- setp.eq.s32 %p1121, %r6229, 0;
- @%p1121 bra $L__BB0_1317;
-
- mov.f32 %f4133, 0fBF800000;
- fma.rn.f32 %f5679, %f5679, %f4133, %f4130;
-
-$L__BB0_1317:
- mul.f32 %f4134, %f1385, 0f3F22F983;
- cvt.rni.s32.f32 %r8577, %f4134;
- cvt.rn.f32.s32 %f4135, %r8577;
- mov.f32 %f4136, 0fBFC90FDA;
- fma.rn.f32 %f4137, %f4135, %f4136, %f1385;
- mov.f32 %f4138, 0fB3A22168;
- fma.rn.f32 %f4139, %f4135, %f4138, %f4137;
- mov.f32 %f4140, 0fA7C234C5;
- fma.rn.f32 %f5680, %f4135, %f4140, %f4139;
- abs.f32 %f1486, %f1385;
- setp.ltu.f32 %p1122, %f1486, 0f47CE4780;
- @%p1122 bra $L__BB0_1325;
-
- setp.eq.f32 %p1123, %f1486, 0f7F800000;
- @%p1123 bra $L__BB0_1324;
- bra.uni $L__BB0_1319;
-
-$L__BB0_1324:
- mov.f32 %f4143, 0f00000000;
- mul.rn.f32 %f5680, %f1385, %f4143;
- mov.u32 %r8577, 0;
- bra.uni $L__BB0_1325;
-
-$L__BB0_1319:
- mov.b32 %r1743, %f1385;
- shr.u32 %r6231, %r1743, 23;
- and.b32 %r6232, %r6231, 255;
- add.s32 %r1744, %r6232, -128;
- shl.b32 %r6233, %r1743, 8;
- or.b32 %r1745, %r6233, -2147483648;
- shr.u32 %r1746, %r1744, 5;
- mov.u64 %rd2674, 0;
- mov.u32 %r8574, 0;
- mov.u64 %rd2004, __cudart_i2opi_f;
- mov.u64 %rd2675, %rd2674;
-
-$L__BB0_1320:
- .pragma "nounroll";
- shl.b64 %rd2003, %rd2674, 2;
- add.s64 %rd2005, %rd2004, %rd2003;
- ld.global.nc.u32 %r6234, [%rd2005];
- mad.wide.u32 %rd2006, %r6234, %r1745, %rd2675;
- shr.u64 %rd2675, %rd2006, 32;
- add.s64 %rd2007, %rd1, %rd2003;
- st.local.u32 [%rd2007], %rd2006;
- add.s32 %r8574, %r8574, 1;
- cvt.s64.s32 %rd2674, %r8574;
- setp.ne.s32 %p1124, %r8574, 6;
- @%p1124 bra $L__BB0_1320;
-
- st.local.u32 [%rd5], %rd2675;
- mov.u32 %r6235, 4;
- sub.s32 %r1749, %r6235, %r1746;
- mov.u32 %r6236, 6;
- sub.s32 %r6237, %r6236, %r1746;
- mul.wide.s32 %rd2008, %r6237, 4;
- add.s64 %rd2009, %rd1, %rd2008;
- ld.local.u32 %r8575, [%rd2009];
- ld.local.u32 %r8576, [%rd2009+-4];
- and.b32 %r1752, %r1744, 31;
- setp.eq.s32 %p1125, %r1752, 0;
- @%p1125 bra $L__BB0_1323;
-
- mov.u32 %r6238, 32;
- sub.s32 %r6239, %r6238, %r1752;
- shr.u32 %r6240, %r8576, %r6239;
- shl.b32 %r6241, %r8575, %r1752;
- add.s32 %r8575, %r6240, %r6241;
- mul.wide.s32 %rd2010, %r1749, 4;
- add.s64 %rd2011, %rd1, %rd2010;
- ld.local.u32 %r6242, [%rd2011];
- shr.u32 %r6243, %r6242, %r6239;
- shl.b32 %r6244, %r8576, %r1752;
- add.s32 %r8576, %r6243, %r6244;
-
-$L__BB0_1323:
- and.b32 %r6245, %r1743, -2147483648;
- shr.u32 %r6246, %r8576, 30;
- shl.b32 %r6247, %r8575, 2;
- or.b32 %r6248, %r6246, %r6247;
- shr.u32 %r6249, %r6248, 31;
- shr.u32 %r6250, %r8575, 30;
- add.s32 %r6251, %r6249, %r6250;
- neg.s32 %r6252, %r6251;
- setp.eq.s32 %p1126, %r6245, 0;
- selp.b32 %r8577, %r6251, %r6252, %p1126;
- setp.ne.s32 %p1127, %r6249, 0;
- xor.b32 %r6253, %r6245, -2147483648;
- selp.b32 %r6254, %r6253, %r6245, %p1127;
- selp.b32 %r6255, -1, 0, %p1127;
- xor.b32 %r6256, %r6248, %r6255;
- shl.b32 %r6257, %r8576, 2;
- xor.b32 %r6258, %r6257, %r6255;
- cvt.u64.u32 %rd2012, %r6256;
- cvt.u64.u32 %rd2013, %r6258;
- bfi.b64 %rd2014, %rd2012, %rd2013, 32, 32;
- cvt.rn.f64.s64 %fd175, %rd2014;
- mul.f64 %fd176, %fd175, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4141, %fd176;
- setp.eq.s32 %p1128, %r6254, 0;
- neg.f32 %f4142, %f4141;
- selp.f32 %f5680, %f4141, %f4142, %p1128;
-
-$L__BB0_1325:
- add.s32 %r1759, %r8577, 1;
- and.b32 %r1760, %r1759, 1;
- setp.eq.s32 %p1129, %r1760, 0;
- selp.f32 %f1490, %f5680, 0f3F800000, %p1129;
- mul.rn.f32 %f1491, %f5680, %f5680;
- mov.f32 %f5681, 0fB94D4153;
- @%p1129 bra $L__BB0_1327;
-
- mov.f32 %f4145, 0fBAB607ED;
- mov.f32 %f4146, 0f37CBAC00;
- fma.rn.f32 %f5681, %f4146, %f1491, %f4145;
-
-$L__BB0_1327:
- selp.f32 %f4147, 0f3C0885E4, 0f3D2AAABB, %p1129;
- fma.rn.f32 %f4148, %f5681, %f1491, %f4147;
- selp.f32 %f4149, 0fBE2AAAA8, 0fBEFFFFFF, %p1129;
- fma.rn.f32 %f4150, %f4148, %f1491, %f4149;
- mov.f32 %f4151, 0f00000000;
- fma.rn.f32 %f4152, %f1491, %f1490, %f4151;
- fma.rn.f32 %f5682, %f4150, %f4152, %f1490;
- and.b32 %r6260, %r1759, 2;
- setp.eq.s32 %p1131, %r6260, 0;
- @%p1131 bra $L__BB0_1329;
-
- mov.f32 %f4154, 0fBF800000;
- fma.rn.f32 %f5682, %f5682, %f4154, %f4151;
-
-$L__BB0_1329:
- add.f32 %f5711, %f5679, %f5682;
- mul.f32 %f4155, %f1394, 0f3F22F983;
- cvt.rni.s32.f32 %r8581, %f4155;
- cvt.rn.f32.s32 %f4156, %r8581;
- mov.f32 %f4157, 0fBFC90FDA;
- fma.rn.f32 %f4158, %f4156, %f4157, %f1394;
- mov.f32 %f4159, 0fB3A22168;
- fma.rn.f32 %f4160, %f4156, %f4159, %f4158;
- mov.f32 %f4161, 0fA7C234C5;
- fma.rn.f32 %f5683, %f4156, %f4161, %f4160;
- abs.f32 %f1499, %f1394;
- setp.ltu.f32 %p1132, %f1499, 0f47CE4780;
- @%p1132 bra $L__BB0_1337;
-
- setp.eq.f32 %p1133, %f1499, 0f7F800000;
- @%p1133 bra $L__BB0_1336;
- bra.uni $L__BB0_1331;
-
-$L__BB0_1336:
- mov.f32 %f4164, 0f00000000;
- mul.rn.f32 %f5683, %f1394, %f4164;
- mov.u32 %r8581, 0;
- bra.uni $L__BB0_1337;
-
-$L__BB0_1331:
- mov.b32 %r1762, %f1394;
- shr.u32 %r6262, %r1762, 23;
- and.b32 %r6263, %r6262, 255;
- add.s32 %r1763, %r6263, -128;
- shl.b32 %r6264, %r1762, 8;
- or.b32 %r1764, %r6264, -2147483648;
- shr.u32 %r1765, %r1763, 5;
- mov.u64 %rd2678, 0;
- mov.u32 %r8578, 0;
- mov.u64 %rd2676, __cudart_i2opi_f;
- mov.u64 %rd2677, %rd1;
-
-$L__BB0_1332:
- .pragma "nounroll";
- ld.global.nc.u32 %r6265, [%rd2676];
- mad.wide.u32 %rd2017, %r6265, %r1764, %rd2678;
- shr.u64 %rd2678, %rd2017, 32;
- st.local.u32 [%rd2677], %rd2017;
- add.s64 %rd2677, %rd2677, 4;
- add.s64 %rd2676, %rd2676, 4;
- add.s32 %r8578, %r8578, 1;
- setp.ne.s32 %p1134, %r8578, 6;
- @%p1134 bra $L__BB0_1332;
-
- st.local.u32 [%rd5], %rd2678;
- mov.u32 %r6266, 4;
- sub.s32 %r1768, %r6266, %r1765;
- mov.u32 %r6267, 6;
- sub.s32 %r6268, %r6267, %r1765;
- mul.wide.s32 %rd2018, %r6268, 4;
- add.s64 %rd2019, %rd1, %rd2018;
- ld.local.u32 %r8579, [%rd2019];
- ld.local.u32 %r8580, [%rd2019+-4];
- and.b32 %r1771, %r1763, 31;
- setp.eq.s32 %p1135, %r1771, 0;
- @%p1135 bra $L__BB0_1335;
-
- mov.u32 %r6269, 32;
- sub.s32 %r6270, %r6269, %r1771;
- shr.u32 %r6271, %r8580, %r6270;
- shl.b32 %r6272, %r8579, %r1771;
- add.s32 %r8579, %r6271, %r6272;
- mul.wide.s32 %rd2020, %r1768, 4;
- add.s64 %rd2021, %rd1, %rd2020;
- ld.local.u32 %r6273, [%rd2021];
- shr.u32 %r6274, %r6273, %r6270;
- shl.b32 %r6275, %r8580, %r1771;
- add.s32 %r8580, %r6274, %r6275;
-
-$L__BB0_1335:
- and.b32 %r6276, %r1762, -2147483648;
- shr.u32 %r6277, %r8580, 30;
- shl.b32 %r6278, %r8579, 2;
- or.b32 %r6279, %r6277, %r6278;
- shr.u32 %r6280, %r6279, 31;
- shr.u32 %r6281, %r8579, 30;
- add.s32 %r6282, %r6280, %r6281;
- neg.s32 %r6283, %r6282;
- setp.eq.s32 %p1136, %r6276, 0;
- selp.b32 %r8581, %r6282, %r6283, %p1136;
- setp.ne.s32 %p1137, %r6280, 0;
- xor.b32 %r6284, %r6276, -2147483648;
- selp.b32 %r6285, %r6284, %r6276, %p1137;
- selp.b32 %r6286, -1, 0, %p1137;
- xor.b32 %r6287, %r6279, %r6286;
- shl.b32 %r6288, %r8580, 2;
- xor.b32 %r6289, %r6288, %r6286;
- cvt.u64.u32 %rd2022, %r6287;
- cvt.u64.u32 %rd2023, %r6289;
- bfi.b64 %rd2024, %rd2022, %rd2023, 32, 32;
- cvt.rn.f64.s64 %fd177, %rd2024;
- mul.f64 %fd178, %fd177, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4162, %fd178;
- setp.eq.s32 %p1138, %r6285, 0;
- neg.f32 %f4163, %f4162;
- selp.f32 %f5683, %f4162, %f4163, %p1138;
-
-$L__BB0_1337:
- and.b32 %r1778, %r8581, 1;
- setp.eq.s32 %p1139, %r1778, 0;
- selp.f32 %f1503, %f5683, 0f3F800000, %p1139;
- mul.rn.f32 %f1504, %f5683, %f5683;
- mov.f32 %f5684, 0fB94D4153;
- @%p1139 bra $L__BB0_1339;
-
- mov.f32 %f4166, 0fBAB607ED;
- mov.f32 %f4167, 0f37CBAC00;
- fma.rn.f32 %f5684, %f4167, %f1504, %f4166;
-
-$L__BB0_1339:
- selp.f32 %f4168, 0f3C0885E4, 0f3D2AAABB, %p1139;
- fma.rn.f32 %f4169, %f5684, %f1504, %f4168;
- selp.f32 %f4170, 0fBE2AAAA8, 0fBEFFFFFF, %p1139;
- fma.rn.f32 %f4171, %f4169, %f1504, %f4170;
- mov.f32 %f4172, 0f00000000;
- fma.rn.f32 %f4173, %f1504, %f1503, %f4172;
- fma.rn.f32 %f5685, %f4171, %f4173, %f1503;
- and.b32 %r6291, %r8581, 2;
- setp.eq.s32 %p1141, %r6291, 0;
- @%p1141 bra $L__BB0_1341;
-
- mov.f32 %f4175, 0fBF800000;
- fma.rn.f32 %f5685, %f5685, %f4175, %f4172;
-
-$L__BB0_1341:
- mul.f32 %f4176, %f1386, 0f3F22F983;
- cvt.rni.s32.f32 %r8585, %f4176;
- cvt.rn.f32.s32 %f4177, %r8585;
- mov.f32 %f4178, 0fBFC90FDA;
- fma.rn.f32 %f4179, %f4177, %f4178, %f1386;
- mov.f32 %f4180, 0fB3A22168;
- fma.rn.f32 %f4181, %f4177, %f4180, %f4179;
- mov.f32 %f4182, 0fA7C234C5;
- fma.rn.f32 %f5686, %f4177, %f4182, %f4181;
- abs.f32 %f1511, %f1386;
- setp.ltu.f32 %p1142, %f1511, 0f47CE4780;
- @%p1142 bra $L__BB0_1349;
-
- setp.eq.f32 %p1143, %f1511, 0f7F800000;
- @%p1143 bra $L__BB0_1348;
- bra.uni $L__BB0_1343;
-
-$L__BB0_1348:
- mov.f32 %f4185, 0f00000000;
- mul.rn.f32 %f5686, %f1386, %f4185;
- mov.u32 %r8585, 0;
- bra.uni $L__BB0_1349;
-
-$L__BB0_1343:
- mov.b32 %r1780, %f1386;
- shr.u32 %r6293, %r1780, 23;
- and.b32 %r6294, %r6293, 255;
- add.s32 %r1781, %r6294, -128;
- shl.b32 %r6295, %r1780, 8;
- or.b32 %r1782, %r6295, -2147483648;
- shr.u32 %r1783, %r1781, 5;
- mov.u64 %rd2681, 0;
- mov.u32 %r8582, 0;
- mov.u64 %rd2679, __cudart_i2opi_f;
- mov.u64 %rd2680, %rd1;
+ shl.b64 %rd2043, %rd2687, 2;
+ add.s64 %rd2045, %rd2044, %rd2043;
+ ld.global.nc.u32 %r6222, [%rd2045];
+ mad.wide.u32 %rd2046, %r6222, %r1782, %rd2688;
+ shr.u64 %rd2688, %rd2046, 32;
+ add.s64 %rd2047, %rd1, %rd2043;
+ st.local.u32 [%rd2047], %rd2046;
+ add.s32 %r8331, %r8331, 1;
+ cvt.s64.s32 %rd2687, %r8331;
+ setp.ne.s32 %p1143, %r8331, 6;
+ @%p1143 bra $L__BB0_1341;
+
+ st.local.u32 [%rd4], %rd2688;
+ mov.u32 %r6223, 4;
+ sub.s32 %r1786, %r6223, %r1783;
+ mov.u32 %r6224, 6;
+ sub.s32 %r6225, %r6224, %r1783;
+ mul.wide.s32 %rd2048, %r6225, 4;
+ add.s64 %rd2049, %rd1, %rd2048;
+ ld.local.u32 %r8332, [%rd2049];
+ ld.local.u32 %r8333, [%rd2049+-4];
+ and.b32 %r1789, %r1781, 31;
+ setp.eq.s32 %p1144, %r1789, 0;
+ @%p1144 bra $L__BB0_1344;
+
+ mov.u32 %r6226, 32;
+ sub.s32 %r6227, %r6226, %r1789;
+ shr.u32 %r6228, %r8333, %r6227;
+ shl.b32 %r6229, %r8332, %r1789;
+ add.s32 %r8332, %r6228, %r6229;
+ mul.wide.s32 %rd2050, %r1786, 4;
+ add.s64 %rd2051, %rd1, %rd2050;
+ ld.local.u32 %r6230, [%rd2051];
+ shr.u32 %r6231, %r6230, %r6227;
+ shl.b32 %r6232, %r8333, %r1789;
+ add.s32 %r8333, %r6231, %r6232;
$L__BB0_1344:
- .pragma "nounroll";
- ld.global.nc.u32 %r6296, [%rd2679];
- mad.wide.u32 %rd2027, %r6296, %r1782, %rd2681;
- shr.u64 %rd2681, %rd2027, 32;
- st.local.u32 [%rd2680], %rd2027;
- add.s64 %rd2680, %rd2680, 4;
- add.s64 %rd2679, %rd2679, 4;
- add.s32 %r8582, %r8582, 1;
- setp.ne.s32 %p1144, %r8582, 6;
- @%p1144 bra $L__BB0_1344;
-
- st.local.u32 [%rd5], %rd2681;
- mov.u32 %r6297, 4;
- sub.s32 %r1786, %r6297, %r1783;
- mov.u32 %r6298, 6;
- sub.s32 %r6299, %r6298, %r1783;
- mul.wide.s32 %rd2028, %r6299, 4;
- add.s64 %rd2029, %rd1, %rd2028;
- ld.local.u32 %r8583, [%rd2029];
- ld.local.u32 %r8584, [%rd2029+-4];
- and.b32 %r1789, %r1781, 31;
- setp.eq.s32 %p1145, %r1789, 0;
- @%p1145 bra $L__BB0_1347;
-
- mov.u32 %r6300, 32;
- sub.s32 %r6301, %r6300, %r1789;
- shr.u32 %r6302, %r8584, %r6301;
- shl.b32 %r6303, %r8583, %r1789;
- add.s32 %r8583, %r6302, %r6303;
- mul.wide.s32 %rd2030, %r1786, 4;
- add.s64 %rd2031, %rd1, %rd2030;
- ld.local.u32 %r6304, [%rd2031];
- shr.u32 %r6305, %r6304, %r6301;
- shl.b32 %r6306, %r8584, %r1789;
- add.s32 %r8584, %r6305, %r6306;
-
-$L__BB0_1347:
- and.b32 %r6307, %r1780, -2147483648;
- shr.u32 %r6308, %r8584, 30;
- shl.b32 %r6309, %r8583, 2;
- or.b32 %r6310, %r6308, %r6309;
- shr.u32 %r6311, %r6310, 31;
- shr.u32 %r6312, %r8583, 30;
- add.s32 %r6313, %r6311, %r6312;
- neg.s32 %r6314, %r6313;
- setp.eq.s32 %p1146, %r6307, 0;
- selp.b32 %r8585, %r6313, %r6314, %p1146;
- setp.ne.s32 %p1147, %r6311, 0;
- xor.b32 %r6315, %r6307, -2147483648;
- selp.b32 %r6316, %r6315, %r6307, %p1147;
- selp.b32 %r6317, -1, 0, %p1147;
- xor.b32 %r6318, %r6310, %r6317;
- shl.b32 %r6319, %r8584, 2;
- xor.b32 %r6320, %r6319, %r6317;
- cvt.u64.u32 %rd2032, %r6318;
- cvt.u64.u32 %rd2033, %r6320;
- bfi.b64 %rd2034, %rd2032, %rd2033, 32, 32;
- cvt.rn.f64.s64 %fd179, %rd2034;
- mul.f64 %fd180, %fd179, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4183, %fd180;
- setp.eq.s32 %p1148, %r6316, 0;
- neg.f32 %f4184, %f4183;
- selp.f32 %f5686, %f4183, %f4184, %p1148;
-
-$L__BB0_1349:
- add.s32 %r1796, %r8585, 1;
- and.b32 %r1797, %r1796, 1;
- setp.eq.s32 %p1149, %r1797, 0;
- selp.f32 %f1515, %f5686, 0f3F800000, %p1149;
- mul.rn.f32 %f1516, %f5686, %f5686;
- mov.f32 %f5687, 0fB94D4153;
- @%p1149 bra $L__BB0_1351;
-
- mov.f32 %f4187, 0fBAB607ED;
- mov.f32 %f4188, 0f37CBAC00;
- fma.rn.f32 %f5687, %f4188, %f1516, %f4187;
-
-$L__BB0_1351:
- selp.f32 %f4189, 0f3C0885E4, 0f3D2AAABB, %p1149;
- fma.rn.f32 %f4190, %f5687, %f1516, %f4189;
- selp.f32 %f4191, 0fBE2AAAA8, 0fBEFFFFFF, %p1149;
- fma.rn.f32 %f4192, %f4190, %f1516, %f4191;
- mov.f32 %f4193, 0f00000000;
- fma.rn.f32 %f4194, %f1516, %f1515, %f4193;
- fma.rn.f32 %f5688, %f4192, %f4194, %f1515;
- and.b32 %r6322, %r1796, 2;
- setp.eq.s32 %p1151, %r6322, 0;
- @%p1151 bra $L__BB0_1353;
-
- mov.f32 %f4196, 0fBF800000;
- fma.rn.f32 %f5688, %f5688, %f4196, %f4193;
-
-$L__BB0_1353:
- add.f32 %f5710, %f5685, %f5688;
- mul.f32 %f4197, %f1395, 0f3F22F983;
- cvt.rni.s32.f32 %r8589, %f4197;
- cvt.rn.f32.s32 %f4198, %r8589;
- mov.f32 %f4199, 0fBFC90FDA;
- fma.rn.f32 %f4200, %f4198, %f4199, %f1395;
- mov.f32 %f4201, 0fB3A22168;
- fma.rn.f32 %f4202, %f4198, %f4201, %f4200;
- mov.f32 %f4203, 0fA7C234C5;
- fma.rn.f32 %f5689, %f4198, %f4203, %f4202;
- abs.f32 %f1524, %f1395;
- setp.ltu.f32 %p1152, %f1524, 0f47CE4780;
- @%p1152 bra $L__BB0_1361;
-
- setp.eq.f32 %p1153, %f1524, 0f7F800000;
- @%p1153 bra $L__BB0_1360;
- bra.uni $L__BB0_1355;
-
-$L__BB0_1360:
- mov.f32 %f4206, 0f00000000;
- mul.rn.f32 %f5689, %f1395, %f4206;
- mov.u32 %r8589, 0;
- bra.uni $L__BB0_1361;
-
-$L__BB0_1355:
- mov.b32 %r1799, %f1395;
- shr.u32 %r6324, %r1799, 23;
- and.b32 %r6325, %r6324, 255;
- add.s32 %r1800, %r6325, -128;
- shl.b32 %r6326, %r1799, 8;
- or.b32 %r1801, %r6326, -2147483648;
- shr.u32 %r1802, %r1800, 5;
- mov.u64 %rd2684, 0;
- mov.u32 %r8586, 0;
- mov.u64 %rd2682, __cudart_i2opi_f;
- mov.u64 %rd2683, %rd1;
-
-$L__BB0_1356:
- .pragma "nounroll";
- ld.global.nc.u32 %r6327, [%rd2682];
- mad.wide.u32 %rd2037, %r6327, %r1801, %rd2684;
- shr.u64 %rd2684, %rd2037, 32;
- st.local.u32 [%rd2683], %rd2037;
- add.s64 %rd2683, %rd2683, 4;
- add.s64 %rd2682, %rd2682, 4;
- add.s32 %r8586, %r8586, 1;
- setp.ne.s32 %p1154, %r8586, 6;
- @%p1154 bra $L__BB0_1356;
-
- st.local.u32 [%rd5], %rd2684;
- mov.u32 %r6328, 4;
- sub.s32 %r1805, %r6328, %r1802;
- mov.u32 %r6329, 6;
- sub.s32 %r6330, %r6329, %r1802;
- mul.wide.s32 %rd2038, %r6330, 4;
- add.s64 %rd2039, %rd1, %rd2038;
- ld.local.u32 %r8587, [%rd2039];
- ld.local.u32 %r8588, [%rd2039+-4];
- and.b32 %r1808, %r1800, 31;
- setp.eq.s32 %p1155, %r1808, 0;
- @%p1155 bra $L__BB0_1359;
-
- mov.u32 %r6331, 32;
- sub.s32 %r6332, %r6331, %r1808;
- shr.u32 %r6333, %r8588, %r6332;
- shl.b32 %r6334, %r8587, %r1808;
- add.s32 %r8587, %r6333, %r6334;
- mul.wide.s32 %rd2040, %r1805, 4;
- add.s64 %rd2041, %rd1, %rd2040;
- ld.local.u32 %r6335, [%rd2041];
- shr.u32 %r6336, %r6335, %r6332;
- shl.b32 %r6337, %r8588, %r1808;
- add.s32 %r8588, %r6336, %r6337;
-
-$L__BB0_1359:
- and.b32 %r6338, %r1799, -2147483648;
- shr.u32 %r6339, %r8588, 30;
- shl.b32 %r6340, %r8587, 2;
- or.b32 %r6341, %r6339, %r6340;
- shr.u32 %r6342, %r6341, 31;
- shr.u32 %r6343, %r8587, 30;
- add.s32 %r6344, %r6342, %r6343;
- neg.s32 %r6345, %r6344;
- setp.eq.s32 %p1156, %r6338, 0;
- selp.b32 %r8589, %r6344, %r6345, %p1156;
- setp.ne.s32 %p1157, %r6342, 0;
- xor.b32 %r6346, %r6338, -2147483648;
- selp.b32 %r6347, %r6346, %r6338, %p1157;
- selp.b32 %r6348, -1, 0, %p1157;
- xor.b32 %r6349, %r6341, %r6348;
- shl.b32 %r6350, %r8588, 2;
- xor.b32 %r6351, %r6350, %r6348;
- cvt.u64.u32 %rd2042, %r6349;
- cvt.u64.u32 %rd2043, %r6351;
- bfi.b64 %rd2044, %rd2042, %rd2043, 32, 32;
- cvt.rn.f64.s64 %fd181, %rd2044;
- mul.f64 %fd182, %fd181, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4204, %fd182;
- setp.eq.s32 %p1158, %r6347, 0;
- neg.f32 %f4205, %f4204;
- selp.f32 %f5689, %f4204, %f4205, %p1158;
-
-$L__BB0_1361:
- and.b32 %r1815, %r8589, 1;
- setp.eq.s32 %p1159, %r1815, 0;
- selp.f32 %f1528, %f5689, 0f3F800000, %p1159;
- mul.rn.f32 %f1529, %f5689, %f5689;
- mov.f32 %f5690, 0fB94D4153;
- @%p1159 bra $L__BB0_1363;
-
- mov.f32 %f4208, 0fBAB607ED;
- mov.f32 %f4209, 0f37CBAC00;
- fma.rn.f32 %f5690, %f4209, %f1529, %f4208;
-
-$L__BB0_1363:
- selp.f32 %f4210, 0f3C0885E4, 0f3D2AAABB, %p1159;
- fma.rn.f32 %f4211, %f5690, %f1529, %f4210;
- selp.f32 %f4212, 0fBE2AAAA8, 0fBEFFFFFF, %p1159;
- fma.rn.f32 %f4213, %f4211, %f1529, %f4212;
- mov.f32 %f4214, 0f00000000;
- fma.rn.f32 %f4215, %f1529, %f1528, %f4214;
- fma.rn.f32 %f5691, %f4213, %f4215, %f1528;
- and.b32 %r6353, %r8589, 2;
- setp.eq.s32 %p1161, %r6353, 0;
- @%p1161 bra $L__BB0_1365;
-
- mov.f32 %f4217, 0fBF800000;
- fma.rn.f32 %f5691, %f5691, %f4217, %f4214;
-
-$L__BB0_1365:
- mul.f32 %f4218, %f1387, 0f3F22F983;
- cvt.rni.s32.f32 %r8593, %f4218;
- cvt.rn.f32.s32 %f4219, %r8593;
- mov.f32 %f4220, 0fBFC90FDA;
- fma.rn.f32 %f4221, %f4219, %f4220, %f1387;
- mov.f32 %f4222, 0fB3A22168;
- fma.rn.f32 %f4223, %f4219, %f4222, %f4221;
- mov.f32 %f4224, 0fA7C234C5;
- fma.rn.f32 %f5692, %f4219, %f4224, %f4223;
- abs.f32 %f1536, %f1387;
- setp.ltu.f32 %p1162, %f1536, 0f47CE4780;
- @%p1162 bra $L__BB0_1373;
-
- setp.eq.f32 %p1163, %f1536, 0f7F800000;
- @%p1163 bra $L__BB0_1372;
- bra.uni $L__BB0_1367;
-
-$L__BB0_1372:
- mov.f32 %f4227, 0f00000000;
- mul.rn.f32 %f5692, %f1387, %f4227;
- mov.u32 %r8593, 0;
- bra.uni $L__BB0_1373;
-
-$L__BB0_1367:
- mov.b32 %r1817, %f1387;
- shr.u32 %r6355, %r1817, 23;
- and.b32 %r6356, %r6355, 255;
- add.s32 %r1818, %r6356, -128;
- shl.b32 %r6357, %r1817, 8;
- or.b32 %r1819, %r6357, -2147483648;
- shr.u32 %r1820, %r1818, 5;
- mov.u64 %rd2687, 0;
- mov.u32 %r8590, 0;
- mov.u64 %rd2685, __cudart_i2opi_f;
- mov.u64 %rd2686, %rd1;
-
-$L__BB0_1368:
- .pragma "nounroll";
- ld.global.nc.u32 %r6358, [%rd2685];
- mad.wide.u32 %rd2047, %r6358, %r1819, %rd2687;
- shr.u64 %rd2687, %rd2047, 32;
- st.local.u32 [%rd2686], %rd2047;
- add.s64 %rd2686, %rd2686, 4;
- add.s64 %rd2685, %rd2685, 4;
- add.s32 %r8590, %r8590, 1;
- setp.ne.s32 %p1164, %r8590, 6;
- @%p1164 bra $L__BB0_1368;
-
- st.local.u32 [%rd5], %rd2687;
- mov.u32 %r6359, 4;
- sub.s32 %r1823, %r6359, %r1820;
- mov.u32 %r6360, 6;
- sub.s32 %r6361, %r6360, %r1820;
- mul.wide.s32 %rd2048, %r6361, 4;
- add.s64 %rd2049, %rd1, %rd2048;
- ld.local.u32 %r8591, [%rd2049];
- ld.local.u32 %r8592, [%rd2049+-4];
- and.b32 %r1826, %r1818, 31;
- setp.eq.s32 %p1165, %r1826, 0;
- @%p1165 bra $L__BB0_1371;
-
- mov.u32 %r6362, 32;
- sub.s32 %r6363, %r6362, %r1826;
- shr.u32 %r6364, %r8592, %r6363;
- shl.b32 %r6365, %r8591, %r1826;
- add.s32 %r8591, %r6364, %r6365;
- mul.wide.s32 %rd2050, %r1823, 4;
- add.s64 %rd2051, %rd1, %rd2050;
- ld.local.u32 %r6366, [%rd2051];
- shr.u32 %r6367, %r6366, %r6363;
- shl.b32 %r6368, %r8592, %r1826;
- add.s32 %r8592, %r6367, %r6368;
-
-$L__BB0_1371:
- and.b32 %r6369, %r1817, -2147483648;
- shr.u32 %r6370, %r8592, 30;
- shl.b32 %r6371, %r8591, 2;
- or.b32 %r6372, %r6370, %r6371;
- shr.u32 %r6373, %r6372, 31;
- shr.u32 %r6374, %r8591, 30;
- add.s32 %r6375, %r6373, %r6374;
- neg.s32 %r6376, %r6375;
- setp.eq.s32 %p1166, %r6369, 0;
- selp.b32 %r8593, %r6375, %r6376, %p1166;
- setp.ne.s32 %p1167, %r6373, 0;
- xor.b32 %r6377, %r6369, -2147483648;
- selp.b32 %r6378, %r6377, %r6369, %p1167;
- selp.b32 %r6379, -1, 0, %p1167;
- xor.b32 %r6380, %r6372, %r6379;
- shl.b32 %r6381, %r8592, 2;
- xor.b32 %r6382, %r6381, %r6379;
- cvt.u64.u32 %rd2052, %r6380;
- cvt.u64.u32 %rd2053, %r6382;
+ and.b32 %r6233, %r1780, -2147483648;
+ shr.u32 %r6234, %r8333, 30;
+ shl.b32 %r6235, %r8332, 2;
+ or.b32 %r6236, %r6234, %r6235;
+ shr.u32 %r6237, %r6236, 31;
+ shr.u32 %r6238, %r8332, 30;
+ add.s32 %r6239, %r6237, %r6238;
+ neg.s32 %r6240, %r6239;
+ setp.eq.s32 %p1145, %r6233, 0;
+ selp.b32 %r8334, %r6239, %r6240, %p1145;
+ setp.ne.s32 %p1146, %r6237, 0;
+ xor.b32 %r6241, %r6233, -2147483648;
+ selp.b32 %r6242, %r6241, %r6233, %p1146;
+ selp.b32 %r6243, -1, 0, %p1146;
+ xor.b32 %r6244, %r6236, %r6243;
+ shl.b32 %r6245, %r8333, 2;
+ xor.b32 %r6246, %r6245, %r6243;
+ cvt.u64.u32 %rd2052, %r6244;
+ cvt.u64.u32 %rd2053, %r6246;
bfi.b64 %rd2054, %rd2052, %rd2053, 32, 32;
cvt.rn.f64.s64 %fd183, %rd2054;
mul.f64 %fd184, %fd183, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4225, %fd184;
- setp.eq.s32 %p1168, %r6378, 0;
- neg.f32 %f4226, %f4225;
- selp.f32 %f5692, %f4225, %f4226, %p1168;
-
-$L__BB0_1373:
- add.s32 %r1833, %r8593, 1;
+ cvt.rn.f32.f64 %f4281, %fd184;
+ setp.eq.s32 %p1147, %r6242, 0;
+ neg.f32 %f4282, %f4281;
+ selp.f32 %f5776, %f4281, %f4282, %p1147;
+
+$L__BB0_1346:
+ add.s32 %r1796, %r8334, 1;
+ and.b32 %r1797, %r1796, 1;
+ setp.eq.s32 %p1148, %r1797, 0;
+ selp.f32 %f1564, %f5776, 0f3F800000, %p1148;
+ mul.rn.f32 %f1565, %f5776, %f5776;
+ mov.f32 %f5777, 0fB94D4153;
+ @%p1148 bra $L__BB0_1348;
+
+ mov.f32 %f4285, 0fBAB607ED;
+ mov.f32 %f4286, 0f37CBAC00;
+ fma.rn.f32 %f5777, %f4286, %f1565, %f4285;
+
+$L__BB0_1348:
+ selp.f32 %f4287, 0f3C0885E4, 0f3D2AAABB, %p1148;
+ fma.rn.f32 %f4288, %f5777, %f1565, %f4287;
+ selp.f32 %f4289, 0fBE2AAAA8, 0fBEFFFFFF, %p1148;
+ fma.rn.f32 %f4290, %f4288, %f1565, %f4289;
+ mov.f32 %f4291, 0f00000000;
+ fma.rn.f32 %f4292, %f1565, %f1564, %f4291;
+ fma.rn.f32 %f5778, %f4290, %f4292, %f1564;
+ and.b32 %r6248, %r1796, 2;
+ setp.eq.s32 %p1150, %r6248, 0;
+ @%p1150 bra $L__BB0_1350;
+
+ mov.f32 %f4294, 0fBF800000;
+ fma.rn.f32 %f5778, %f5778, %f4294, %f4291;
+
+$L__BB0_1350:
+ add.f32 %f5793, %f5775, %f5778;
+ mul.f32 %f4295, %f1420, 0f3F22F983;
+ cvt.rni.s32.f32 %r8338, %f4295;
+ cvt.rn.f32.s32 %f4296, %r8338;
+ mov.f32 %f4297, 0fBFC90FDA;
+ fma.rn.f32 %f4298, %f4296, %f4297, %f1420;
+ mov.f32 %f4299, 0fB3A22168;
+ fma.rn.f32 %f4300, %f4296, %f4299, %f4298;
+ mov.f32 %f4301, 0fA7C234C5;
+ fma.rn.f32 %f5779, %f4296, %f4301, %f4300;
+ abs.f32 %f1573, %f1420;
+ setp.ltu.f32 %p1151, %f1573, 0f47CE4780;
+ @%p1151 bra $L__BB0_1358;
+
+ setp.eq.f32 %p1152, %f1573, 0f7F800000;
+ @%p1152 bra $L__BB0_1357;
+ bra.uni $L__BB0_1352;
+
+$L__BB0_1357:
+ mov.f32 %f4304, 0f00000000;
+ mul.rn.f32 %f5779, %f1420, %f4304;
+ mov.u32 %r8338, 0;
+ bra.uni $L__BB0_1358;
+
+$L__BB0_1352:
+ mov.b32 %r1799, %f1420;
+ shr.u32 %r6250, %r1799, 23;
+ and.b32 %r6251, %r6250, 255;
+ add.s32 %r1800, %r6251, -128;
+ shl.b32 %r6252, %r1799, 8;
+ or.b32 %r1801, %r6252, -2147483648;
+ shr.u32 %r1802, %r1800, 5;
+ mov.u64 %rd2689, 0;
+ mov.u32 %r8335, 0;
+ mov.u64 %rd2058, __cudart_i2opi_f;
+ mov.u64 %rd2690, %rd2689;
+
+$L__BB0_1353:
+ .pragma "nounroll";
+ shl.b64 %rd2057, %rd2689, 2;
+ add.s64 %rd2059, %rd2058, %rd2057;
+ ld.global.nc.u32 %r6253, [%rd2059];
+ mad.wide.u32 %rd2060, %r6253, %r1801, %rd2690;
+ shr.u64 %rd2690, %rd2060, 32;
+ add.s64 %rd2061, %rd1, %rd2057;
+ st.local.u32 [%rd2061], %rd2060;
+ add.s32 %r8335, %r8335, 1;
+ cvt.s64.s32 %rd2689, %r8335;
+ setp.ne.s32 %p1153, %r8335, 6;
+ @%p1153 bra $L__BB0_1353;
+
+ st.local.u32 [%rd4], %rd2690;
+ mov.u32 %r6254, 4;
+ sub.s32 %r1805, %r6254, %r1802;
+ mov.u32 %r6255, 6;
+ sub.s32 %r6256, %r6255, %r1802;
+ mul.wide.s32 %rd2062, %r6256, 4;
+ add.s64 %rd2063, %rd1, %rd2062;
+ ld.local.u32 %r8336, [%rd2063];
+ ld.local.u32 %r8337, [%rd2063+-4];
+ and.b32 %r1808, %r1800, 31;
+ setp.eq.s32 %p1154, %r1808, 0;
+ @%p1154 bra $L__BB0_1356;
+
+ mov.u32 %r6257, 32;
+ sub.s32 %r6258, %r6257, %r1808;
+ shr.u32 %r6259, %r8337, %r6258;
+ shl.b32 %r6260, %r8336, %r1808;
+ add.s32 %r8336, %r6259, %r6260;
+ mul.wide.s32 %rd2064, %r1805, 4;
+ add.s64 %rd2065, %rd1, %rd2064;
+ ld.local.u32 %r6261, [%rd2065];
+ shr.u32 %r6262, %r6261, %r6258;
+ shl.b32 %r6263, %r8337, %r1808;
+ add.s32 %r8337, %r6262, %r6263;
+
+$L__BB0_1356:
+ and.b32 %r6264, %r1799, -2147483648;
+ shr.u32 %r6265, %r8337, 30;
+ shl.b32 %r6266, %r8336, 2;
+ or.b32 %r6267, %r6265, %r6266;
+ shr.u32 %r6268, %r6267, 31;
+ shr.u32 %r6269, %r8336, 30;
+ add.s32 %r6270, %r6268, %r6269;
+ neg.s32 %r6271, %r6270;
+ setp.eq.s32 %p1155, %r6264, 0;
+ selp.b32 %r8338, %r6270, %r6271, %p1155;
+ setp.ne.s32 %p1156, %r6268, 0;
+ xor.b32 %r6272, %r6264, -2147483648;
+ selp.b32 %r6273, %r6272, %r6264, %p1156;
+ selp.b32 %r6274, -1, 0, %p1156;
+ xor.b32 %r6275, %r6267, %r6274;
+ shl.b32 %r6276, %r8337, 2;
+ xor.b32 %r6277, %r6276, %r6274;
+ cvt.u64.u32 %rd2066, %r6275;
+ cvt.u64.u32 %rd2067, %r6277;
+ bfi.b64 %rd2068, %rd2066, %rd2067, 32, 32;
+ cvt.rn.f64.s64 %fd185, %rd2068;
+ mul.f64 %fd186, %fd185, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4302, %fd186;
+ setp.eq.s32 %p1157, %r6273, 0;
+ neg.f32 %f4303, %f4302;
+ selp.f32 %f5779, %f4302, %f4303, %p1157;
+
+$L__BB0_1358:
+ and.b32 %r1815, %r8338, 1;
+ setp.eq.s32 %p1158, %r1815, 0;
+ selp.f32 %f1577, %f5779, 0f3F800000, %p1158;
+ mul.rn.f32 %f1578, %f5779, %f5779;
+ mov.f32 %f5780, 0fB94D4153;
+ @%p1158 bra $L__BB0_1360;
+
+ mov.f32 %f4306, 0fBAB607ED;
+ mov.f32 %f4307, 0f37CBAC00;
+ fma.rn.f32 %f5780, %f4307, %f1578, %f4306;
+
+$L__BB0_1360:
+ selp.f32 %f4308, 0f3C0885E4, 0f3D2AAABB, %p1158;
+ fma.rn.f32 %f4309, %f5780, %f1578, %f4308;
+ selp.f32 %f4310, 0fBE2AAAA8, 0fBEFFFFFF, %p1158;
+ fma.rn.f32 %f4311, %f4309, %f1578, %f4310;
+ mov.f32 %f4312, 0f00000000;
+ fma.rn.f32 %f4313, %f1578, %f1577, %f4312;
+ fma.rn.f32 %f5781, %f4311, %f4313, %f1577;
+ and.b32 %r6279, %r8338, 2;
+ setp.eq.s32 %p1160, %r6279, 0;
+ @%p1160 bra $L__BB0_1362;
+
+ mov.f32 %f4315, 0fBF800000;
+ fma.rn.f32 %f5781, %f5781, %f4315, %f4312;
+
+$L__BB0_1362:
+ mul.f32 %f4316, %f1412, 0f3F22F983;
+ cvt.rni.s32.f32 %r8342, %f4316;
+ cvt.rn.f32.s32 %f4317, %r8342;
+ mov.f32 %f4318, 0fBFC90FDA;
+ fma.rn.f32 %f4319, %f4317, %f4318, %f1412;
+ mov.f32 %f4320, 0fB3A22168;
+ fma.rn.f32 %f4321, %f4317, %f4320, %f4319;
+ mov.f32 %f4322, 0fA7C234C5;
+ fma.rn.f32 %f5782, %f4317, %f4322, %f4321;
+ abs.f32 %f1585, %f1412;
+ setp.ltu.f32 %p1161, %f1585, 0f47CE4780;
+ @%p1161 bra $L__BB0_1370;
+
+ setp.eq.f32 %p1162, %f1585, 0f7F800000;
+ @%p1162 bra $L__BB0_1369;
+ bra.uni $L__BB0_1364;
+
+$L__BB0_1369:
+ mov.f32 %f4325, 0f00000000;
+ mul.rn.f32 %f5782, %f1412, %f4325;
+ mov.u32 %r8342, 0;
+ bra.uni $L__BB0_1370;
+
+$L__BB0_1364:
+ mov.b32 %r1817, %f1412;
+ shr.u32 %r6281, %r1817, 23;
+ and.b32 %r6282, %r6281, 255;
+ add.s32 %r1818, %r6282, -128;
+ shl.b32 %r6283, %r1817, 8;
+ or.b32 %r1819, %r6283, -2147483648;
+ shr.u32 %r1820, %r1818, 5;
+ mov.u64 %rd2691, 0;
+ mov.u32 %r8339, 0;
+ mov.u64 %rd2072, __cudart_i2opi_f;
+ mov.u64 %rd2692, %rd2691;
+
+$L__BB0_1365:
+ .pragma "nounroll";
+ shl.b64 %rd2071, %rd2691, 2;
+ add.s64 %rd2073, %rd2072, %rd2071;
+ ld.global.nc.u32 %r6284, [%rd2073];
+ mad.wide.u32 %rd2074, %r6284, %r1819, %rd2692;
+ shr.u64 %rd2692, %rd2074, 32;
+ add.s64 %rd2075, %rd1, %rd2071;
+ st.local.u32 [%rd2075], %rd2074;
+ add.s32 %r8339, %r8339, 1;
+ cvt.s64.s32 %rd2691, %r8339;
+ setp.ne.s32 %p1163, %r8339, 6;
+ @%p1163 bra $L__BB0_1365;
+
+ st.local.u32 [%rd4], %rd2692;
+ mov.u32 %r6285, 4;
+ sub.s32 %r1823, %r6285, %r1820;
+ mov.u32 %r6286, 6;
+ sub.s32 %r6287, %r6286, %r1820;
+ mul.wide.s32 %rd2076, %r6287, 4;
+ add.s64 %rd2077, %rd1, %rd2076;
+ ld.local.u32 %r8340, [%rd2077];
+ ld.local.u32 %r8341, [%rd2077+-4];
+ and.b32 %r1826, %r1818, 31;
+ setp.eq.s32 %p1164, %r1826, 0;
+ @%p1164 bra $L__BB0_1368;
+
+ mov.u32 %r6288, 32;
+ sub.s32 %r6289, %r6288, %r1826;
+ shr.u32 %r6290, %r8341, %r6289;
+ shl.b32 %r6291, %r8340, %r1826;
+ add.s32 %r8340, %r6290, %r6291;
+ mul.wide.s32 %rd2078, %r1823, 4;
+ add.s64 %rd2079, %rd1, %rd2078;
+ ld.local.u32 %r6292, [%rd2079];
+ shr.u32 %r6293, %r6292, %r6289;
+ shl.b32 %r6294, %r8341, %r1826;
+ add.s32 %r8341, %r6293, %r6294;
+
+$L__BB0_1368:
+ and.b32 %r6295, %r1817, -2147483648;
+ shr.u32 %r6296, %r8341, 30;
+ shl.b32 %r6297, %r8340, 2;
+ or.b32 %r6298, %r6296, %r6297;
+ shr.u32 %r6299, %r6298, 31;
+ shr.u32 %r6300, %r8340, 30;
+ add.s32 %r6301, %r6299, %r6300;
+ neg.s32 %r6302, %r6301;
+ setp.eq.s32 %p1165, %r6295, 0;
+ selp.b32 %r8342, %r6301, %r6302, %p1165;
+ setp.ne.s32 %p1166, %r6299, 0;
+ xor.b32 %r6303, %r6295, -2147483648;
+ selp.b32 %r6304, %r6303, %r6295, %p1166;
+ selp.b32 %r6305, -1, 0, %p1166;
+ xor.b32 %r6306, %r6298, %r6305;
+ shl.b32 %r6307, %r8341, 2;
+ xor.b32 %r6308, %r6307, %r6305;
+ cvt.u64.u32 %rd2080, %r6306;
+ cvt.u64.u32 %rd2081, %r6308;
+ bfi.b64 %rd2082, %rd2080, %rd2081, 32, 32;
+ cvt.rn.f64.s64 %fd187, %rd2082;
+ mul.f64 %fd188, %fd187, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4323, %fd188;
+ setp.eq.s32 %p1167, %r6304, 0;
+ neg.f32 %f4324, %f4323;
+ selp.f32 %f5782, %f4323, %f4324, %p1167;
+
+$L__BB0_1370:
+ add.s32 %r1833, %r8342, 1;
and.b32 %r1834, %r1833, 1;
- setp.eq.s32 %p1169, %r1834, 0;
- selp.f32 %f1540, %f5692, 0f3F800000, %p1169;
- mul.rn.f32 %f1541, %f5692, %f5692;
- mov.f32 %f5693, 0fB94D4153;
- @%p1169 bra $L__BB0_1375;
-
- mov.f32 %f4229, 0fBAB607ED;
- mov.f32 %f4230, 0f37CBAC00;
- fma.rn.f32 %f5693, %f4230, %f1541, %f4229;
-
-$L__BB0_1375:
- selp.f32 %f4231, 0f3C0885E4, 0f3D2AAABB, %p1169;
- fma.rn.f32 %f4232, %f5693, %f1541, %f4231;
- selp.f32 %f4233, 0fBE2AAAA8, 0fBEFFFFFF, %p1169;
- fma.rn.f32 %f4234, %f4232, %f1541, %f4233;
- mov.f32 %f4235, 0f00000000;
- fma.rn.f32 %f4236, %f1541, %f1540, %f4235;
- fma.rn.f32 %f5694, %f4234, %f4236, %f1540;
- and.b32 %r6384, %r1833, 2;
- setp.eq.s32 %p1171, %r6384, 0;
- @%p1171 bra $L__BB0_1377;
-
- mov.f32 %f4238, 0fBF800000;
- fma.rn.f32 %f5694, %f5694, %f4238, %f4235;
+ setp.eq.s32 %p1168, %r1834, 0;
+ selp.f32 %f1589, %f5782, 0f3F800000, %p1168;
+ mul.rn.f32 %f1590, %f5782, %f5782;
+ mov.f32 %f5783, 0fB94D4153;
+ @%p1168 bra $L__BB0_1372;
+
+ mov.f32 %f4327, 0fBAB607ED;
+ mov.f32 %f4328, 0f37CBAC00;
+ fma.rn.f32 %f5783, %f4328, %f1590, %f4327;
+
+$L__BB0_1372:
+ selp.f32 %f4329, 0f3C0885E4, 0f3D2AAABB, %p1168;
+ fma.rn.f32 %f4330, %f5783, %f1590, %f4329;
+ selp.f32 %f4331, 0fBE2AAAA8, 0fBEFFFFFF, %p1168;
+ fma.rn.f32 %f4332, %f4330, %f1590, %f4331;
+ mov.f32 %f4333, 0f00000000;
+ fma.rn.f32 %f4334, %f1590, %f1589, %f4333;
+ fma.rn.f32 %f5784, %f4332, %f4334, %f1589;
+ and.b32 %r6310, %r1833, 2;
+ setp.eq.s32 %p1170, %r6310, 0;
+ @%p1170 bra $L__BB0_1374;
+
+ mov.f32 %f4336, 0fBF800000;
+ fma.rn.f32 %f5784, %f5784, %f4336, %f4333;
+
+$L__BB0_1374:
+ add.f32 %f5792, %f5781, %f5784;
+ mul.f32 %f4337, %f1421, 0f3F22F983;
+ cvt.rni.s32.f32 %r8346, %f4337;
+ cvt.rn.f32.s32 %f4338, %r8346;
+ mov.f32 %f4339, 0fBFC90FDA;
+ fma.rn.f32 %f4340, %f4338, %f4339, %f1421;
+ mov.f32 %f4341, 0fB3A22168;
+ fma.rn.f32 %f4342, %f4338, %f4341, %f4340;
+ mov.f32 %f4343, 0fA7C234C5;
+ fma.rn.f32 %f5785, %f4338, %f4343, %f4342;
+ abs.f32 %f1598, %f1421;
+ setp.ltu.f32 %p1171, %f1598, 0f47CE4780;
+ @%p1171 bra $L__BB0_1382;
+
+ setp.eq.f32 %p1172, %f1598, 0f7F800000;
+ @%p1172 bra $L__BB0_1381;
+ bra.uni $L__BB0_1376;
+
+$L__BB0_1381:
+ mov.f32 %f4346, 0f00000000;
+ mul.rn.f32 %f5785, %f1421, %f4346;
+ mov.u32 %r8346, 0;
+ bra.uni $L__BB0_1382;
+
+$L__BB0_1376:
+ mov.b32 %r1836, %f1421;
+ shr.u32 %r6312, %r1836, 23;
+ and.b32 %r6313, %r6312, 255;
+ add.s32 %r1837, %r6313, -128;
+ shl.b32 %r6314, %r1836, 8;
+ or.b32 %r1838, %r6314, -2147483648;
+ shr.u32 %r1839, %r1837, 5;
+ mov.u64 %rd2693, 0;
+ mov.u32 %r8343, 0;
+ mov.u64 %rd2086, __cudart_i2opi_f;
+ mov.u64 %rd2694, %rd2693;
$L__BB0_1377:
- add.f32 %f5709, %f5691, %f5694;
- mul.f32 %f4239, %f1396, 0f3F22F983;
- cvt.rni.s32.f32 %r8597, %f4239;
- cvt.rn.f32.s32 %f4240, %r8597;
- mov.f32 %f4241, 0fBFC90FDA;
- fma.rn.f32 %f4242, %f4240, %f4241, %f1396;
- mov.f32 %f4243, 0fB3A22168;
- fma.rn.f32 %f4244, %f4240, %f4243, %f4242;
- mov.f32 %f4245, 0fA7C234C5;
- fma.rn.f32 %f5695, %f4240, %f4245, %f4244;
- abs.f32 %f1549, %f1396;
- setp.ltu.f32 %p1172, %f1549, 0f47CE4780;
- @%p1172 bra $L__BB0_1385;
-
- setp.eq.f32 %p1173, %f1549, 0f7F800000;
- @%p1173 bra $L__BB0_1384;
- bra.uni $L__BB0_1379;
+ .pragma "nounroll";
+ shl.b64 %rd2085, %rd2693, 2;
+ add.s64 %rd2087, %rd2086, %rd2085;
+ ld.global.nc.u32 %r6315, [%rd2087];
+ mad.wide.u32 %rd2088, %r6315, %r1838, %rd2694;
+ shr.u64 %rd2694, %rd2088, 32;
+ add.s64 %rd2089, %rd1, %rd2085;
+ st.local.u32 [%rd2089], %rd2088;
+ add.s32 %r8343, %r8343, 1;
+ cvt.s64.s32 %rd2693, %r8343;
+ setp.ne.s32 %p1173, %r8343, 6;
+ @%p1173 bra $L__BB0_1377;
+
+ st.local.u32 [%rd4], %rd2694;
+ mov.u32 %r6316, 4;
+ sub.s32 %r1842, %r6316, %r1839;
+ mov.u32 %r6317, 6;
+ sub.s32 %r6318, %r6317, %r1839;
+ mul.wide.s32 %rd2090, %r6318, 4;
+ add.s64 %rd2091, %rd1, %rd2090;
+ ld.local.u32 %r8344, [%rd2091];
+ ld.local.u32 %r8345, [%rd2091+-4];
+ and.b32 %r1845, %r1837, 31;
+ setp.eq.s32 %p1174, %r1845, 0;
+ @%p1174 bra $L__BB0_1380;
+
+ mov.u32 %r6319, 32;
+ sub.s32 %r6320, %r6319, %r1845;
+ shr.u32 %r6321, %r8345, %r6320;
+ shl.b32 %r6322, %r8344, %r1845;
+ add.s32 %r8344, %r6321, %r6322;
+ mul.wide.s32 %rd2092, %r1842, 4;
+ add.s64 %rd2093, %rd1, %rd2092;
+ ld.local.u32 %r6323, [%rd2093];
+ shr.u32 %r6324, %r6323, %r6320;
+ shl.b32 %r6325, %r8345, %r1845;
+ add.s32 %r8345, %r6324, %r6325;
+
+$L__BB0_1380:
+ and.b32 %r6326, %r1836, -2147483648;
+ shr.u32 %r6327, %r8345, 30;
+ shl.b32 %r6328, %r8344, 2;
+ or.b32 %r6329, %r6327, %r6328;
+ shr.u32 %r6330, %r6329, 31;
+ shr.u32 %r6331, %r8344, 30;
+ add.s32 %r6332, %r6330, %r6331;
+ neg.s32 %r6333, %r6332;
+ setp.eq.s32 %p1175, %r6326, 0;
+ selp.b32 %r8346, %r6332, %r6333, %p1175;
+ setp.ne.s32 %p1176, %r6330, 0;
+ xor.b32 %r6334, %r6326, -2147483648;
+ selp.b32 %r6335, %r6334, %r6326, %p1176;
+ selp.b32 %r6336, -1, 0, %p1176;
+ xor.b32 %r6337, %r6329, %r6336;
+ shl.b32 %r6338, %r8345, 2;
+ xor.b32 %r6339, %r6338, %r6336;
+ cvt.u64.u32 %rd2094, %r6337;
+ cvt.u64.u32 %rd2095, %r6339;
+ bfi.b64 %rd2096, %rd2094, %rd2095, 32, 32;
+ cvt.rn.f64.s64 %fd189, %rd2096;
+ mul.f64 %fd190, %fd189, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4344, %fd190;
+ setp.eq.s32 %p1177, %r6335, 0;
+ neg.f32 %f4345, %f4344;
+ selp.f32 %f5785, %f4344, %f4345, %p1177;
+
+$L__BB0_1382:
+ and.b32 %r1852, %r8346, 1;
+ setp.eq.s32 %p1178, %r1852, 0;
+ selp.f32 %f1602, %f5785, 0f3F800000, %p1178;
+ mul.rn.f32 %f1603, %f5785, %f5785;
+ mov.f32 %f5786, 0fB94D4153;
+ @%p1178 bra $L__BB0_1384;
+
+ mov.f32 %f4348, 0fBAB607ED;
+ mov.f32 %f4349, 0f37CBAC00;
+ fma.rn.f32 %f5786, %f4349, %f1603, %f4348;
$L__BB0_1384:
- mov.f32 %f4248, 0f00000000;
- mul.rn.f32 %f5695, %f1396, %f4248;
- mov.u32 %r8597, 0;
- bra.uni $L__BB0_1385;
-
-$L__BB0_1379:
- mov.b32 %r1836, %f1396;
- shr.u32 %r6386, %r1836, 23;
- and.b32 %r6387, %r6386, 255;
- add.s32 %r1837, %r6387, -128;
- shl.b32 %r6388, %r1836, 8;
- or.b32 %r1838, %r6388, -2147483648;
- shr.u32 %r1839, %r1837, 5;
- mov.u64 %rd2690, 0;
- mov.u32 %r8594, 0;
- mov.u64 %rd2688, __cudart_i2opi_f;
- mov.u64 %rd2689, %rd1;
-
-$L__BB0_1380:
+ selp.f32 %f4350, 0f3C0885E4, 0f3D2AAABB, %p1178;
+ fma.rn.f32 %f4351, %f5786, %f1603, %f4350;
+ selp.f32 %f4352, 0fBE2AAAA8, 0fBEFFFFFF, %p1178;
+ fma.rn.f32 %f4353, %f4351, %f1603, %f4352;
+ mov.f32 %f4354, 0f00000000;
+ fma.rn.f32 %f4355, %f1603, %f1602, %f4354;
+ fma.rn.f32 %f5787, %f4353, %f4355, %f1602;
+ and.b32 %r6341, %r8346, 2;
+ setp.eq.s32 %p1180, %r6341, 0;
+ @%p1180 bra $L__BB0_1386;
+
+ mov.f32 %f4357, 0fBF800000;
+ fma.rn.f32 %f5787, %f5787, %f4357, %f4354;
+
+$L__BB0_1386:
+ mul.f32 %f4358, %f1413, 0f3F22F983;
+ cvt.rni.s32.f32 %r8350, %f4358;
+ cvt.rn.f32.s32 %f4359, %r8350;
+ mov.f32 %f4360, 0fBFC90FDA;
+ fma.rn.f32 %f4361, %f4359, %f4360, %f1413;
+ mov.f32 %f4362, 0fB3A22168;
+ fma.rn.f32 %f4363, %f4359, %f4362, %f4361;
+ mov.f32 %f4364, 0fA7C234C5;
+ fma.rn.f32 %f5788, %f4359, %f4364, %f4363;
+ abs.f32 %f1610, %f1413;
+ setp.ltu.f32 %p1181, %f1610, 0f47CE4780;
+ @%p1181 bra $L__BB0_1394;
+
+ setp.eq.f32 %p1182, %f1610, 0f7F800000;
+ @%p1182 bra $L__BB0_1393;
+ bra.uni $L__BB0_1388;
+
+$L__BB0_1393:
+ mov.f32 %f4367, 0f00000000;
+ mul.rn.f32 %f5788, %f1413, %f4367;
+ mov.u32 %r8350, 0;
+ bra.uni $L__BB0_1394;
+
+$L__BB0_1388:
+ mov.b32 %r1854, %f1413;
+ shr.u32 %r6343, %r1854, 23;
+ and.b32 %r6344, %r6343, 255;
+ add.s32 %r1855, %r6344, -128;
+ shl.b32 %r6345, %r1854, 8;
+ or.b32 %r1856, %r6345, -2147483648;
+ shr.u32 %r1857, %r1855, 5;
+ mov.u64 %rd2695, 0;
+ mov.u32 %r8347, 0;
+ mov.u64 %rd2100, __cudart_i2opi_f;
+ mov.u64 %rd2696, %rd2695;
+
+$L__BB0_1389:
.pragma "nounroll";
- ld.global.nc.u32 %r6389, [%rd2688];
- mad.wide.u32 %rd2057, %r6389, %r1838, %rd2690;
- shr.u64 %rd2690, %rd2057, 32;
- st.local.u32 [%rd2689], %rd2057;
- add.s64 %rd2689, %rd2689, 4;
- add.s64 %rd2688, %rd2688, 4;
- add.s32 %r8594, %r8594, 1;
- setp.ne.s32 %p1174, %r8594, 6;
- @%p1174 bra $L__BB0_1380;
-
- st.local.u32 [%rd5], %rd2690;
- mov.u32 %r6390, 4;
- sub.s32 %r1842, %r6390, %r1839;
- mov.u32 %r6391, 6;
- sub.s32 %r6392, %r6391, %r1839;
- mul.wide.s32 %rd2058, %r6392, 4;
- add.s64 %rd2059, %rd1, %rd2058;
- ld.local.u32 %r8595, [%rd2059];
- ld.local.u32 %r8596, [%rd2059+-4];
- and.b32 %r1845, %r1837, 31;
- setp.eq.s32 %p1175, %r1845, 0;
- @%p1175 bra $L__BB0_1383;
-
- mov.u32 %r6393, 32;
- sub.s32 %r6394, %r6393, %r1845;
- shr.u32 %r6395, %r8596, %r6394;
- shl.b32 %r6396, %r8595, %r1845;
- add.s32 %r8595, %r6395, %r6396;
- mul.wide.s32 %rd2060, %r1842, 4;
- add.s64 %rd2061, %rd1, %rd2060;
- ld.local.u32 %r6397, [%rd2061];
- shr.u32 %r6398, %r6397, %r6394;
- shl.b32 %r6399, %r8596, %r1845;
- add.s32 %r8596, %r6398, %r6399;
-
-$L__BB0_1383:
- and.b32 %r6400, %r1836, -2147483648;
- shr.u32 %r6401, %r8596, 30;
- shl.b32 %r6402, %r8595, 2;
- or.b32 %r6403, %r6401, %r6402;
+ shl.b64 %rd2099, %rd2695, 2;
+ add.s64 %rd2101, %rd2100, %rd2099;
+ ld.global.nc.u32 %r6346, [%rd2101];
+ mad.wide.u32 %rd2102, %r6346, %r1856, %rd2696;
+ shr.u64 %rd2696, %rd2102, 32;
+ add.s64 %rd2103, %rd1, %rd2099;
+ st.local.u32 [%rd2103], %rd2102;
+ add.s32 %r8347, %r8347, 1;
+ cvt.s64.s32 %rd2695, %r8347;
+ setp.ne.s32 %p1183, %r8347, 6;
+ @%p1183 bra $L__BB0_1389;
+
+ st.local.u32 [%rd4], %rd2696;
+ mov.u32 %r6347, 4;
+ sub.s32 %r1860, %r6347, %r1857;
+ mov.u32 %r6348, 6;
+ sub.s32 %r6349, %r6348, %r1857;
+ mul.wide.s32 %rd2104, %r6349, 4;
+ add.s64 %rd2105, %rd1, %rd2104;
+ ld.local.u32 %r8348, [%rd2105];
+ ld.local.u32 %r8349, [%rd2105+-4];
+ and.b32 %r1863, %r1855, 31;
+ setp.eq.s32 %p1184, %r1863, 0;
+ @%p1184 bra $L__BB0_1392;
+
+ mov.u32 %r6350, 32;
+ sub.s32 %r6351, %r6350, %r1863;
+ shr.u32 %r6352, %r8349, %r6351;
+ shl.b32 %r6353, %r8348, %r1863;
+ add.s32 %r8348, %r6352, %r6353;
+ mul.wide.s32 %rd2106, %r1860, 4;
+ add.s64 %rd2107, %rd1, %rd2106;
+ ld.local.u32 %r6354, [%rd2107];
+ shr.u32 %r6355, %r6354, %r6351;
+ shl.b32 %r6356, %r8349, %r1863;
+ add.s32 %r8349, %r6355, %r6356;
+
+$L__BB0_1392:
+ and.b32 %r6357, %r1854, -2147483648;
+ shr.u32 %r6358, %r8349, 30;
+ shl.b32 %r6359, %r8348, 2;
+ or.b32 %r6360, %r6358, %r6359;
+ shr.u32 %r6361, %r6360, 31;
+ shr.u32 %r6362, %r8348, 30;
+ add.s32 %r6363, %r6361, %r6362;
+ neg.s32 %r6364, %r6363;
+ setp.eq.s32 %p1185, %r6357, 0;
+ selp.b32 %r8350, %r6363, %r6364, %p1185;
+ setp.ne.s32 %p1186, %r6361, 0;
+ xor.b32 %r6365, %r6357, -2147483648;
+ selp.b32 %r6366, %r6365, %r6357, %p1186;
+ selp.b32 %r6367, -1, 0, %p1186;
+ xor.b32 %r6368, %r6360, %r6367;
+ shl.b32 %r6369, %r8349, 2;
+ xor.b32 %r6370, %r6369, %r6367;
+ cvt.u64.u32 %rd2108, %r6368;
+ cvt.u64.u32 %rd2109, %r6370;
+ bfi.b64 %rd2110, %rd2108, %rd2109, 32, 32;
+ cvt.rn.f64.s64 %fd191, %rd2110;
+ mul.f64 %fd192, %fd191, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4365, %fd192;
+ setp.eq.s32 %p1187, %r6366, 0;
+ neg.f32 %f4366, %f4365;
+ selp.f32 %f5788, %f4365, %f4366, %p1187;
+
+$L__BB0_1394:
+ add.s32 %r1870, %r8350, 1;
+ and.b32 %r1871, %r1870, 1;
+ setp.eq.s32 %p1188, %r1871, 0;
+ selp.f32 %f1614, %f5788, 0f3F800000, %p1188;
+ mul.rn.f32 %f1615, %f5788, %f5788;
+ mov.f32 %f5789, 0fB94D4153;
+ @%p1188 bra $L__BB0_1396;
+
+ mov.f32 %f4369, 0fBAB607ED;
+ mov.f32 %f4370, 0f37CBAC00;
+ fma.rn.f32 %f5789, %f4370, %f1615, %f4369;
+
+$L__BB0_1396:
+ selp.f32 %f4371, 0f3C0885E4, 0f3D2AAABB, %p1188;
+ fma.rn.f32 %f4372, %f5789, %f1615, %f4371;
+ selp.f32 %f4373, 0fBE2AAAA8, 0fBEFFFFFF, %p1188;
+ fma.rn.f32 %f4374, %f4372, %f1615, %f4373;
+ mov.f32 %f4375, 0f00000000;
+ fma.rn.f32 %f4376, %f1615, %f1614, %f4375;
+ fma.rn.f32 %f5790, %f4374, %f4376, %f1614;
+ and.b32 %r6372, %r1870, 2;
+ setp.eq.s32 %p1190, %r6372, 0;
+ @%p1190 bra $L__BB0_1398;
+
+ mov.f32 %f4378, 0fBF800000;
+ fma.rn.f32 %f5790, %f5790, %f4378, %f4375;
+
+$L__BB0_1398:
+ add.f32 %f5791, %f5787, %f5790;
+ bra.uni $L__BB0_1399;
+
+$L__BB0_978:
+ mov.b32 %r1274, %f5416;
+ shr.u32 %r5299, %r1274, 23;
+ and.b32 %r5300, %r5299, 255;
+ add.s32 %r1275, %r5300, -128;
+ shl.b32 %r5301, %r1274, 8;
+ or.b32 %r1276, %r5301, -2147483648;
+ shr.u32 %r1277, %r1275, 5;
+ mov.u64 %rd2633, 0;
+ mov.u32 %r8223, 0;
+ mov.u64 %rd1637, __cudart_i2opi_f;
+ mov.u64 %rd2634, %rd2633;
+
+$L__BB0_979:
+ .pragma "nounroll";
+ shl.b64 %rd1636, %rd2633, 2;
+ add.s64 %rd1638, %rd1637, %rd1636;
+ ld.global.nc.u32 %r5302, [%rd1638];
+ mad.wide.u32 %rd1639, %r5302, %r1276, %rd2634;
+ shr.u64 %rd2634, %rd1639, 32;
+ add.s64 %rd1640, %rd1, %rd1636;
+ st.local.u32 [%rd1640], %rd1639;
+ add.s32 %r8223, %r8223, 1;
+ cvt.s64.s32 %rd2633, %r8223;
+ setp.ne.s32 %p835, %r8223, 6;
+ @%p835 bra $L__BB0_979;
+
+ st.local.u32 [%rd4], %rd2634;
+ mov.u32 %r5303, 4;
+ sub.s32 %r1280, %r5303, %r1277;
+ mov.u32 %r5304, 6;
+ sub.s32 %r5305, %r5304, %r1277;
+ mul.wide.s32 %rd1641, %r5305, 4;
+ add.s64 %rd1642, %rd1, %rd1641;
+ ld.local.u32 %r8224, [%rd1642];
+ ld.local.u32 %r8225, [%rd1642+-4];
+ and.b32 %r1283, %r1275, 31;
+ setp.eq.s32 %p836, %r1283, 0;
+ @%p836 bra $L__BB0_982;
+
+ mov.u32 %r5306, 32;
+ sub.s32 %r5307, %r5306, %r1283;
+ shr.u32 %r5308, %r8225, %r5307;
+ shl.b32 %r5309, %r8224, %r1283;
+ add.s32 %r8224, %r5308, %r5309;
+ mul.wide.s32 %rd1643, %r1280, 4;
+ add.s64 %rd1644, %rd1, %rd1643;
+ ld.local.u32 %r5310, [%rd1644];
+ shr.u32 %r5311, %r5310, %r5307;
+ shl.b32 %r5312, %r8225, %r1283;
+ add.s32 %r8225, %r5311, %r5312;
+
+$L__BB0_982:
+ and.b32 %r5313, %r1274, -2147483648;
+ shr.u32 %r5314, %r8225, 30;
+ shl.b32 %r5315, %r8224, 2;
+ or.b32 %r5316, %r5314, %r5315;
+ shr.u32 %r5317, %r5316, 31;
+ shr.u32 %r5318, %r8224, 30;
+ add.s32 %r5319, %r5317, %r5318;
+ neg.s32 %r5320, %r5319;
+ setp.eq.s32 %p837, %r5313, 0;
+ selp.b32 %r8226, %r5319, %r5320, %p837;
+ setp.ne.s32 %p838, %r5317, 0;
+ xor.b32 %r5321, %r5313, -2147483648;
+ selp.b32 %r5322, %r5321, %r5313, %p838;
+ selp.b32 %r5323, -1, 0, %p838;
+ xor.b32 %r5324, %r5316, %r5323;
+ shl.b32 %r5325, %r8225, 2;
+ xor.b32 %r5326, %r5325, %r5323;
+ cvt.u64.u32 %rd1645, %r5324;
+ cvt.u64.u32 %rd1646, %r5326;
+ bfi.b64 %rd1647, %rd1645, %rd1646, 32, 32;
+ cvt.rn.f64.s64 %fd129, %rd1647;
+ mul.f64 %fd130, %fd129, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3699, %fd130;
+ setp.eq.s32 %p839, %r5322, 0;
+ neg.f32 %f3700, %f3699;
+ selp.f32 %f5644, %f3699, %f3700, %p839;
+
+$L__BB0_984:
+ and.b32 %r1290, %r8226, 1;
+ setp.eq.s32 %p840, %r1290, 0;
+ selp.f32 %f1144, %f5644, 0f3F800000, %p840;
+ mul.rn.f32 %f1145, %f5644, %f5644;
+ mov.f32 %f5645, 0fB94D4153;
+ @%p840 bra $L__BB0_986;
+
+ mov.f32 %f3703, 0fBAB607ED;
+ mov.f32 %f3704, 0f37CBAC00;
+ fma.rn.f32 %f5645, %f3704, %f1145, %f3703;
+
+$L__BB0_986:
+ selp.f32 %f3705, 0f3C0885E4, 0f3D2AAABB, %p840;
+ fma.rn.f32 %f3706, %f5645, %f1145, %f3705;
+ selp.f32 %f3707, 0fBE2AAAA8, 0fBEFFFFFF, %p840;
+ fma.rn.f32 %f3708, %f3706, %f1145, %f3707;
+ mov.f32 %f3709, 0f00000000;
+ fma.rn.f32 %f3710, %f1145, %f1144, %f3709;
+ fma.rn.f32 %f5281, %f3708, %f3710, %f1144;
+ and.b32 %r5328, %r8226, 2;
+ setp.eq.s32 %p842, %r5328, 0;
+ @%p842 bra $L__BB0_988;
+
+ mov.f32 %f3712, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3712, %f3709;
+
+$L__BB0_988:
+ setp.lt.s32 %p16, %r11, %r1272;
+ @%p832 bra $L__BB0_1001;
+
+ mul.f32 %f3713, %f5607, 0f3F22F983;
+ cvt.rni.s32.f32 %r8230, %f3713;
+ cvt.rn.f32.s32 %f3714, %r8230;
+ mov.f32 %f3715, 0fBFC90FDA;
+ fma.rn.f32 %f3716, %f3714, %f3715, %f5607;
+ mov.f32 %f3717, 0fB3A22168;
+ fma.rn.f32 %f3718, %f3714, %f3717, %f3716;
+ mov.f32 %f3719, 0fA7C234C5;
+ fma.rn.f32 %f5648, %f3714, %f3719, %f3718;
+ abs.f32 %f1153, %f5607;
+ setp.ltu.f32 %p844, %f1153, 0f47CE4780;
+ @%p844 bra $L__BB0_997;
+
+ setp.eq.f32 %p845, %f1153, 0f7F800000;
+ @%p845 bra $L__BB0_996;
+ bra.uni $L__BB0_991;
+
+$L__BB0_996:
+ mov.f32 %f3722, 0f00000000;
+ mul.rn.f32 %f5648, %f5607, %f3722;
+ mov.u32 %r8230, 0;
+ bra.uni $L__BB0_997;
+
+$L__BB0_991:
+ mov.b32 %r1292, %f5607;
+ shr.u32 %r5330, %r1292, 23;
+ and.b32 %r5331, %r5330, 255;
+ add.s32 %r1293, %r5331, -128;
+ shl.b32 %r5332, %r1292, 8;
+ or.b32 %r1294, %r5332, -2147483648;
+ shr.u32 %r1295, %r1293, 5;
+ mov.u64 %rd2635, 0;
+ mov.u32 %r8227, 0;
+ mov.u64 %rd1651, __cudart_i2opi_f;
+ mov.u64 %rd2636, %rd2635;
+
+$L__BB0_992:
+ .pragma "nounroll";
+ shl.b64 %rd1650, %rd2635, 2;
+ add.s64 %rd1652, %rd1651, %rd1650;
+ ld.global.nc.u32 %r5333, [%rd1652];
+ mad.wide.u32 %rd1653, %r5333, %r1294, %rd2636;
+ shr.u64 %rd2636, %rd1653, 32;
+ add.s64 %rd1654, %rd1, %rd1650;
+ st.local.u32 [%rd1654], %rd1653;
+ add.s32 %r8227, %r8227, 1;
+ cvt.s64.s32 %rd2635, %r8227;
+ setp.ne.s32 %p846, %r8227, 6;
+ @%p846 bra $L__BB0_992;
+
+ st.local.u32 [%rd4], %rd2636;
+ mov.u32 %r5334, 4;
+ sub.s32 %r1298, %r5334, %r1295;
+ mov.u32 %r5335, 6;
+ sub.s32 %r5336, %r5335, %r1295;
+ mul.wide.s32 %rd1655, %r5336, 4;
+ add.s64 %rd1656, %rd1, %rd1655;
+ ld.local.u32 %r8228, [%rd1656];
+ ld.local.u32 %r8229, [%rd1656+-4];
+ and.b32 %r1301, %r1293, 31;
+ setp.eq.s32 %p847, %r1301, 0;
+ @%p847 bra $L__BB0_995;
+
+ mov.u32 %r5337, 32;
+ sub.s32 %r5338, %r5337, %r1301;
+ shr.u32 %r5339, %r8229, %r5338;
+ shl.b32 %r5340, %r8228, %r1301;
+ add.s32 %r8228, %r5339, %r5340;
+ mul.wide.s32 %rd1657, %r1298, 4;
+ add.s64 %rd1658, %rd1, %rd1657;
+ ld.local.u32 %r5341, [%rd1658];
+ shr.u32 %r5342, %r5341, %r5338;
+ shl.b32 %r5343, %r8229, %r1301;
+ add.s32 %r8229, %r5342, %r5343;
+
+$L__BB0_995:
+ and.b32 %r5344, %r1292, -2147483648;
+ shr.u32 %r5345, %r8229, 30;
+ shl.b32 %r5346, %r8228, 2;
+ or.b32 %r5347, %r5345, %r5346;
+ shr.u32 %r5348, %r5347, 31;
+ shr.u32 %r5349, %r8228, 30;
+ add.s32 %r5350, %r5348, %r5349;
+ neg.s32 %r5351, %r5350;
+ setp.eq.s32 %p848, %r5344, 0;
+ selp.b32 %r8230, %r5350, %r5351, %p848;
+ setp.ne.s32 %p849, %r5348, 0;
+ xor.b32 %r5352, %r5344, -2147483648;
+ selp.b32 %r5353, %r5352, %r5344, %p849;
+ selp.b32 %r5354, -1, 0, %p849;
+ xor.b32 %r5355, %r5347, %r5354;
+ shl.b32 %r5356, %r8229, 2;
+ xor.b32 %r5357, %r5356, %r5354;
+ cvt.u64.u32 %rd1659, %r5355;
+ cvt.u64.u32 %rd1660, %r5357;
+ bfi.b64 %rd1661, %rd1659, %rd1660, 32, 32;
+ cvt.rn.f64.s64 %fd131, %rd1661;
+ mul.f64 %fd132, %fd131, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3720, %fd132;
+ setp.eq.s32 %p850, %r5353, 0;
+ neg.f32 %f3721, %f3720;
+ selp.f32 %f5648, %f3720, %f3721, %p850;
+
+$L__BB0_997:
+ add.s32 %r1308, %r8230, 1;
+ and.b32 %r1309, %r1308, 1;
+ setp.eq.s32 %p851, %r1309, 0;
+ selp.f32 %f1157, %f5648, 0f3F800000, %p851;
+ mul.rn.f32 %f1158, %f5648, %f5648;
+ mov.f32 %f5649, 0fB94D4153;
+ @%p851 bra $L__BB0_999;
+
+ mov.f32 %f3724, 0fBAB607ED;
+ mov.f32 %f3725, 0f37CBAC00;
+ fma.rn.f32 %f5649, %f3725, %f1158, %f3724;
+
+$L__BB0_999:
+ selp.f32 %f3726, 0f3C0885E4, 0f3D2AAABB, %p851;
+ fma.rn.f32 %f3727, %f5649, %f1158, %f3726;
+ selp.f32 %f3728, 0fBE2AAAA8, 0fBEFFFFFF, %p851;
+ fma.rn.f32 %f3729, %f3727, %f1158, %f3728;
+ mov.f32 %f3730, 0f00000000;
+ fma.rn.f32 %f3731, %f1158, %f1157, %f3730;
+ fma.rn.f32 %f5283, %f3729, %f3731, %f1157;
+ and.b32 %r5359, %r1308, 2;
+ setp.eq.s32 %p853, %r5359, 0;
+ @%p853 bra $L__BB0_1001;
+
+ mov.f32 %f3733, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3733, %f3730;
+
+$L__BB0_1001:
+ selp.f32 %f1165, %f5283, %f5284, %p16;
+ selp.f32 %f1166, %f5281, %f5282, %p16;
+ @%p832 bra $L__BB0_1003;
+
+ add.f32 %f5798, %f1166, %f1165;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1003:
+ @%p807 bra $L__BB0_1032;
+
+ shl.b32 %r5360, %r12, 5;
+ mov.u32 %r5361, -32;
+ sub.s32 %r1310, %r5361, %r5360;
+ setp.ge.s32 %p857, %r11, %r1310;
+ @%p857 bra $L__BB0_1017;
+
+ mul.f32 %f3736, %f5415, 0f3F22F983;
+ cvt.rni.s32.f32 %r8234, %f3736;
+ cvt.rn.f32.s32 %f3737, %r8234;
+ mov.f32 %f3738, 0fBFC90FDA;
+ fma.rn.f32 %f3739, %f3737, %f3738, %f5415;
+ mov.f32 %f3740, 0fB3A22168;
+ fma.rn.f32 %f3741, %f3737, %f3740, %f3739;
+ mov.f32 %f3742, 0fA7C234C5;
+ fma.rn.f32 %f5657, %f3737, %f3742, %f3741;
+ abs.f32 %f1174, %f5415;
+ setp.ltu.f32 %p858, %f1174, 0f47CE4780;
+ @%p858 bra $L__BB0_1013;
+
+ setp.eq.f32 %p859, %f1174, 0f7F800000;
+ @%p859 bra $L__BB0_1012;
+ bra.uni $L__BB0_1007;
+
+$L__BB0_1012:
+ mov.f32 %f3745, 0f00000000;
+ mul.rn.f32 %f5657, %f5415, %f3745;
+ mov.u32 %r8234, 0;
+ bra.uni $L__BB0_1013;
+
+$L__BB0_1007:
+ mov.b32 %r1312, %f5415;
+ shr.u32 %r5363, %r1312, 23;
+ and.b32 %r5364, %r5363, 255;
+ add.s32 %r1313, %r5364, -128;
+ shl.b32 %r5365, %r1312, 8;
+ or.b32 %r1314, %r5365, -2147483648;
+ shr.u32 %r1315, %r1313, 5;
+ mov.u64 %rd2637, 0;
+ mov.u32 %r8231, 0;
+ mov.u64 %rd1665, __cudart_i2opi_f;
+ mov.u64 %rd2638, %rd2637;
+
+$L__BB0_1008:
+ .pragma "nounroll";
+ shl.b64 %rd1664, %rd2637, 2;
+ add.s64 %rd1666, %rd1665, %rd1664;
+ ld.global.nc.u32 %r5366, [%rd1666];
+ mad.wide.u32 %rd1667, %r5366, %r1314, %rd2638;
+ shr.u64 %rd2638, %rd1667, 32;
+ add.s64 %rd1668, %rd1, %rd1664;
+ st.local.u32 [%rd1668], %rd1667;
+ add.s32 %r8231, %r8231, 1;
+ cvt.s64.s32 %rd2637, %r8231;
+ setp.ne.s32 %p860, %r8231, 6;
+ @%p860 bra $L__BB0_1008;
+
+ st.local.u32 [%rd4], %rd2638;
+ mov.u32 %r5367, 4;
+ sub.s32 %r1318, %r5367, %r1315;
+ mov.u32 %r5368, 6;
+ sub.s32 %r5369, %r5368, %r1315;
+ mul.wide.s32 %rd1669, %r5369, 4;
+ add.s64 %rd1670, %rd1, %rd1669;
+ ld.local.u32 %r8232, [%rd1670];
+ ld.local.u32 %r8233, [%rd1670+-4];
+ and.b32 %r1321, %r1313, 31;
+ setp.eq.s32 %p861, %r1321, 0;
+ @%p861 bra $L__BB0_1011;
+
+ mov.u32 %r5370, 32;
+ sub.s32 %r5371, %r5370, %r1321;
+ shr.u32 %r5372, %r8233, %r5371;
+ shl.b32 %r5373, %r8232, %r1321;
+ add.s32 %r8232, %r5372, %r5373;
+ mul.wide.s32 %rd1671, %r1318, 4;
+ add.s64 %rd1672, %rd1, %rd1671;
+ ld.local.u32 %r5374, [%rd1672];
+ shr.u32 %r5375, %r5374, %r5371;
+ shl.b32 %r5376, %r8233, %r1321;
+ add.s32 %r8233, %r5375, %r5376;
+
+$L__BB0_1011:
+ and.b32 %r5377, %r1312, -2147483648;
+ shr.u32 %r5378, %r8233, 30;
+ shl.b32 %r5379, %r8232, 2;
+ or.b32 %r5380, %r5378, %r5379;
+ shr.u32 %r5381, %r5380, 31;
+ shr.u32 %r5382, %r8232, 30;
+ add.s32 %r5383, %r5381, %r5382;
+ neg.s32 %r5384, %r5383;
+ setp.eq.s32 %p862, %r5377, 0;
+ selp.b32 %r8234, %r5383, %r5384, %p862;
+ setp.ne.s32 %p863, %r5381, 0;
+ xor.b32 %r5385, %r5377, -2147483648;
+ selp.b32 %r5386, %r5385, %r5377, %p863;
+ selp.b32 %r5387, -1, 0, %p863;
+ xor.b32 %r5388, %r5380, %r5387;
+ shl.b32 %r5389, %r8233, 2;
+ xor.b32 %r5390, %r5389, %r5387;
+ cvt.u64.u32 %rd1673, %r5388;
+ cvt.u64.u32 %rd1674, %r5390;
+ bfi.b64 %rd1675, %rd1673, %rd1674, 32, 32;
+ cvt.rn.f64.s64 %fd133, %rd1675;
+ mul.f64 %fd134, %fd133, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3743, %fd134;
+ setp.eq.s32 %p864, %r5386, 0;
+ neg.f32 %f3744, %f3743;
+ selp.f32 %f5657, %f3743, %f3744, %p864;
+
+$L__BB0_1013:
+ and.b32 %r1328, %r8234, 1;
+ setp.eq.s32 %p865, %r1328, 0;
+ selp.f32 %f1178, %f5657, 0f3F800000, %p865;
+ mul.rn.f32 %f1179, %f5657, %f5657;
+ mov.f32 %f5658, 0fB94D4153;
+ @%p865 bra $L__BB0_1015;
+
+ mov.f32 %f3747, 0fBAB607ED;
+ mov.f32 %f3748, 0f37CBAC00;
+ fma.rn.f32 %f5658, %f3748, %f1179, %f3747;
+
+$L__BB0_1015:
+ selp.f32 %f3749, 0f3C0885E4, 0f3D2AAABB, %p865;
+ fma.rn.f32 %f3750, %f5658, %f1179, %f3749;
+ selp.f32 %f3751, 0fBE2AAAA8, 0fBEFFFFFF, %p865;
+ fma.rn.f32 %f3752, %f3750, %f1179, %f3751;
+ mov.f32 %f3753, 0f00000000;
+ fma.rn.f32 %f3754, %f1179, %f1178, %f3753;
+ fma.rn.f32 %f5281, %f3752, %f3754, %f1178;
+ and.b32 %r5392, %r8234, 2;
+ setp.eq.s32 %p867, %r5392, 0;
+ @%p867 bra $L__BB0_1017;
+
+ mov.f32 %f3756, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3756, %f3753;
+
+$L__BB0_1017:
+ setp.lt.s32 %p17, %r11, %r1310;
+ @%p857 bra $L__BB0_1030;
+
+ mul.f32 %f3757, %f5606, 0f3F22F983;
+ cvt.rni.s32.f32 %r8238, %f3757;
+ cvt.rn.f32.s32 %f3758, %r8238;
+ mov.f32 %f3759, 0fBFC90FDA;
+ fma.rn.f32 %f3760, %f3758, %f3759, %f5606;
+ mov.f32 %f3761, 0fB3A22168;
+ fma.rn.f32 %f3762, %f3758, %f3761, %f3760;
+ mov.f32 %f3763, 0fA7C234C5;
+ fma.rn.f32 %f5661, %f3758, %f3763, %f3762;
+ abs.f32 %f1187, %f5606;
+ setp.ltu.f32 %p869, %f1187, 0f47CE4780;
+ @%p869 bra $L__BB0_1026;
+
+ setp.eq.f32 %p870, %f1187, 0f7F800000;
+ @%p870 bra $L__BB0_1025;
+ bra.uni $L__BB0_1020;
+
+$L__BB0_1025:
+ mov.f32 %f3766, 0f00000000;
+ mul.rn.f32 %f5661, %f5606, %f3766;
+ mov.u32 %r8238, 0;
+ bra.uni $L__BB0_1026;
+
+$L__BB0_1020:
+ mov.b32 %r1330, %f5606;
+ shr.u32 %r5394, %r1330, 23;
+ and.b32 %r5395, %r5394, 255;
+ add.s32 %r1331, %r5395, -128;
+ shl.b32 %r5396, %r1330, 8;
+ or.b32 %r1332, %r5396, -2147483648;
+ shr.u32 %r1333, %r1331, 5;
+ mov.u64 %rd2639, 0;
+ mov.u32 %r8235, 0;
+ mov.u64 %rd1679, __cudart_i2opi_f;
+ mov.u64 %rd2640, %rd2639;
+
+$L__BB0_1021:
+ .pragma "nounroll";
+ shl.b64 %rd1678, %rd2639, 2;
+ add.s64 %rd1680, %rd1679, %rd1678;
+ ld.global.nc.u32 %r5397, [%rd1680];
+ mad.wide.u32 %rd1681, %r5397, %r1332, %rd2640;
+ shr.u64 %rd2640, %rd1681, 32;
+ add.s64 %rd1682, %rd1, %rd1678;
+ st.local.u32 [%rd1682], %rd1681;
+ add.s32 %r8235, %r8235, 1;
+ cvt.s64.s32 %rd2639, %r8235;
+ setp.ne.s32 %p871, %r8235, 6;
+ @%p871 bra $L__BB0_1021;
+
+ st.local.u32 [%rd4], %rd2640;
+ mov.u32 %r5398, 4;
+ sub.s32 %r1336, %r5398, %r1333;
+ mov.u32 %r5399, 6;
+ sub.s32 %r5400, %r5399, %r1333;
+ mul.wide.s32 %rd1683, %r5400, 4;
+ add.s64 %rd1684, %rd1, %rd1683;
+ ld.local.u32 %r8236, [%rd1684];
+ ld.local.u32 %r8237, [%rd1684+-4];
+ and.b32 %r1339, %r1331, 31;
+ setp.eq.s32 %p872, %r1339, 0;
+ @%p872 bra $L__BB0_1024;
+
+ mov.u32 %r5401, 32;
+ sub.s32 %r5402, %r5401, %r1339;
+ shr.u32 %r5403, %r8237, %r5402;
+ shl.b32 %r5404, %r8236, %r1339;
+ add.s32 %r8236, %r5403, %r5404;
+ mul.wide.s32 %rd1685, %r1336, 4;
+ add.s64 %rd1686, %rd1, %rd1685;
+ ld.local.u32 %r5405, [%rd1686];
+ shr.u32 %r5406, %r5405, %r5402;
+ shl.b32 %r5407, %r8237, %r1339;
+ add.s32 %r8237, %r5406, %r5407;
+
+$L__BB0_1024:
+ and.b32 %r5408, %r1330, -2147483648;
+ shr.u32 %r5409, %r8237, 30;
+ shl.b32 %r5410, %r8236, 2;
+ or.b32 %r5411, %r5409, %r5410;
+ shr.u32 %r5412, %r5411, 31;
+ shr.u32 %r5413, %r8236, 30;
+ add.s32 %r5414, %r5412, %r5413;
+ neg.s32 %r5415, %r5414;
+ setp.eq.s32 %p873, %r5408, 0;
+ selp.b32 %r8238, %r5414, %r5415, %p873;
+ setp.ne.s32 %p874, %r5412, 0;
+ xor.b32 %r5416, %r5408, -2147483648;
+ selp.b32 %r5417, %r5416, %r5408, %p874;
+ selp.b32 %r5418, -1, 0, %p874;
+ xor.b32 %r5419, %r5411, %r5418;
+ shl.b32 %r5420, %r8237, 2;
+ xor.b32 %r5421, %r5420, %r5418;
+ cvt.u64.u32 %rd1687, %r5419;
+ cvt.u64.u32 %rd1688, %r5421;
+ bfi.b64 %rd1689, %rd1687, %rd1688, 32, 32;
+ cvt.rn.f64.s64 %fd135, %rd1689;
+ mul.f64 %fd136, %fd135, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3764, %fd136;
+ setp.eq.s32 %p875, %r5417, 0;
+ neg.f32 %f3765, %f3764;
+ selp.f32 %f5661, %f3764, %f3765, %p875;
+
+$L__BB0_1026:
+ add.s32 %r1346, %r8238, 1;
+ and.b32 %r1347, %r1346, 1;
+ setp.eq.s32 %p876, %r1347, 0;
+ selp.f32 %f1191, %f5661, 0f3F800000, %p876;
+ mul.rn.f32 %f1192, %f5661, %f5661;
+ mov.f32 %f5662, 0fB94D4153;
+ @%p876 bra $L__BB0_1028;
+
+ mov.f32 %f3768, 0fBAB607ED;
+ mov.f32 %f3769, 0f37CBAC00;
+ fma.rn.f32 %f5662, %f3769, %f1192, %f3768;
+
+$L__BB0_1028:
+ selp.f32 %f3770, 0f3C0885E4, 0f3D2AAABB, %p876;
+ fma.rn.f32 %f3771, %f5662, %f1192, %f3770;
+ selp.f32 %f3772, 0fBE2AAAA8, 0fBEFFFFFF, %p876;
+ fma.rn.f32 %f3773, %f3771, %f1192, %f3772;
+ mov.f32 %f3774, 0f00000000;
+ fma.rn.f32 %f3775, %f1192, %f1191, %f3774;
+ fma.rn.f32 %f5283, %f3773, %f3775, %f1191;
+ and.b32 %r5423, %r1346, 2;
+ setp.eq.s32 %p878, %r5423, 0;
+ @%p878 bra $L__BB0_1030;
+
+ mov.f32 %f3777, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3777, %f3774;
+
+$L__BB0_1030:
+ selp.f32 %f1199, %f5283, %f5284, %p17;
+ selp.f32 %f1200, %f5281, %f5282, %p17;
+ @%p857 bra $L__BB0_1032;
+
+ add.f32 %f5797, %f1200, %f1199;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1032:
+ @%p810 bra $L__BB0_1061;
+
+ shl.b32 %r5424, %r12, 5;
+ neg.s32 %r1348, %r5424;
+ setp.ge.s32 %p882, %r11, %r1348;
+ @%p882 bra $L__BB0_1046;
+
+ mul.f32 %f3780, %f5414, 0f3F22F983;
+ cvt.rni.s32.f32 %r8242, %f3780;
+ cvt.rn.f32.s32 %f3781, %r8242;
+ mov.f32 %f3782, 0fBFC90FDA;
+ fma.rn.f32 %f3783, %f3781, %f3782, %f5414;
+ mov.f32 %f3784, 0fB3A22168;
+ fma.rn.f32 %f3785, %f3781, %f3784, %f3783;
+ mov.f32 %f3786, 0fA7C234C5;
+ fma.rn.f32 %f5670, %f3781, %f3786, %f3785;
+ abs.f32 %f1208, %f5414;
+ setp.ltu.f32 %p883, %f1208, 0f47CE4780;
+ @%p883 bra $L__BB0_1042;
+
+ setp.eq.f32 %p884, %f1208, 0f7F800000;
+ @%p884 bra $L__BB0_1041;
+ bra.uni $L__BB0_1036;
+
+$L__BB0_1041:
+ mov.f32 %f3789, 0f00000000;
+ mul.rn.f32 %f5670, %f5414, %f3789;
+ mov.u32 %r8242, 0;
+ bra.uni $L__BB0_1042;
+
+$L__BB0_1036:
+ mov.b32 %r1350, %f5414;
+ shr.u32 %r5426, %r1350, 23;
+ and.b32 %r5427, %r5426, 255;
+ add.s32 %r1351, %r5427, -128;
+ shl.b32 %r5428, %r1350, 8;
+ or.b32 %r1352, %r5428, -2147483648;
+ shr.u32 %r1353, %r1351, 5;
+ mov.u64 %rd2641, 0;
+ mov.u32 %r8239, 0;
+ mov.u64 %rd1693, __cudart_i2opi_f;
+ mov.u64 %rd2642, %rd2641;
+
+$L__BB0_1037:
+ .pragma "nounroll";
+ shl.b64 %rd1692, %rd2641, 2;
+ add.s64 %rd1694, %rd1693, %rd1692;
+ ld.global.nc.u32 %r5429, [%rd1694];
+ mad.wide.u32 %rd1695, %r5429, %r1352, %rd2642;
+ shr.u64 %rd2642, %rd1695, 32;
+ add.s64 %rd1696, %rd1, %rd1692;
+ st.local.u32 [%rd1696], %rd1695;
+ add.s32 %r8239, %r8239, 1;
+ cvt.s64.s32 %rd2641, %r8239;
+ setp.ne.s32 %p885, %r8239, 6;
+ @%p885 bra $L__BB0_1037;
+
+ st.local.u32 [%rd4], %rd2642;
+ mov.u32 %r5430, 4;
+ sub.s32 %r1356, %r5430, %r1353;
+ mov.u32 %r5431, 6;
+ sub.s32 %r5432, %r5431, %r1353;
+ mul.wide.s32 %rd1697, %r5432, 4;
+ add.s64 %rd1698, %rd1, %rd1697;
+ ld.local.u32 %r8240, [%rd1698];
+ ld.local.u32 %r8241, [%rd1698+-4];
+ and.b32 %r1359, %r1351, 31;
+ setp.eq.s32 %p886, %r1359, 0;
+ @%p886 bra $L__BB0_1040;
+
+ mov.u32 %r5433, 32;
+ sub.s32 %r5434, %r5433, %r1359;
+ shr.u32 %r5435, %r8241, %r5434;
+ shl.b32 %r5436, %r8240, %r1359;
+ add.s32 %r8240, %r5435, %r5436;
+ mul.wide.s32 %rd1699, %r1356, 4;
+ add.s64 %rd1700, %rd1, %rd1699;
+ ld.local.u32 %r5437, [%rd1700];
+ shr.u32 %r5438, %r5437, %r5434;
+ shl.b32 %r5439, %r8241, %r1359;
+ add.s32 %r8241, %r5438, %r5439;
+
+$L__BB0_1040:
+ and.b32 %r5440, %r1350, -2147483648;
+ shr.u32 %r5441, %r8241, 30;
+ shl.b32 %r5442, %r8240, 2;
+ or.b32 %r5443, %r5441, %r5442;
+ shr.u32 %r5444, %r5443, 31;
+ shr.u32 %r5445, %r8240, 30;
+ add.s32 %r5446, %r5444, %r5445;
+ neg.s32 %r5447, %r5446;
+ setp.eq.s32 %p887, %r5440, 0;
+ selp.b32 %r8242, %r5446, %r5447, %p887;
+ setp.ne.s32 %p888, %r5444, 0;
+ xor.b32 %r5448, %r5440, -2147483648;
+ selp.b32 %r5449, %r5448, %r5440, %p888;
+ selp.b32 %r5450, -1, 0, %p888;
+ xor.b32 %r5451, %r5443, %r5450;
+ shl.b32 %r5452, %r8241, 2;
+ xor.b32 %r5453, %r5452, %r5450;
+ cvt.u64.u32 %rd1701, %r5451;
+ cvt.u64.u32 %rd1702, %r5453;
+ bfi.b64 %rd1703, %rd1701, %rd1702, 32, 32;
+ cvt.rn.f64.s64 %fd137, %rd1703;
+ mul.f64 %fd138, %fd137, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3787, %fd138;
+ setp.eq.s32 %p889, %r5449, 0;
+ neg.f32 %f3788, %f3787;
+ selp.f32 %f5670, %f3787, %f3788, %p889;
+
+$L__BB0_1042:
+ and.b32 %r1366, %r8242, 1;
+ setp.eq.s32 %p890, %r1366, 0;
+ selp.f32 %f1212, %f5670, 0f3F800000, %p890;
+ mul.rn.f32 %f1213, %f5670, %f5670;
+ mov.f32 %f5671, 0fB94D4153;
+ @%p890 bra $L__BB0_1044;
+
+ mov.f32 %f3791, 0fBAB607ED;
+ mov.f32 %f3792, 0f37CBAC00;
+ fma.rn.f32 %f5671, %f3792, %f1213, %f3791;
+
+$L__BB0_1044:
+ selp.f32 %f3793, 0f3C0885E4, 0f3D2AAABB, %p890;
+ fma.rn.f32 %f3794, %f5671, %f1213, %f3793;
+ selp.f32 %f3795, 0fBE2AAAA8, 0fBEFFFFFF, %p890;
+ fma.rn.f32 %f3796, %f3794, %f1213, %f3795;
+ mov.f32 %f3797, 0f00000000;
+ fma.rn.f32 %f3798, %f1213, %f1212, %f3797;
+ fma.rn.f32 %f5281, %f3796, %f3798, %f1212;
+ and.b32 %r5455, %r8242, 2;
+ setp.eq.s32 %p892, %r5455, 0;
+ @%p892 bra $L__BB0_1046;
+
+ mov.f32 %f3800, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3800, %f3797;
+
+$L__BB0_1046:
+ setp.lt.s32 %p18, %r11, %r1348;
+ @%p882 bra $L__BB0_1059;
+
+ mul.f32 %f3801, %f5406, 0f3F22F983;
+ cvt.rni.s32.f32 %r8246, %f3801;
+ cvt.rn.f32.s32 %f3802, %r8246;
+ mov.f32 %f3803, 0fBFC90FDA;
+ fma.rn.f32 %f3804, %f3802, %f3803, %f5406;
+ mov.f32 %f3805, 0fB3A22168;
+ fma.rn.f32 %f3806, %f3802, %f3805, %f3804;
+ mov.f32 %f3807, 0fA7C234C5;
+ fma.rn.f32 %f5674, %f3802, %f3807, %f3806;
+ abs.f32 %f1221, %f5406;
+ setp.ltu.f32 %p894, %f1221, 0f47CE4780;
+ @%p894 bra $L__BB0_1055;
+
+ setp.eq.f32 %p895, %f1221, 0f7F800000;
+ @%p895 bra $L__BB0_1054;
+ bra.uni $L__BB0_1049;
+
+$L__BB0_1054:
+ mov.f32 %f3810, 0f00000000;
+ mul.rn.f32 %f5674, %f5406, %f3810;
+ mov.u32 %r8246, 0;
+ bra.uni $L__BB0_1055;
+
+$L__BB0_1049:
+ mov.b32 %r1368, %f5406;
+ shr.u32 %r5457, %r1368, 23;
+ and.b32 %r5458, %r5457, 255;
+ add.s32 %r1369, %r5458, -128;
+ shl.b32 %r5459, %r1368, 8;
+ or.b32 %r1370, %r5459, -2147483648;
+ shr.u32 %r1371, %r1369, 5;
+ mov.u64 %rd2643, 0;
+ mov.u32 %r8243, 0;
+ mov.u64 %rd1707, __cudart_i2opi_f;
+ mov.u64 %rd2644, %rd2643;
+
+$L__BB0_1050:
+ .pragma "nounroll";
+ shl.b64 %rd1706, %rd2643, 2;
+ add.s64 %rd1708, %rd1707, %rd1706;
+ ld.global.nc.u32 %r5460, [%rd1708];
+ mad.wide.u32 %rd1709, %r5460, %r1370, %rd2644;
+ shr.u64 %rd2644, %rd1709, 32;
+ add.s64 %rd1710, %rd1, %rd1706;
+ st.local.u32 [%rd1710], %rd1709;
+ add.s32 %r8243, %r8243, 1;
+ cvt.s64.s32 %rd2643, %r8243;
+ setp.ne.s32 %p896, %r8243, 6;
+ @%p896 bra $L__BB0_1050;
+
+ st.local.u32 [%rd4], %rd2644;
+ mov.u32 %r5461, 4;
+ sub.s32 %r1374, %r5461, %r1371;
+ mov.u32 %r5462, 6;
+ sub.s32 %r5463, %r5462, %r1371;
+ mul.wide.s32 %rd1711, %r5463, 4;
+ add.s64 %rd1712, %rd1, %rd1711;
+ ld.local.u32 %r8244, [%rd1712];
+ ld.local.u32 %r8245, [%rd1712+-4];
+ and.b32 %r1377, %r1369, 31;
+ setp.eq.s32 %p897, %r1377, 0;
+ @%p897 bra $L__BB0_1053;
+
+ mov.u32 %r5464, 32;
+ sub.s32 %r5465, %r5464, %r1377;
+ shr.u32 %r5466, %r8245, %r5465;
+ shl.b32 %r5467, %r8244, %r1377;
+ add.s32 %r8244, %r5466, %r5467;
+ mul.wide.s32 %rd1713, %r1374, 4;
+ add.s64 %rd1714, %rd1, %rd1713;
+ ld.local.u32 %r5468, [%rd1714];
+ shr.u32 %r5469, %r5468, %r5465;
+ shl.b32 %r5470, %r8245, %r1377;
+ add.s32 %r8245, %r5469, %r5470;
+
+$L__BB0_1053:
+ and.b32 %r5471, %r1368, -2147483648;
+ shr.u32 %r5472, %r8245, 30;
+ shl.b32 %r5473, %r8244, 2;
+ or.b32 %r5474, %r5472, %r5473;
+ shr.u32 %r5475, %r5474, 31;
+ shr.u32 %r5476, %r8244, 30;
+ add.s32 %r5477, %r5475, %r5476;
+ neg.s32 %r5478, %r5477;
+ setp.eq.s32 %p898, %r5471, 0;
+ selp.b32 %r8246, %r5477, %r5478, %p898;
+ setp.ne.s32 %p899, %r5475, 0;
+ xor.b32 %r5479, %r5471, -2147483648;
+ selp.b32 %r5480, %r5479, %r5471, %p899;
+ selp.b32 %r5481, -1, 0, %p899;
+ xor.b32 %r5482, %r5474, %r5481;
+ shl.b32 %r5483, %r8245, 2;
+ xor.b32 %r5484, %r5483, %r5481;
+ cvt.u64.u32 %rd1715, %r5482;
+ cvt.u64.u32 %rd1716, %r5484;
+ bfi.b64 %rd1717, %rd1715, %rd1716, 32, 32;
+ cvt.rn.f64.s64 %fd139, %rd1717;
+ mul.f64 %fd140, %fd139, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3808, %fd140;
+ setp.eq.s32 %p900, %r5480, 0;
+ neg.f32 %f3809, %f3808;
+ selp.f32 %f5674, %f3808, %f3809, %p900;
+
+$L__BB0_1055:
+ add.s32 %r1384, %r8246, 1;
+ and.b32 %r1385, %r1384, 1;
+ setp.eq.s32 %p901, %r1385, 0;
+ selp.f32 %f1225, %f5674, 0f3F800000, %p901;
+ mul.rn.f32 %f1226, %f5674, %f5674;
+ mov.f32 %f5675, 0fB94D4153;
+ @%p901 bra $L__BB0_1057;
+
+ mov.f32 %f3812, 0fBAB607ED;
+ mov.f32 %f3813, 0f37CBAC00;
+ fma.rn.f32 %f5675, %f3813, %f1226, %f3812;
+
+$L__BB0_1057:
+ selp.f32 %f3814, 0f3C0885E4, 0f3D2AAABB, %p901;
+ fma.rn.f32 %f3815, %f5675, %f1226, %f3814;
+ selp.f32 %f3816, 0fBE2AAAA8, 0fBEFFFFFF, %p901;
+ fma.rn.f32 %f3817, %f3815, %f1226, %f3816;
+ mov.f32 %f3818, 0f00000000;
+ fma.rn.f32 %f3819, %f1226, %f1225, %f3818;
+ fma.rn.f32 %f5283, %f3817, %f3819, %f1225;
+ and.b32 %r5486, %r1384, 2;
+ setp.eq.s32 %p903, %r5486, 0;
+ @%p903 bra $L__BB0_1059;
+
+ mov.f32 %f3821, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3821, %f3818;
+
+$L__BB0_1059:
+ selp.f32 %f1233, %f5283, %f5284, %p18;
+ selp.f32 %f1234, %f5281, %f5282, %p18;
+ @%p882 bra $L__BB0_1061;
+
+ add.f32 %f5796, %f1234, %f1233;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1061:
+ @%p810 bra $L__BB0_1090;
+
+ shl.b32 %r5487, %r12, 5;
+ mov.u32 %r5488, -32;
+ sub.s32 %r1386, %r5488, %r5487;
+ setp.ge.s32 %p907, %r11, %r1386;
+ @%p907 bra $L__BB0_1075;
+
+ mul.f32 %f3824, %f5413, 0f3F22F983;
+ cvt.rni.s32.f32 %r8250, %f3824;
+ cvt.rn.f32.s32 %f3825, %r8250;
+ mov.f32 %f3826, 0fBFC90FDA;
+ fma.rn.f32 %f3827, %f3825, %f3826, %f5413;
+ mov.f32 %f3828, 0fB3A22168;
+ fma.rn.f32 %f3829, %f3825, %f3828, %f3827;
+ mov.f32 %f3830, 0fA7C234C5;
+ fma.rn.f32 %f5683, %f3825, %f3830, %f3829;
+ abs.f32 %f1242, %f5413;
+ setp.ltu.f32 %p908, %f1242, 0f47CE4780;
+ @%p908 bra $L__BB0_1071;
+
+ setp.eq.f32 %p909, %f1242, 0f7F800000;
+ @%p909 bra $L__BB0_1070;
+ bra.uni $L__BB0_1065;
+
+$L__BB0_1070:
+ mov.f32 %f3833, 0f00000000;
+ mul.rn.f32 %f5683, %f5413, %f3833;
+ mov.u32 %r8250, 0;
+ bra.uni $L__BB0_1071;
+
+$L__BB0_1065:
+ mov.b32 %r1388, %f5413;
+ shr.u32 %r5490, %r1388, 23;
+ and.b32 %r5491, %r5490, 255;
+ add.s32 %r1389, %r5491, -128;
+ shl.b32 %r5492, %r1388, 8;
+ or.b32 %r1390, %r5492, -2147483648;
+ shr.u32 %r1391, %r1389, 5;
+ mov.u64 %rd2645, 0;
+ mov.u32 %r8247, 0;
+ mov.u64 %rd1721, __cudart_i2opi_f;
+ mov.u64 %rd2646, %rd2645;
+
+$L__BB0_1066:
+ .pragma "nounroll";
+ shl.b64 %rd1720, %rd2645, 2;
+ add.s64 %rd1722, %rd1721, %rd1720;
+ ld.global.nc.u32 %r5493, [%rd1722];
+ mad.wide.u32 %rd1723, %r5493, %r1390, %rd2646;
+ shr.u64 %rd2646, %rd1723, 32;
+ add.s64 %rd1724, %rd1, %rd1720;
+ st.local.u32 [%rd1724], %rd1723;
+ add.s32 %r8247, %r8247, 1;
+ cvt.s64.s32 %rd2645, %r8247;
+ setp.ne.s32 %p910, %r8247, 6;
+ @%p910 bra $L__BB0_1066;
+
+ st.local.u32 [%rd4], %rd2646;
+ mov.u32 %r5494, 4;
+ sub.s32 %r1394, %r5494, %r1391;
+ mov.u32 %r5495, 6;
+ sub.s32 %r5496, %r5495, %r1391;
+ mul.wide.s32 %rd1725, %r5496, 4;
+ add.s64 %rd1726, %rd1, %rd1725;
+ ld.local.u32 %r8248, [%rd1726];
+ ld.local.u32 %r8249, [%rd1726+-4];
+ and.b32 %r1397, %r1389, 31;
+ setp.eq.s32 %p911, %r1397, 0;
+ @%p911 bra $L__BB0_1069;
+
+ mov.u32 %r5497, 32;
+ sub.s32 %r5498, %r5497, %r1397;
+ shr.u32 %r5499, %r8249, %r5498;
+ shl.b32 %r5500, %r8248, %r1397;
+ add.s32 %r8248, %r5499, %r5500;
+ mul.wide.s32 %rd1727, %r1394, 4;
+ add.s64 %rd1728, %rd1, %rd1727;
+ ld.local.u32 %r5501, [%rd1728];
+ shr.u32 %r5502, %r5501, %r5498;
+ shl.b32 %r5503, %r8249, %r1397;
+ add.s32 %r8249, %r5502, %r5503;
+
+$L__BB0_1069:
+ and.b32 %r5504, %r1388, -2147483648;
+ shr.u32 %r5505, %r8249, 30;
+ shl.b32 %r5506, %r8248, 2;
+ or.b32 %r5507, %r5505, %r5506;
+ shr.u32 %r5508, %r5507, 31;
+ shr.u32 %r5509, %r8248, 30;
+ add.s32 %r5510, %r5508, %r5509;
+ neg.s32 %r5511, %r5510;
+ setp.eq.s32 %p912, %r5504, 0;
+ selp.b32 %r8250, %r5510, %r5511, %p912;
+ setp.ne.s32 %p913, %r5508, 0;
+ xor.b32 %r5512, %r5504, -2147483648;
+ selp.b32 %r5513, %r5512, %r5504, %p913;
+ selp.b32 %r5514, -1, 0, %p913;
+ xor.b32 %r5515, %r5507, %r5514;
+ shl.b32 %r5516, %r8249, 2;
+ xor.b32 %r5517, %r5516, %r5514;
+ cvt.u64.u32 %rd1729, %r5515;
+ cvt.u64.u32 %rd1730, %r5517;
+ bfi.b64 %rd1731, %rd1729, %rd1730, 32, 32;
+ cvt.rn.f64.s64 %fd141, %rd1731;
+ mul.f64 %fd142, %fd141, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3831, %fd142;
+ setp.eq.s32 %p914, %r5513, 0;
+ neg.f32 %f3832, %f3831;
+ selp.f32 %f5683, %f3831, %f3832, %p914;
+
+$L__BB0_1071:
+ and.b32 %r1404, %r8250, 1;
+ setp.eq.s32 %p915, %r1404, 0;
+ selp.f32 %f1246, %f5683, 0f3F800000, %p915;
+ mul.rn.f32 %f1247, %f5683, %f5683;
+ mov.f32 %f5684, 0fB94D4153;
+ @%p915 bra $L__BB0_1073;
+
+ mov.f32 %f3835, 0fBAB607ED;
+ mov.f32 %f3836, 0f37CBAC00;
+ fma.rn.f32 %f5684, %f3836, %f1247, %f3835;
+
+$L__BB0_1073:
+ selp.f32 %f3837, 0f3C0885E4, 0f3D2AAABB, %p915;
+ fma.rn.f32 %f3838, %f5684, %f1247, %f3837;
+ selp.f32 %f3839, 0fBE2AAAA8, 0fBEFFFFFF, %p915;
+ fma.rn.f32 %f3840, %f3838, %f1247, %f3839;
+ mov.f32 %f3841, 0f00000000;
+ fma.rn.f32 %f3842, %f1247, %f1246, %f3841;
+ fma.rn.f32 %f5281, %f3840, %f3842, %f1246;
+ and.b32 %r5519, %r8250, 2;
+ setp.eq.s32 %p917, %r5519, 0;
+ @%p917 bra $L__BB0_1075;
+
+ mov.f32 %f3844, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3844, %f3841;
+
+$L__BB0_1075:
+ setp.lt.s32 %p19, %r11, %r1386;
+ @%p907 bra $L__BB0_1088;
+
+ mul.f32 %f3845, %f5405, 0f3F22F983;
+ cvt.rni.s32.f32 %r8254, %f3845;
+ cvt.rn.f32.s32 %f3846, %r8254;
+ mov.f32 %f3847, 0fBFC90FDA;
+ fma.rn.f32 %f3848, %f3846, %f3847, %f5405;
+ mov.f32 %f3849, 0fB3A22168;
+ fma.rn.f32 %f3850, %f3846, %f3849, %f3848;
+ mov.f32 %f3851, 0fA7C234C5;
+ fma.rn.f32 %f5687, %f3846, %f3851, %f3850;
+ abs.f32 %f1255, %f5405;
+ setp.ltu.f32 %p919, %f1255, 0f47CE4780;
+ @%p919 bra $L__BB0_1084;
+
+ setp.eq.f32 %p920, %f1255, 0f7F800000;
+ @%p920 bra $L__BB0_1083;
+ bra.uni $L__BB0_1078;
+
+$L__BB0_1083:
+ mov.f32 %f3854, 0f00000000;
+ mul.rn.f32 %f5687, %f5405, %f3854;
+ mov.u32 %r8254, 0;
+ bra.uni $L__BB0_1084;
+
+$L__BB0_1078:
+ mov.b32 %r1406, %f5405;
+ shr.u32 %r5521, %r1406, 23;
+ and.b32 %r5522, %r5521, 255;
+ add.s32 %r1407, %r5522, -128;
+ shl.b32 %r5523, %r1406, 8;
+ or.b32 %r1408, %r5523, -2147483648;
+ shr.u32 %r1409, %r1407, 5;
+ mov.u64 %rd2647, 0;
+ mov.u32 %r8251, 0;
+ mov.u64 %rd1735, __cudart_i2opi_f;
+ mov.u64 %rd2648, %rd2647;
+
+$L__BB0_1079:
+ .pragma "nounroll";
+ shl.b64 %rd1734, %rd2647, 2;
+ add.s64 %rd1736, %rd1735, %rd1734;
+ ld.global.nc.u32 %r5524, [%rd1736];
+ mad.wide.u32 %rd1737, %r5524, %r1408, %rd2648;
+ shr.u64 %rd2648, %rd1737, 32;
+ add.s64 %rd1738, %rd1, %rd1734;
+ st.local.u32 [%rd1738], %rd1737;
+ add.s32 %r8251, %r8251, 1;
+ cvt.s64.s32 %rd2647, %r8251;
+ setp.ne.s32 %p921, %r8251, 6;
+ @%p921 bra $L__BB0_1079;
+
+ st.local.u32 [%rd4], %rd2648;
+ mov.u32 %r5525, 4;
+ sub.s32 %r1412, %r5525, %r1409;
+ mov.u32 %r5526, 6;
+ sub.s32 %r5527, %r5526, %r1409;
+ mul.wide.s32 %rd1739, %r5527, 4;
+ add.s64 %rd1740, %rd1, %rd1739;
+ ld.local.u32 %r8252, [%rd1740];
+ ld.local.u32 %r8253, [%rd1740+-4];
+ and.b32 %r1415, %r1407, 31;
+ setp.eq.s32 %p922, %r1415, 0;
+ @%p922 bra $L__BB0_1082;
+
+ mov.u32 %r5528, 32;
+ sub.s32 %r5529, %r5528, %r1415;
+ shr.u32 %r5530, %r8253, %r5529;
+ shl.b32 %r5531, %r8252, %r1415;
+ add.s32 %r8252, %r5530, %r5531;
+ mul.wide.s32 %rd1741, %r1412, 4;
+ add.s64 %rd1742, %rd1, %rd1741;
+ ld.local.u32 %r5532, [%rd1742];
+ shr.u32 %r5533, %r5532, %r5529;
+ shl.b32 %r5534, %r8253, %r1415;
+ add.s32 %r8253, %r5533, %r5534;
+
+$L__BB0_1082:
+ and.b32 %r5535, %r1406, -2147483648;
+ shr.u32 %r5536, %r8253, 30;
+ shl.b32 %r5537, %r8252, 2;
+ or.b32 %r5538, %r5536, %r5537;
+ shr.u32 %r5539, %r5538, 31;
+ shr.u32 %r5540, %r8252, 30;
+ add.s32 %r5541, %r5539, %r5540;
+ neg.s32 %r5542, %r5541;
+ setp.eq.s32 %p923, %r5535, 0;
+ selp.b32 %r8254, %r5541, %r5542, %p923;
+ setp.ne.s32 %p924, %r5539, 0;
+ xor.b32 %r5543, %r5535, -2147483648;
+ selp.b32 %r5544, %r5543, %r5535, %p924;
+ selp.b32 %r5545, -1, 0, %p924;
+ xor.b32 %r5546, %r5538, %r5545;
+ shl.b32 %r5547, %r8253, 2;
+ xor.b32 %r5548, %r5547, %r5545;
+ cvt.u64.u32 %rd1743, %r5546;
+ cvt.u64.u32 %rd1744, %r5548;
+ bfi.b64 %rd1745, %rd1743, %rd1744, 32, 32;
+ cvt.rn.f64.s64 %fd143, %rd1745;
+ mul.f64 %fd144, %fd143, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3852, %fd144;
+ setp.eq.s32 %p925, %r5544, 0;
+ neg.f32 %f3853, %f3852;
+ selp.f32 %f5687, %f3852, %f3853, %p925;
+
+$L__BB0_1084:
+ add.s32 %r1422, %r8254, 1;
+ and.b32 %r1423, %r1422, 1;
+ setp.eq.s32 %p926, %r1423, 0;
+ selp.f32 %f1259, %f5687, 0f3F800000, %p926;
+ mul.rn.f32 %f1260, %f5687, %f5687;
+ mov.f32 %f5688, 0fB94D4153;
+ @%p926 bra $L__BB0_1086;
+
+ mov.f32 %f3856, 0fBAB607ED;
+ mov.f32 %f3857, 0f37CBAC00;
+ fma.rn.f32 %f5688, %f3857, %f1260, %f3856;
+
+$L__BB0_1086:
+ selp.f32 %f3858, 0f3C0885E4, 0f3D2AAABB, %p926;
+ fma.rn.f32 %f3859, %f5688, %f1260, %f3858;
+ selp.f32 %f3860, 0fBE2AAAA8, 0fBEFFFFFF, %p926;
+ fma.rn.f32 %f3861, %f3859, %f1260, %f3860;
+ mov.f32 %f3862, 0f00000000;
+ fma.rn.f32 %f3863, %f1260, %f1259, %f3862;
+ fma.rn.f32 %f5283, %f3861, %f3863, %f1259;
+ and.b32 %r5550, %r1422, 2;
+ setp.eq.s32 %p928, %r5550, 0;
+ @%p928 bra $L__BB0_1088;
+
+ mov.f32 %f3865, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3865, %f3862;
+
+$L__BB0_1088:
+ selp.f32 %f1267, %f5283, %f5284, %p19;
+ selp.f32 %f1268, %f5281, %f5282, %p19;
+ @%p907 bra $L__BB0_1090;
+
+ add.f32 %f5795, %f1268, %f1267;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1090:
+ @%p813 bra $L__BB0_1119;
+
+ shl.b32 %r5551, %r12, 5;
+ neg.s32 %r1424, %r5551;
+ setp.ge.s32 %p932, %r11, %r1424;
+ @%p932 bra $L__BB0_1104;
+
+ mul.f32 %f3868, %f5412, 0f3F22F983;
+ cvt.rni.s32.f32 %r8258, %f3868;
+ cvt.rn.f32.s32 %f3869, %r8258;
+ mov.f32 %f3870, 0fBFC90FDA;
+ fma.rn.f32 %f3871, %f3869, %f3870, %f5412;
+ mov.f32 %f3872, 0fB3A22168;
+ fma.rn.f32 %f3873, %f3869, %f3872, %f3871;
+ mov.f32 %f3874, 0fA7C234C5;
+ fma.rn.f32 %f5696, %f3869, %f3874, %f3873;
+ abs.f32 %f1276, %f5412;
+ setp.ltu.f32 %p933, %f1276, 0f47CE4780;
+ @%p933 bra $L__BB0_1100;
+
+ setp.eq.f32 %p934, %f1276, 0f7F800000;
+ @%p934 bra $L__BB0_1099;
+ bra.uni $L__BB0_1094;
+
+$L__BB0_1099:
+ mov.f32 %f3877, 0f00000000;
+ mul.rn.f32 %f5696, %f5412, %f3877;
+ mov.u32 %r8258, 0;
+ bra.uni $L__BB0_1100;
+
+$L__BB0_1094:
+ mov.b32 %r1426, %f5412;
+ shr.u32 %r5553, %r1426, 23;
+ and.b32 %r5554, %r5553, 255;
+ add.s32 %r1427, %r5554, -128;
+ shl.b32 %r5555, %r1426, 8;
+ or.b32 %r1428, %r5555, -2147483648;
+ shr.u32 %r1429, %r1427, 5;
+ mov.u64 %rd2649, 0;
+ mov.u32 %r8255, 0;
+ mov.u64 %rd1749, __cudart_i2opi_f;
+ mov.u64 %rd2650, %rd2649;
+
+$L__BB0_1095:
+ .pragma "nounroll";
+ shl.b64 %rd1748, %rd2649, 2;
+ add.s64 %rd1750, %rd1749, %rd1748;
+ ld.global.nc.u32 %r5556, [%rd1750];
+ mad.wide.u32 %rd1751, %r5556, %r1428, %rd2650;
+ shr.u64 %rd2650, %rd1751, 32;
+ add.s64 %rd1752, %rd1, %rd1748;
+ st.local.u32 [%rd1752], %rd1751;
+ add.s32 %r8255, %r8255, 1;
+ cvt.s64.s32 %rd2649, %r8255;
+ setp.ne.s32 %p935, %r8255, 6;
+ @%p935 bra $L__BB0_1095;
+
+ st.local.u32 [%rd4], %rd2650;
+ mov.u32 %r5557, 4;
+ sub.s32 %r1432, %r5557, %r1429;
+ mov.u32 %r5558, 6;
+ sub.s32 %r5559, %r5558, %r1429;
+ mul.wide.s32 %rd1753, %r5559, 4;
+ add.s64 %rd1754, %rd1, %rd1753;
+ ld.local.u32 %r8256, [%rd1754];
+ ld.local.u32 %r8257, [%rd1754+-4];
+ and.b32 %r1435, %r1427, 31;
+ setp.eq.s32 %p936, %r1435, 0;
+ @%p936 bra $L__BB0_1098;
+
+ mov.u32 %r5560, 32;
+ sub.s32 %r5561, %r5560, %r1435;
+ shr.u32 %r5562, %r8257, %r5561;
+ shl.b32 %r5563, %r8256, %r1435;
+ add.s32 %r8256, %r5562, %r5563;
+ mul.wide.s32 %rd1755, %r1432, 4;
+ add.s64 %rd1756, %rd1, %rd1755;
+ ld.local.u32 %r5564, [%rd1756];
+ shr.u32 %r5565, %r5564, %r5561;
+ shl.b32 %r5566, %r8257, %r1435;
+ add.s32 %r8257, %r5565, %r5566;
+
+$L__BB0_1098:
+ and.b32 %r5567, %r1426, -2147483648;
+ shr.u32 %r5568, %r8257, 30;
+ shl.b32 %r5569, %r8256, 2;
+ or.b32 %r5570, %r5568, %r5569;
+ shr.u32 %r5571, %r5570, 31;
+ shr.u32 %r5572, %r8256, 30;
+ add.s32 %r5573, %r5571, %r5572;
+ neg.s32 %r5574, %r5573;
+ setp.eq.s32 %p937, %r5567, 0;
+ selp.b32 %r8258, %r5573, %r5574, %p937;
+ setp.ne.s32 %p938, %r5571, 0;
+ xor.b32 %r5575, %r5567, -2147483648;
+ selp.b32 %r5576, %r5575, %r5567, %p938;
+ selp.b32 %r5577, -1, 0, %p938;
+ xor.b32 %r5578, %r5570, %r5577;
+ shl.b32 %r5579, %r8257, 2;
+ xor.b32 %r5580, %r5579, %r5577;
+ cvt.u64.u32 %rd1757, %r5578;
+ cvt.u64.u32 %rd1758, %r5580;
+ bfi.b64 %rd1759, %rd1757, %rd1758, 32, 32;
+ cvt.rn.f64.s64 %fd145, %rd1759;
+ mul.f64 %fd146, %fd145, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3875, %fd146;
+ setp.eq.s32 %p939, %r5576, 0;
+ neg.f32 %f3876, %f3875;
+ selp.f32 %f5696, %f3875, %f3876, %p939;
+
+$L__BB0_1100:
+ and.b32 %r1442, %r8258, 1;
+ setp.eq.s32 %p940, %r1442, 0;
+ selp.f32 %f1280, %f5696, 0f3F800000, %p940;
+ mul.rn.f32 %f1281, %f5696, %f5696;
+ mov.f32 %f5697, 0fB94D4153;
+ @%p940 bra $L__BB0_1102;
+
+ mov.f32 %f3879, 0fBAB607ED;
+ mov.f32 %f3880, 0f37CBAC00;
+ fma.rn.f32 %f5697, %f3880, %f1281, %f3879;
+
+$L__BB0_1102:
+ selp.f32 %f3881, 0f3C0885E4, 0f3D2AAABB, %p940;
+ fma.rn.f32 %f3882, %f5697, %f1281, %f3881;
+ selp.f32 %f3883, 0fBE2AAAA8, 0fBEFFFFFF, %p940;
+ fma.rn.f32 %f3884, %f3882, %f1281, %f3883;
+ mov.f32 %f3885, 0f00000000;
+ fma.rn.f32 %f3886, %f1281, %f1280, %f3885;
+ fma.rn.f32 %f5281, %f3884, %f3886, %f1280;
+ and.b32 %r5582, %r8258, 2;
+ setp.eq.s32 %p942, %r5582, 0;
+ @%p942 bra $L__BB0_1104;
+
+ mov.f32 %f3888, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3888, %f3885;
+
+$L__BB0_1104:
+ setp.lt.s32 %p20, %r11, %r1424;
+ @%p932 bra $L__BB0_1117;
+
+ mul.f32 %f3889, %f5404, 0f3F22F983;
+ cvt.rni.s32.f32 %r8262, %f3889;
+ cvt.rn.f32.s32 %f3890, %r8262;
+ mov.f32 %f3891, 0fBFC90FDA;
+ fma.rn.f32 %f3892, %f3890, %f3891, %f5404;
+ mov.f32 %f3893, 0fB3A22168;
+ fma.rn.f32 %f3894, %f3890, %f3893, %f3892;
+ mov.f32 %f3895, 0fA7C234C5;
+ fma.rn.f32 %f5700, %f3890, %f3895, %f3894;
+ abs.f32 %f1289, %f5404;
+ setp.ltu.f32 %p944, %f1289, 0f47CE4780;
+ @%p944 bra $L__BB0_1113;
+
+ setp.eq.f32 %p945, %f1289, 0f7F800000;
+ @%p945 bra $L__BB0_1112;
+ bra.uni $L__BB0_1107;
+
+$L__BB0_1112:
+ mov.f32 %f3898, 0f00000000;
+ mul.rn.f32 %f5700, %f5404, %f3898;
+ mov.u32 %r8262, 0;
+ bra.uni $L__BB0_1113;
+
+$L__BB0_1107:
+ mov.b32 %r1444, %f5404;
+ shr.u32 %r5584, %r1444, 23;
+ and.b32 %r5585, %r5584, 255;
+ add.s32 %r1445, %r5585, -128;
+ shl.b32 %r5586, %r1444, 8;
+ or.b32 %r1446, %r5586, -2147483648;
+ shr.u32 %r1447, %r1445, 5;
+ mov.u64 %rd2651, 0;
+ mov.u32 %r8259, 0;
+ mov.u64 %rd1763, __cudart_i2opi_f;
+ mov.u64 %rd2652, %rd2651;
+
+$L__BB0_1108:
+ .pragma "nounroll";
+ shl.b64 %rd1762, %rd2651, 2;
+ add.s64 %rd1764, %rd1763, %rd1762;
+ ld.global.nc.u32 %r5587, [%rd1764];
+ mad.wide.u32 %rd1765, %r5587, %r1446, %rd2652;
+ shr.u64 %rd2652, %rd1765, 32;
+ add.s64 %rd1766, %rd1, %rd1762;
+ st.local.u32 [%rd1766], %rd1765;
+ add.s32 %r8259, %r8259, 1;
+ cvt.s64.s32 %rd2651, %r8259;
+ setp.ne.s32 %p946, %r8259, 6;
+ @%p946 bra $L__BB0_1108;
+
+ st.local.u32 [%rd4], %rd2652;
+ mov.u32 %r5588, 4;
+ sub.s32 %r1450, %r5588, %r1447;
+ mov.u32 %r5589, 6;
+ sub.s32 %r5590, %r5589, %r1447;
+ mul.wide.s32 %rd1767, %r5590, 4;
+ add.s64 %rd1768, %rd1, %rd1767;
+ ld.local.u32 %r8260, [%rd1768];
+ ld.local.u32 %r8261, [%rd1768+-4];
+ and.b32 %r1453, %r1445, 31;
+ setp.eq.s32 %p947, %r1453, 0;
+ @%p947 bra $L__BB0_1111;
+
+ mov.u32 %r5591, 32;
+ sub.s32 %r5592, %r5591, %r1453;
+ shr.u32 %r5593, %r8261, %r5592;
+ shl.b32 %r5594, %r8260, %r1453;
+ add.s32 %r8260, %r5593, %r5594;
+ mul.wide.s32 %rd1769, %r1450, 4;
+ add.s64 %rd1770, %rd1, %rd1769;
+ ld.local.u32 %r5595, [%rd1770];
+ shr.u32 %r5596, %r5595, %r5592;
+ shl.b32 %r5597, %r8261, %r1453;
+ add.s32 %r8261, %r5596, %r5597;
+
+$L__BB0_1111:
+ and.b32 %r5598, %r1444, -2147483648;
+ shr.u32 %r5599, %r8261, 30;
+ shl.b32 %r5600, %r8260, 2;
+ or.b32 %r5601, %r5599, %r5600;
+ shr.u32 %r5602, %r5601, 31;
+ shr.u32 %r5603, %r8260, 30;
+ add.s32 %r5604, %r5602, %r5603;
+ neg.s32 %r5605, %r5604;
+ setp.eq.s32 %p948, %r5598, 0;
+ selp.b32 %r8262, %r5604, %r5605, %p948;
+ setp.ne.s32 %p949, %r5602, 0;
+ xor.b32 %r5606, %r5598, -2147483648;
+ selp.b32 %r5607, %r5606, %r5598, %p949;
+ selp.b32 %r5608, -1, 0, %p949;
+ xor.b32 %r5609, %r5601, %r5608;
+ shl.b32 %r5610, %r8261, 2;
+ xor.b32 %r5611, %r5610, %r5608;
+ cvt.u64.u32 %rd1771, %r5609;
+ cvt.u64.u32 %rd1772, %r5611;
+ bfi.b64 %rd1773, %rd1771, %rd1772, 32, 32;
+ cvt.rn.f64.s64 %fd147, %rd1773;
+ mul.f64 %fd148, %fd147, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3896, %fd148;
+ setp.eq.s32 %p950, %r5607, 0;
+ neg.f32 %f3897, %f3896;
+ selp.f32 %f5700, %f3896, %f3897, %p950;
+
+$L__BB0_1113:
+ add.s32 %r1460, %r8262, 1;
+ and.b32 %r1461, %r1460, 1;
+ setp.eq.s32 %p951, %r1461, 0;
+ selp.f32 %f1293, %f5700, 0f3F800000, %p951;
+ mul.rn.f32 %f1294, %f5700, %f5700;
+ mov.f32 %f5701, 0fB94D4153;
+ @%p951 bra $L__BB0_1115;
+
+ mov.f32 %f3900, 0fBAB607ED;
+ mov.f32 %f3901, 0f37CBAC00;
+ fma.rn.f32 %f5701, %f3901, %f1294, %f3900;
+
+$L__BB0_1115:
+ selp.f32 %f3902, 0f3C0885E4, 0f3D2AAABB, %p951;
+ fma.rn.f32 %f3903, %f5701, %f1294, %f3902;
+ selp.f32 %f3904, 0fBE2AAAA8, 0fBEFFFFFF, %p951;
+ fma.rn.f32 %f3905, %f3903, %f1294, %f3904;
+ mov.f32 %f3906, 0f00000000;
+ fma.rn.f32 %f3907, %f1294, %f1293, %f3906;
+ fma.rn.f32 %f5283, %f3905, %f3907, %f1293;
+ and.b32 %r5613, %r1460, 2;
+ setp.eq.s32 %p953, %r5613, 0;
+ @%p953 bra $L__BB0_1117;
+
+ mov.f32 %f3909, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3909, %f3906;
+
+$L__BB0_1117:
+ selp.f32 %f1301, %f5283, %f5284, %p20;
+ selp.f32 %f1302, %f5281, %f5282, %p20;
+ @%p932 bra $L__BB0_1119;
+
+ add.f32 %f5794, %f1302, %f1301;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1119:
+ @%p813 bra $L__BB0_1148;
+
+ shl.b32 %r5614, %r12, 5;
+ mov.u32 %r5615, -32;
+ sub.s32 %r1462, %r5615, %r5614;
+ setp.ge.s32 %p957, %r11, %r1462;
+ @%p957 bra $L__BB0_1133;
+
+ mul.f32 %f3912, %f5411, 0f3F22F983;
+ cvt.rni.s32.f32 %r8266, %f3912;
+ cvt.rn.f32.s32 %f3913, %r8266;
+ mov.f32 %f3914, 0fBFC90FDA;
+ fma.rn.f32 %f3915, %f3913, %f3914, %f5411;
+ mov.f32 %f3916, 0fB3A22168;
+ fma.rn.f32 %f3917, %f3913, %f3916, %f3915;
+ mov.f32 %f3918, 0fA7C234C5;
+ fma.rn.f32 %f5709, %f3913, %f3918, %f3917;
+ abs.f32 %f1310, %f5411;
+ setp.ltu.f32 %p958, %f1310, 0f47CE4780;
+ @%p958 bra $L__BB0_1129;
+
+ setp.eq.f32 %p959, %f1310, 0f7F800000;
+ @%p959 bra $L__BB0_1128;
+ bra.uni $L__BB0_1123;
+
+$L__BB0_1128:
+ mov.f32 %f3921, 0f00000000;
+ mul.rn.f32 %f5709, %f5411, %f3921;
+ mov.u32 %r8266, 0;
+ bra.uni $L__BB0_1129;
+
+$L__BB0_1123:
+ mov.b32 %r1464, %f5411;
+ shr.u32 %r5617, %r1464, 23;
+ and.b32 %r5618, %r5617, 255;
+ add.s32 %r1465, %r5618, -128;
+ shl.b32 %r5619, %r1464, 8;
+ or.b32 %r1466, %r5619, -2147483648;
+ shr.u32 %r1467, %r1465, 5;
+ mov.u64 %rd2653, 0;
+ mov.u32 %r8263, 0;
+ mov.u64 %rd1777, __cudart_i2opi_f;
+ mov.u64 %rd2654, %rd2653;
+
+$L__BB0_1124:
+ .pragma "nounroll";
+ shl.b64 %rd1776, %rd2653, 2;
+ add.s64 %rd1778, %rd1777, %rd1776;
+ ld.global.nc.u32 %r5620, [%rd1778];
+ mad.wide.u32 %rd1779, %r5620, %r1466, %rd2654;
+ shr.u64 %rd2654, %rd1779, 32;
+ add.s64 %rd1780, %rd1, %rd1776;
+ st.local.u32 [%rd1780], %rd1779;
+ add.s32 %r8263, %r8263, 1;
+ cvt.s64.s32 %rd2653, %r8263;
+ setp.ne.s32 %p960, %r8263, 6;
+ @%p960 bra $L__BB0_1124;
+
+ st.local.u32 [%rd4], %rd2654;
+ mov.u32 %r5621, 4;
+ sub.s32 %r1470, %r5621, %r1467;
+ mov.u32 %r5622, 6;
+ sub.s32 %r5623, %r5622, %r1467;
+ mul.wide.s32 %rd1781, %r5623, 4;
+ add.s64 %rd1782, %rd1, %rd1781;
+ ld.local.u32 %r8264, [%rd1782];
+ ld.local.u32 %r8265, [%rd1782+-4];
+ and.b32 %r1473, %r1465, 31;
+ setp.eq.s32 %p961, %r1473, 0;
+ @%p961 bra $L__BB0_1127;
+
+ mov.u32 %r5624, 32;
+ sub.s32 %r5625, %r5624, %r1473;
+ shr.u32 %r5626, %r8265, %r5625;
+ shl.b32 %r5627, %r8264, %r1473;
+ add.s32 %r8264, %r5626, %r5627;
+ mul.wide.s32 %rd1783, %r1470, 4;
+ add.s64 %rd1784, %rd1, %rd1783;
+ ld.local.u32 %r5628, [%rd1784];
+ shr.u32 %r5629, %r5628, %r5625;
+ shl.b32 %r5630, %r8265, %r1473;
+ add.s32 %r8265, %r5629, %r5630;
+
+$L__BB0_1127:
+ and.b32 %r5631, %r1464, -2147483648;
+ shr.u32 %r5632, %r8265, 30;
+ shl.b32 %r5633, %r8264, 2;
+ or.b32 %r5634, %r5632, %r5633;
+ shr.u32 %r5635, %r5634, 31;
+ shr.u32 %r5636, %r8264, 30;
+ add.s32 %r5637, %r5635, %r5636;
+ neg.s32 %r5638, %r5637;
+ setp.eq.s32 %p962, %r5631, 0;
+ selp.b32 %r8266, %r5637, %r5638, %p962;
+ setp.ne.s32 %p963, %r5635, 0;
+ xor.b32 %r5639, %r5631, -2147483648;
+ selp.b32 %r5640, %r5639, %r5631, %p963;
+ selp.b32 %r5641, -1, 0, %p963;
+ xor.b32 %r5642, %r5634, %r5641;
+ shl.b32 %r5643, %r8265, 2;
+ xor.b32 %r5644, %r5643, %r5641;
+ cvt.u64.u32 %rd1785, %r5642;
+ cvt.u64.u32 %rd1786, %r5644;
+ bfi.b64 %rd1787, %rd1785, %rd1786, 32, 32;
+ cvt.rn.f64.s64 %fd149, %rd1787;
+ mul.f64 %fd150, %fd149, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3919, %fd150;
+ setp.eq.s32 %p964, %r5640, 0;
+ neg.f32 %f3920, %f3919;
+ selp.f32 %f5709, %f3919, %f3920, %p964;
+
+$L__BB0_1129:
+ and.b32 %r1480, %r8266, 1;
+ setp.eq.s32 %p965, %r1480, 0;
+ selp.f32 %f1314, %f5709, 0f3F800000, %p965;
+ mul.rn.f32 %f1315, %f5709, %f5709;
+ mov.f32 %f5710, 0fB94D4153;
+ @%p965 bra $L__BB0_1131;
+
+ mov.f32 %f3923, 0fBAB607ED;
+ mov.f32 %f3924, 0f37CBAC00;
+ fma.rn.f32 %f5710, %f3924, %f1315, %f3923;
+
+$L__BB0_1131:
+ selp.f32 %f3925, 0f3C0885E4, 0f3D2AAABB, %p965;
+ fma.rn.f32 %f3926, %f5710, %f1315, %f3925;
+ selp.f32 %f3927, 0fBE2AAAA8, 0fBEFFFFFF, %p965;
+ fma.rn.f32 %f3928, %f3926, %f1315, %f3927;
+ mov.f32 %f3929, 0f00000000;
+ fma.rn.f32 %f3930, %f1315, %f1314, %f3929;
+ fma.rn.f32 %f5281, %f3928, %f3930, %f1314;
+ and.b32 %r5646, %r8266, 2;
+ setp.eq.s32 %p967, %r5646, 0;
+ @%p967 bra $L__BB0_1133;
+
+ mov.f32 %f3932, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3932, %f3929;
+
+$L__BB0_1133:
+ setp.lt.s32 %p21, %r11, %r1462;
+ @%p957 bra $L__BB0_1146;
+
+ mul.f32 %f3933, %f5403, 0f3F22F983;
+ cvt.rni.s32.f32 %r8270, %f3933;
+ cvt.rn.f32.s32 %f3934, %r8270;
+ mov.f32 %f3935, 0fBFC90FDA;
+ fma.rn.f32 %f3936, %f3934, %f3935, %f5403;
+ mov.f32 %f3937, 0fB3A22168;
+ fma.rn.f32 %f3938, %f3934, %f3937, %f3936;
+ mov.f32 %f3939, 0fA7C234C5;
+ fma.rn.f32 %f5713, %f3934, %f3939, %f3938;
+ abs.f32 %f1323, %f5403;
+ setp.ltu.f32 %p969, %f1323, 0f47CE4780;
+ @%p969 bra $L__BB0_1142;
+
+ setp.eq.f32 %p970, %f1323, 0f7F800000;
+ @%p970 bra $L__BB0_1141;
+ bra.uni $L__BB0_1136;
+
+$L__BB0_1141:
+ mov.f32 %f3942, 0f00000000;
+ mul.rn.f32 %f5713, %f5403, %f3942;
+ mov.u32 %r8270, 0;
+ bra.uni $L__BB0_1142;
+
+$L__BB0_1136:
+ mov.b32 %r1482, %f5403;
+ shr.u32 %r5648, %r1482, 23;
+ and.b32 %r5649, %r5648, 255;
+ add.s32 %r1483, %r5649, -128;
+ shl.b32 %r5650, %r1482, 8;
+ or.b32 %r1484, %r5650, -2147483648;
+ shr.u32 %r1485, %r1483, 5;
+ mov.u64 %rd2655, 0;
+ mov.u32 %r8267, 0;
+ mov.u64 %rd1791, __cudart_i2opi_f;
+ mov.u64 %rd2656, %rd2655;
+
+$L__BB0_1137:
+ .pragma "nounroll";
+ shl.b64 %rd1790, %rd2655, 2;
+ add.s64 %rd1792, %rd1791, %rd1790;
+ ld.global.nc.u32 %r5651, [%rd1792];
+ mad.wide.u32 %rd1793, %r5651, %r1484, %rd2656;
+ shr.u64 %rd2656, %rd1793, 32;
+ add.s64 %rd1794, %rd1, %rd1790;
+ st.local.u32 [%rd1794], %rd1793;
+ add.s32 %r8267, %r8267, 1;
+ cvt.s64.s32 %rd2655, %r8267;
+ setp.ne.s32 %p971, %r8267, 6;
+ @%p971 bra $L__BB0_1137;
+
+ st.local.u32 [%rd4], %rd2656;
+ mov.u32 %r5652, 4;
+ sub.s32 %r1488, %r5652, %r1485;
+ mov.u32 %r5653, 6;
+ sub.s32 %r5654, %r5653, %r1485;
+ mul.wide.s32 %rd1795, %r5654, 4;
+ add.s64 %rd1796, %rd1, %rd1795;
+ ld.local.u32 %r8268, [%rd1796];
+ ld.local.u32 %r8269, [%rd1796+-4];
+ and.b32 %r1491, %r1483, 31;
+ setp.eq.s32 %p972, %r1491, 0;
+ @%p972 bra $L__BB0_1140;
+
+ mov.u32 %r5655, 32;
+ sub.s32 %r5656, %r5655, %r1491;
+ shr.u32 %r5657, %r8269, %r5656;
+ shl.b32 %r5658, %r8268, %r1491;
+ add.s32 %r8268, %r5657, %r5658;
+ mul.wide.s32 %rd1797, %r1488, 4;
+ add.s64 %rd1798, %rd1, %rd1797;
+ ld.local.u32 %r5659, [%rd1798];
+ shr.u32 %r5660, %r5659, %r5656;
+ shl.b32 %r5661, %r8269, %r1491;
+ add.s32 %r8269, %r5660, %r5661;
+
+$L__BB0_1140:
+ and.b32 %r5662, %r1482, -2147483648;
+ shr.u32 %r5663, %r8269, 30;
+ shl.b32 %r5664, %r8268, 2;
+ or.b32 %r5665, %r5663, %r5664;
+ shr.u32 %r5666, %r5665, 31;
+ shr.u32 %r5667, %r8268, 30;
+ add.s32 %r5668, %r5666, %r5667;
+ neg.s32 %r5669, %r5668;
+ setp.eq.s32 %p973, %r5662, 0;
+ selp.b32 %r8270, %r5668, %r5669, %p973;
+ setp.ne.s32 %p974, %r5666, 0;
+ xor.b32 %r5670, %r5662, -2147483648;
+ selp.b32 %r5671, %r5670, %r5662, %p974;
+ selp.b32 %r5672, -1, 0, %p974;
+ xor.b32 %r5673, %r5665, %r5672;
+ shl.b32 %r5674, %r8269, 2;
+ xor.b32 %r5675, %r5674, %r5672;
+ cvt.u64.u32 %rd1799, %r5673;
+ cvt.u64.u32 %rd1800, %r5675;
+ bfi.b64 %rd1801, %rd1799, %rd1800, 32, 32;
+ cvt.rn.f64.s64 %fd151, %rd1801;
+ mul.f64 %fd152, %fd151, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3940, %fd152;
+ setp.eq.s32 %p975, %r5671, 0;
+ neg.f32 %f3941, %f3940;
+ selp.f32 %f5713, %f3940, %f3941, %p975;
+
+$L__BB0_1142:
+ add.s32 %r1498, %r8270, 1;
+ and.b32 %r1499, %r1498, 1;
+ setp.eq.s32 %p976, %r1499, 0;
+ selp.f32 %f1327, %f5713, 0f3F800000, %p976;
+ mul.rn.f32 %f1328, %f5713, %f5713;
+ mov.f32 %f5714, 0fB94D4153;
+ @%p976 bra $L__BB0_1144;
+
+ mov.f32 %f3944, 0fBAB607ED;
+ mov.f32 %f3945, 0f37CBAC00;
+ fma.rn.f32 %f5714, %f3945, %f1328, %f3944;
+
+$L__BB0_1144:
+ selp.f32 %f3946, 0f3C0885E4, 0f3D2AAABB, %p976;
+ fma.rn.f32 %f3947, %f5714, %f1328, %f3946;
+ selp.f32 %f3948, 0fBE2AAAA8, 0fBEFFFFFF, %p976;
+ fma.rn.f32 %f3949, %f3947, %f1328, %f3948;
+ mov.f32 %f3950, 0f00000000;
+ fma.rn.f32 %f3951, %f1328, %f1327, %f3950;
+ fma.rn.f32 %f5283, %f3949, %f3951, %f1327;
+ and.b32 %r5677, %r1498, 2;
+ setp.eq.s32 %p978, %r5677, 0;
+ @%p978 bra $L__BB0_1146;
+
+ mov.f32 %f3953, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3953, %f3950;
+
+$L__BB0_1146:
+ selp.f32 %f1335, %f5283, %f5284, %p21;
+ selp.f32 %f1336, %f5281, %f5282, %p21;
+ @%p957 bra $L__BB0_1148;
+
+ add.f32 %f5793, %f1336, %f1335;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1148:
+ @%p816 bra $L__BB0_1177;
+
+ shl.b32 %r5678, %r12, 5;
+ neg.s32 %r1500, %r5678;
+ setp.ge.s32 %p982, %r11, %r1500;
+ @%p982 bra $L__BB0_1162;
+
+ mul.f32 %f3956, %f5410, 0f3F22F983;
+ cvt.rni.s32.f32 %r8274, %f3956;
+ cvt.rn.f32.s32 %f3957, %r8274;
+ mov.f32 %f3958, 0fBFC90FDA;
+ fma.rn.f32 %f3959, %f3957, %f3958, %f5410;
+ mov.f32 %f3960, 0fB3A22168;
+ fma.rn.f32 %f3961, %f3957, %f3960, %f3959;
+ mov.f32 %f3962, 0fA7C234C5;
+ fma.rn.f32 %f5722, %f3957, %f3962, %f3961;
+ abs.f32 %f1344, %f5410;
+ setp.ltu.f32 %p983, %f1344, 0f47CE4780;
+ @%p983 bra $L__BB0_1158;
+
+ setp.eq.f32 %p984, %f1344, 0f7F800000;
+ @%p984 bra $L__BB0_1157;
+ bra.uni $L__BB0_1152;
+
+$L__BB0_1157:
+ mov.f32 %f3965, 0f00000000;
+ mul.rn.f32 %f5722, %f5410, %f3965;
+ mov.u32 %r8274, 0;
+ bra.uni $L__BB0_1158;
+
+$L__BB0_1152:
+ mov.b32 %r1502, %f5410;
+ shr.u32 %r5680, %r1502, 23;
+ and.b32 %r5681, %r5680, 255;
+ add.s32 %r1503, %r5681, -128;
+ shl.b32 %r5682, %r1502, 8;
+ or.b32 %r1504, %r5682, -2147483648;
+ shr.u32 %r1505, %r1503, 5;
+ mov.u64 %rd2657, 0;
+ mov.u32 %r8271, 0;
+ mov.u64 %rd1805, __cudart_i2opi_f;
+ mov.u64 %rd2658, %rd2657;
+
+$L__BB0_1153:
+ .pragma "nounroll";
+ shl.b64 %rd1804, %rd2657, 2;
+ add.s64 %rd1806, %rd1805, %rd1804;
+ ld.global.nc.u32 %r5683, [%rd1806];
+ mad.wide.u32 %rd1807, %r5683, %r1504, %rd2658;
+ shr.u64 %rd2658, %rd1807, 32;
+ add.s64 %rd1808, %rd1, %rd1804;
+ st.local.u32 [%rd1808], %rd1807;
+ add.s32 %r8271, %r8271, 1;
+ cvt.s64.s32 %rd2657, %r8271;
+ setp.ne.s32 %p985, %r8271, 6;
+ @%p985 bra $L__BB0_1153;
+
+ st.local.u32 [%rd4], %rd2658;
+ mov.u32 %r5684, 4;
+ sub.s32 %r1508, %r5684, %r1505;
+ mov.u32 %r5685, 6;
+ sub.s32 %r5686, %r5685, %r1505;
+ mul.wide.s32 %rd1809, %r5686, 4;
+ add.s64 %rd1810, %rd1, %rd1809;
+ ld.local.u32 %r8272, [%rd1810];
+ ld.local.u32 %r8273, [%rd1810+-4];
+ and.b32 %r1511, %r1503, 31;
+ setp.eq.s32 %p986, %r1511, 0;
+ @%p986 bra $L__BB0_1156;
+
+ mov.u32 %r5687, 32;
+ sub.s32 %r5688, %r5687, %r1511;
+ shr.u32 %r5689, %r8273, %r5688;
+ shl.b32 %r5690, %r8272, %r1511;
+ add.s32 %r8272, %r5689, %r5690;
+ mul.wide.s32 %rd1811, %r1508, 4;
+ add.s64 %rd1812, %rd1, %rd1811;
+ ld.local.u32 %r5691, [%rd1812];
+ shr.u32 %r5692, %r5691, %r5688;
+ shl.b32 %r5693, %r8273, %r1511;
+ add.s32 %r8273, %r5692, %r5693;
+
+$L__BB0_1156:
+ and.b32 %r5694, %r1502, -2147483648;
+ shr.u32 %r5695, %r8273, 30;
+ shl.b32 %r5696, %r8272, 2;
+ or.b32 %r5697, %r5695, %r5696;
+ shr.u32 %r5698, %r5697, 31;
+ shr.u32 %r5699, %r8272, 30;
+ add.s32 %r5700, %r5698, %r5699;
+ neg.s32 %r5701, %r5700;
+ setp.eq.s32 %p987, %r5694, 0;
+ selp.b32 %r8274, %r5700, %r5701, %p987;
+ setp.ne.s32 %p988, %r5698, 0;
+ xor.b32 %r5702, %r5694, -2147483648;
+ selp.b32 %r5703, %r5702, %r5694, %p988;
+ selp.b32 %r5704, -1, 0, %p988;
+ xor.b32 %r5705, %r5697, %r5704;
+ shl.b32 %r5706, %r8273, 2;
+ xor.b32 %r5707, %r5706, %r5704;
+ cvt.u64.u32 %rd1813, %r5705;
+ cvt.u64.u32 %rd1814, %r5707;
+ bfi.b64 %rd1815, %rd1813, %rd1814, 32, 32;
+ cvt.rn.f64.s64 %fd153, %rd1815;
+ mul.f64 %fd154, %fd153, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3963, %fd154;
+ setp.eq.s32 %p989, %r5703, 0;
+ neg.f32 %f3964, %f3963;
+ selp.f32 %f5722, %f3963, %f3964, %p989;
+
+$L__BB0_1158:
+ and.b32 %r1518, %r8274, 1;
+ setp.eq.s32 %p990, %r1518, 0;
+ selp.f32 %f1348, %f5722, 0f3F800000, %p990;
+ mul.rn.f32 %f1349, %f5722, %f5722;
+ mov.f32 %f5723, 0fB94D4153;
+ @%p990 bra $L__BB0_1160;
+
+ mov.f32 %f3967, 0fBAB607ED;
+ mov.f32 %f3968, 0f37CBAC00;
+ fma.rn.f32 %f5723, %f3968, %f1349, %f3967;
+
+$L__BB0_1160:
+ selp.f32 %f3969, 0f3C0885E4, 0f3D2AAABB, %p990;
+ fma.rn.f32 %f3970, %f5723, %f1349, %f3969;
+ selp.f32 %f3971, 0fBE2AAAA8, 0fBEFFFFFF, %p990;
+ fma.rn.f32 %f3972, %f3970, %f1349, %f3971;
+ mov.f32 %f3973, 0f00000000;
+ fma.rn.f32 %f3974, %f1349, %f1348, %f3973;
+ fma.rn.f32 %f5281, %f3972, %f3974, %f1348;
+ and.b32 %r5709, %r8274, 2;
+ setp.eq.s32 %p992, %r5709, 0;
+ @%p992 bra $L__BB0_1162;
+
+ mov.f32 %f3976, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f3976, %f3973;
+
+$L__BB0_1162:
+ setp.lt.s32 %p22, %r11, %r1500;
+ @%p982 bra $L__BB0_1175;
+
+ mul.f32 %f3977, %f5402, 0f3F22F983;
+ cvt.rni.s32.f32 %r8278, %f3977;
+ cvt.rn.f32.s32 %f3978, %r8278;
+ mov.f32 %f3979, 0fBFC90FDA;
+ fma.rn.f32 %f3980, %f3978, %f3979, %f5402;
+ mov.f32 %f3981, 0fB3A22168;
+ fma.rn.f32 %f3982, %f3978, %f3981, %f3980;
+ mov.f32 %f3983, 0fA7C234C5;
+ fma.rn.f32 %f5726, %f3978, %f3983, %f3982;
+ abs.f32 %f1357, %f5402;
+ setp.ltu.f32 %p994, %f1357, 0f47CE4780;
+ @%p994 bra $L__BB0_1171;
+
+ setp.eq.f32 %p995, %f1357, 0f7F800000;
+ @%p995 bra $L__BB0_1170;
+ bra.uni $L__BB0_1165;
+
+$L__BB0_1170:
+ mov.f32 %f3986, 0f00000000;
+ mul.rn.f32 %f5726, %f5402, %f3986;
+ mov.u32 %r8278, 0;
+ bra.uni $L__BB0_1171;
+
+$L__BB0_1165:
+ mov.b32 %r1520, %f5402;
+ shr.u32 %r5711, %r1520, 23;
+ and.b32 %r5712, %r5711, 255;
+ add.s32 %r1521, %r5712, -128;
+ shl.b32 %r5713, %r1520, 8;
+ or.b32 %r1522, %r5713, -2147483648;
+ shr.u32 %r1523, %r1521, 5;
+ mov.u64 %rd2659, 0;
+ mov.u32 %r8275, 0;
+ mov.u64 %rd1819, __cudart_i2opi_f;
+ mov.u64 %rd2660, %rd2659;
+
+$L__BB0_1166:
+ .pragma "nounroll";
+ shl.b64 %rd1818, %rd2659, 2;
+ add.s64 %rd1820, %rd1819, %rd1818;
+ ld.global.nc.u32 %r5714, [%rd1820];
+ mad.wide.u32 %rd1821, %r5714, %r1522, %rd2660;
+ shr.u64 %rd2660, %rd1821, 32;
+ add.s64 %rd1822, %rd1, %rd1818;
+ st.local.u32 [%rd1822], %rd1821;
+ add.s32 %r8275, %r8275, 1;
+ cvt.s64.s32 %rd2659, %r8275;
+ setp.ne.s32 %p996, %r8275, 6;
+ @%p996 bra $L__BB0_1166;
+
+ st.local.u32 [%rd4], %rd2660;
+ mov.u32 %r5715, 4;
+ sub.s32 %r1526, %r5715, %r1523;
+ mov.u32 %r5716, 6;
+ sub.s32 %r5717, %r5716, %r1523;
+ mul.wide.s32 %rd1823, %r5717, 4;
+ add.s64 %rd1824, %rd1, %rd1823;
+ ld.local.u32 %r8276, [%rd1824];
+ ld.local.u32 %r8277, [%rd1824+-4];
+ and.b32 %r1529, %r1521, 31;
+ setp.eq.s32 %p997, %r1529, 0;
+ @%p997 bra $L__BB0_1169;
+
+ mov.u32 %r5718, 32;
+ sub.s32 %r5719, %r5718, %r1529;
+ shr.u32 %r5720, %r8277, %r5719;
+ shl.b32 %r5721, %r8276, %r1529;
+ add.s32 %r8276, %r5720, %r5721;
+ mul.wide.s32 %rd1825, %r1526, 4;
+ add.s64 %rd1826, %rd1, %rd1825;
+ ld.local.u32 %r5722, [%rd1826];
+ shr.u32 %r5723, %r5722, %r5719;
+ shl.b32 %r5724, %r8277, %r1529;
+ add.s32 %r8277, %r5723, %r5724;
+
+$L__BB0_1169:
+ and.b32 %r5725, %r1520, -2147483648;
+ shr.u32 %r5726, %r8277, 30;
+ shl.b32 %r5727, %r8276, 2;
+ or.b32 %r5728, %r5726, %r5727;
+ shr.u32 %r5729, %r5728, 31;
+ shr.u32 %r5730, %r8276, 30;
+ add.s32 %r5731, %r5729, %r5730;
+ neg.s32 %r5732, %r5731;
+ setp.eq.s32 %p998, %r5725, 0;
+ selp.b32 %r8278, %r5731, %r5732, %p998;
+ setp.ne.s32 %p999, %r5729, 0;
+ xor.b32 %r5733, %r5725, -2147483648;
+ selp.b32 %r5734, %r5733, %r5725, %p999;
+ selp.b32 %r5735, -1, 0, %p999;
+ xor.b32 %r5736, %r5728, %r5735;
+ shl.b32 %r5737, %r8277, 2;
+ xor.b32 %r5738, %r5737, %r5735;
+ cvt.u64.u32 %rd1827, %r5736;
+ cvt.u64.u32 %rd1828, %r5738;
+ bfi.b64 %rd1829, %rd1827, %rd1828, 32, 32;
+ cvt.rn.f64.s64 %fd155, %rd1829;
+ mul.f64 %fd156, %fd155, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f3984, %fd156;
+ setp.eq.s32 %p1000, %r5734, 0;
+ neg.f32 %f3985, %f3984;
+ selp.f32 %f5726, %f3984, %f3985, %p1000;
+
+$L__BB0_1171:
+ add.s32 %r1536, %r8278, 1;
+ and.b32 %r1537, %r1536, 1;
+ setp.eq.s32 %p1001, %r1537, 0;
+ selp.f32 %f1361, %f5726, 0f3F800000, %p1001;
+ mul.rn.f32 %f1362, %f5726, %f5726;
+ mov.f32 %f5727, 0fB94D4153;
+ @%p1001 bra $L__BB0_1173;
+
+ mov.f32 %f3988, 0fBAB607ED;
+ mov.f32 %f3989, 0f37CBAC00;
+ fma.rn.f32 %f5727, %f3989, %f1362, %f3988;
+
+$L__BB0_1173:
+ selp.f32 %f3990, 0f3C0885E4, 0f3D2AAABB, %p1001;
+ fma.rn.f32 %f3991, %f5727, %f1362, %f3990;
+ selp.f32 %f3992, 0fBE2AAAA8, 0fBEFFFFFF, %p1001;
+ fma.rn.f32 %f3993, %f3991, %f1362, %f3992;
+ mov.f32 %f3994, 0f00000000;
+ fma.rn.f32 %f3995, %f1362, %f1361, %f3994;
+ fma.rn.f32 %f5283, %f3993, %f3995, %f1361;
+ and.b32 %r5740, %r1536, 2;
+ setp.eq.s32 %p1003, %r5740, 0;
+ @%p1003 bra $L__BB0_1175;
+
+ mov.f32 %f3997, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f3997, %f3994;
+
+$L__BB0_1175:
+ selp.f32 %f1369, %f5283, %f5284, %p22;
+ selp.f32 %f1370, %f5281, %f5282, %p22;
+ @%p982 bra $L__BB0_1177;
+
+ add.f32 %f5792, %f1370, %f1369;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1177:
+ @%p816 bra $L__BB0_1399;
+
+ shl.b32 %r5741, %r12, 5;
+ mov.u32 %r5742, -32;
+ sub.s32 %r1538, %r5742, %r5741;
+ setp.ge.s32 %p1007, %r11, %r1538;
+ @%p1007 bra $L__BB0_1191;
+
+ mul.f32 %f4000, %f5409, 0f3F22F983;
+ cvt.rni.s32.f32 %r8282, %f4000;
+ cvt.rn.f32.s32 %f4001, %r8282;
+ mov.f32 %f4002, 0fBFC90FDA;
+ fma.rn.f32 %f4003, %f4001, %f4002, %f5409;
+ mov.f32 %f4004, 0fB3A22168;
+ fma.rn.f32 %f4005, %f4001, %f4004, %f4003;
+ mov.f32 %f4006, 0fA7C234C5;
+ fma.rn.f32 %f5735, %f4001, %f4006, %f4005;
+ abs.f32 %f1378, %f5409;
+ setp.ltu.f32 %p1008, %f1378, 0f47CE4780;
+ @%p1008 bra $L__BB0_1187;
+
+ setp.eq.f32 %p1009, %f1378, 0f7F800000;
+ @%p1009 bra $L__BB0_1186;
+ bra.uni $L__BB0_1181;
+
+$L__BB0_1186:
+ mov.f32 %f4009, 0f00000000;
+ mul.rn.f32 %f5735, %f5409, %f4009;
+ mov.u32 %r8282, 0;
+ bra.uni $L__BB0_1187;
+
+$L__BB0_1181:
+ mov.b32 %r1540, %f5409;
+ shr.u32 %r5744, %r1540, 23;
+ and.b32 %r5745, %r5744, 255;
+ add.s32 %r1541, %r5745, -128;
+ shl.b32 %r5746, %r1540, 8;
+ or.b32 %r1542, %r5746, -2147483648;
+ shr.u32 %r1543, %r1541, 5;
+ mov.u64 %rd2661, 0;
+ mov.u32 %r8279, 0;
+ mov.u64 %rd1833, __cudart_i2opi_f;
+ mov.u64 %rd2662, %rd2661;
+
+$L__BB0_1182:
+ .pragma "nounroll";
+ shl.b64 %rd1832, %rd2661, 2;
+ add.s64 %rd1834, %rd1833, %rd1832;
+ ld.global.nc.u32 %r5747, [%rd1834];
+ mad.wide.u32 %rd1835, %r5747, %r1542, %rd2662;
+ shr.u64 %rd2662, %rd1835, 32;
+ add.s64 %rd1836, %rd1, %rd1832;
+ st.local.u32 [%rd1836], %rd1835;
+ add.s32 %r8279, %r8279, 1;
+ cvt.s64.s32 %rd2661, %r8279;
+ setp.ne.s32 %p1010, %r8279, 6;
+ @%p1010 bra $L__BB0_1182;
+
+ st.local.u32 [%rd4], %rd2662;
+ mov.u32 %r5748, 4;
+ sub.s32 %r1546, %r5748, %r1543;
+ mov.u32 %r5749, 6;
+ sub.s32 %r5750, %r5749, %r1543;
+ mul.wide.s32 %rd1837, %r5750, 4;
+ add.s64 %rd1838, %rd1, %rd1837;
+ ld.local.u32 %r8280, [%rd1838];
+ ld.local.u32 %r8281, [%rd1838+-4];
+ and.b32 %r1549, %r1541, 31;
+ setp.eq.s32 %p1011, %r1549, 0;
+ @%p1011 bra $L__BB0_1185;
+
+ mov.u32 %r5751, 32;
+ sub.s32 %r5752, %r5751, %r1549;
+ shr.u32 %r5753, %r8281, %r5752;
+ shl.b32 %r5754, %r8280, %r1549;
+ add.s32 %r8280, %r5753, %r5754;
+ mul.wide.s32 %rd1839, %r1546, 4;
+ add.s64 %rd1840, %rd1, %rd1839;
+ ld.local.u32 %r5755, [%rd1840];
+ shr.u32 %r5756, %r5755, %r5752;
+ shl.b32 %r5757, %r8281, %r1549;
+ add.s32 %r8281, %r5756, %r5757;
+
+$L__BB0_1185:
+ and.b32 %r5758, %r1540, -2147483648;
+ shr.u32 %r5759, %r8281, 30;
+ shl.b32 %r5760, %r8280, 2;
+ or.b32 %r5761, %r5759, %r5760;
+ shr.u32 %r5762, %r5761, 31;
+ shr.u32 %r5763, %r8280, 30;
+ add.s32 %r5764, %r5762, %r5763;
+ neg.s32 %r5765, %r5764;
+ setp.eq.s32 %p1012, %r5758, 0;
+ selp.b32 %r8282, %r5764, %r5765, %p1012;
+ setp.ne.s32 %p1013, %r5762, 0;
+ xor.b32 %r5766, %r5758, -2147483648;
+ selp.b32 %r5767, %r5766, %r5758, %p1013;
+ selp.b32 %r5768, -1, 0, %p1013;
+ xor.b32 %r5769, %r5761, %r5768;
+ shl.b32 %r5770, %r8281, 2;
+ xor.b32 %r5771, %r5770, %r5768;
+ cvt.u64.u32 %rd1841, %r5769;
+ cvt.u64.u32 %rd1842, %r5771;
+ bfi.b64 %rd1843, %rd1841, %rd1842, 32, 32;
+ cvt.rn.f64.s64 %fd157, %rd1843;
+ mul.f64 %fd158, %fd157, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4007, %fd158;
+ setp.eq.s32 %p1014, %r5767, 0;
+ neg.f32 %f4008, %f4007;
+ selp.f32 %f5735, %f4007, %f4008, %p1014;
+
+$L__BB0_1187:
+ and.b32 %r1556, %r8282, 1;
+ setp.eq.s32 %p1015, %r1556, 0;
+ selp.f32 %f1382, %f5735, 0f3F800000, %p1015;
+ mul.rn.f32 %f1383, %f5735, %f5735;
+ mov.f32 %f5736, 0fB94D4153;
+ @%p1015 bra $L__BB0_1189;
+
+ mov.f32 %f4011, 0fBAB607ED;
+ mov.f32 %f4012, 0f37CBAC00;
+ fma.rn.f32 %f5736, %f4012, %f1383, %f4011;
+
+$L__BB0_1189:
+ selp.f32 %f4013, 0f3C0885E4, 0f3D2AAABB, %p1015;
+ fma.rn.f32 %f4014, %f5736, %f1383, %f4013;
+ selp.f32 %f4015, 0fBE2AAAA8, 0fBEFFFFFF, %p1015;
+ fma.rn.f32 %f4016, %f4014, %f1383, %f4015;
+ mov.f32 %f4017, 0f00000000;
+ fma.rn.f32 %f4018, %f1383, %f1382, %f4017;
+ fma.rn.f32 %f5281, %f4016, %f4018, %f1382;
+ and.b32 %r5773, %r8282, 2;
+ setp.eq.s32 %p1017, %r5773, 0;
+ @%p1017 bra $L__BB0_1191;
+
+ mov.f32 %f4020, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f4020, %f4017;
+
+$L__BB0_1191:
+ setp.lt.s32 %p23, %r11, %r1538;
+ @%p1007 bra $L__BB0_1204;
+
+ mul.f32 %f4021, %f5401, 0f3F22F983;
+ cvt.rni.s32.f32 %r8286, %f4021;
+ cvt.rn.f32.s32 %f4022, %r8286;
+ mov.f32 %f4023, 0fBFC90FDA;
+ fma.rn.f32 %f4024, %f4022, %f4023, %f5401;
+ mov.f32 %f4025, 0fB3A22168;
+ fma.rn.f32 %f4026, %f4022, %f4025, %f4024;
+ mov.f32 %f4027, 0fA7C234C5;
+ fma.rn.f32 %f5739, %f4022, %f4027, %f4026;
+ abs.f32 %f1391, %f5401;
+ setp.ltu.f32 %p1019, %f1391, 0f47CE4780;
+ @%p1019 bra $L__BB0_1200;
+
+ setp.eq.f32 %p1020, %f1391, 0f7F800000;
+ @%p1020 bra $L__BB0_1199;
+ bra.uni $L__BB0_1194;
+
+$L__BB0_1199:
+ mov.f32 %f4030, 0f00000000;
+ mul.rn.f32 %f5739, %f5401, %f4030;
+ mov.u32 %r8286, 0;
+ bra.uni $L__BB0_1200;
+
+$L__BB0_1194:
+ mov.b32 %r1558, %f5401;
+ shr.u32 %r5775, %r1558, 23;
+ and.b32 %r5776, %r5775, 255;
+ add.s32 %r1559, %r5776, -128;
+ shl.b32 %r5777, %r1558, 8;
+ or.b32 %r1560, %r5777, -2147483648;
+ shr.u32 %r1561, %r1559, 5;
+ mov.u64 %rd2663, 0;
+ mov.u32 %r8283, 0;
+ mov.u64 %rd1847, __cudart_i2opi_f;
+ mov.u64 %rd2664, %rd2663;
+
+$L__BB0_1195:
+ .pragma "nounroll";
+ shl.b64 %rd1846, %rd2663, 2;
+ add.s64 %rd1848, %rd1847, %rd1846;
+ ld.global.nc.u32 %r5778, [%rd1848];
+ mad.wide.u32 %rd1849, %r5778, %r1560, %rd2664;
+ shr.u64 %rd2664, %rd1849, 32;
+ add.s64 %rd1850, %rd1, %rd1846;
+ st.local.u32 [%rd1850], %rd1849;
+ add.s32 %r8283, %r8283, 1;
+ cvt.s64.s32 %rd2663, %r8283;
+ setp.ne.s32 %p1021, %r8283, 6;
+ @%p1021 bra $L__BB0_1195;
+
+ st.local.u32 [%rd4], %rd2664;
+ mov.u32 %r5779, 4;
+ sub.s32 %r1564, %r5779, %r1561;
+ mov.u32 %r5780, 6;
+ sub.s32 %r5781, %r5780, %r1561;
+ mul.wide.s32 %rd1851, %r5781, 4;
+ add.s64 %rd1852, %rd1, %rd1851;
+ ld.local.u32 %r8284, [%rd1852];
+ ld.local.u32 %r8285, [%rd1852+-4];
+ and.b32 %r1567, %r1559, 31;
+ setp.eq.s32 %p1022, %r1567, 0;
+ @%p1022 bra $L__BB0_1198;
+
+ mov.u32 %r5782, 32;
+ sub.s32 %r5783, %r5782, %r1567;
+ shr.u32 %r5784, %r8285, %r5783;
+ shl.b32 %r5785, %r8284, %r1567;
+ add.s32 %r8284, %r5784, %r5785;
+ mul.wide.s32 %rd1853, %r1564, 4;
+ add.s64 %rd1854, %rd1, %rd1853;
+ ld.local.u32 %r5786, [%rd1854];
+ shr.u32 %r5787, %r5786, %r5783;
+ shl.b32 %r5788, %r8285, %r1567;
+ add.s32 %r8285, %r5787, %r5788;
+
+$L__BB0_1198:
+ and.b32 %r5789, %r1558, -2147483648;
+ shr.u32 %r5790, %r8285, 30;
+ shl.b32 %r5791, %r8284, 2;
+ or.b32 %r5792, %r5790, %r5791;
+ shr.u32 %r5793, %r5792, 31;
+ shr.u32 %r5794, %r8284, 30;
+ add.s32 %r5795, %r5793, %r5794;
+ neg.s32 %r5796, %r5795;
+ setp.eq.s32 %p1023, %r5789, 0;
+ selp.b32 %r8286, %r5795, %r5796, %p1023;
+ setp.ne.s32 %p1024, %r5793, 0;
+ xor.b32 %r5797, %r5789, -2147483648;
+ selp.b32 %r5798, %r5797, %r5789, %p1024;
+ selp.b32 %r5799, -1, 0, %p1024;
+ xor.b32 %r5800, %r5792, %r5799;
+ shl.b32 %r5801, %r8285, 2;
+ xor.b32 %r5802, %r5801, %r5799;
+ cvt.u64.u32 %rd1855, %r5800;
+ cvt.u64.u32 %rd1856, %r5802;
+ bfi.b64 %rd1857, %rd1855, %rd1856, 32, 32;
+ cvt.rn.f64.s64 %fd159, %rd1857;
+ mul.f64 %fd160, %fd159, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4028, %fd160;
+ setp.eq.s32 %p1025, %r5798, 0;
+ neg.f32 %f4029, %f4028;
+ selp.f32 %f5739, %f4028, %f4029, %p1025;
+
+$L__BB0_1200:
+ add.s32 %r1574, %r8286, 1;
+ and.b32 %r1575, %r1574, 1;
+ setp.eq.s32 %p1026, %r1575, 0;
+ selp.f32 %f1395, %f5739, 0f3F800000, %p1026;
+ mul.rn.f32 %f1396, %f5739, %f5739;
+ mov.f32 %f5740, 0fB94D4153;
+ @%p1026 bra $L__BB0_1202;
+
+ mov.f32 %f4032, 0fBAB607ED;
+ mov.f32 %f4033, 0f37CBAC00;
+ fma.rn.f32 %f5740, %f4033, %f1396, %f4032;
+
+$L__BB0_1202:
+ selp.f32 %f4034, 0f3C0885E4, 0f3D2AAABB, %p1026;
+ fma.rn.f32 %f4035, %f5740, %f1396, %f4034;
+ selp.f32 %f4036, 0fBE2AAAA8, 0fBEFFFFFF, %p1026;
+ fma.rn.f32 %f4037, %f4035, %f1396, %f4036;
+ mov.f32 %f4038, 0f00000000;
+ fma.rn.f32 %f4039, %f1396, %f1395, %f4038;
+ fma.rn.f32 %f5283, %f4037, %f4039, %f1395;
+ and.b32 %r5804, %r1574, 2;
+ setp.eq.s32 %p1028, %r5804, 0;
+ @%p1028 bra $L__BB0_1204;
+
+ mov.f32 %f4041, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f4041, %f4038;
+
+$L__BB0_1204:
+ selp.f32 %f1403, %f5283, %f5284, %p23;
+ selp.f32 %f1404, %f5281, %f5282, %p23;
+ @%p1007 bra $L__BB0_1399;
+
+ add.f32 %f5791, %f1404, %f1403;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1399:
+ setp.lt.s32 %p1191, %r12, 0;
+ and.pred %p1193, %p33, %p1191;
+ @%p1193 bra $L__BB0_1672;
+ bra.uni $L__BB0_1400;
+
+$L__BB0_1672:
+ mov.u32 %r7785, %ctaid.x;
+ shl.b32 %r7070, %r12, 5;
+ add.s32 %r7071, %r7070, %r1;
+ mul.hi.s32 %r7072, %r7071, -1840700269;
+ add.s32 %r7073, %r7072, %r7071;
+ shr.u32 %r7074, %r7073, 31;
+ shr.s32 %r7075, %r7073, 2;
+ add.s32 %r7076, %r7075, %r7074;
+ mul.lo.s32 %r7077, %r7076, %r2589;
+ shl.b32 %r7078, %r2586, 1;
+ add.s32 %r7079, %r7078, %r14;
+ shl.b32 %r7080, %r2587, 1;
+ add.s32 %r7081, %r7079, %r7080;
+ add.s32 %r7082, %r7081, %r7077;
+ mul.lo.s32 %r7083, %r7076, 7;
+ sub.s32 %r7084, %r7071, %r7083;
+ mul.lo.s32 %r7085, %r7084, %r2590;
+ add.s32 %r7086, %r7082, %r7085;
+ mul.wide.s32 %rd2303, %r7086, 4;
+ add.s64 %rd2304, %rd3, %rd2303;
+ ld.global.f32 %f1957, [%rd2304];
+ add.s32 %r7087, %r7071, 32;
+ mul.hi.s32 %r7088, %r7087, -1840700269;
+ add.s32 %r7089, %r7088, %r7087;
+ shr.u32 %r7090, %r7089, 31;
+ shr.s32 %r7091, %r7089, 2;
+ add.s32 %r7092, %r7091, %r7090;
+ mul.lo.s32 %r7093, %r7092, %r2589;
+ add.s32 %r7094, %r7081, %r7093;
+ mul.lo.s32 %r7095, %r7092, 7;
+ sub.s32 %r7096, %r7087, %r7095;
+ mul.lo.s32 %r7097, %r7096, %r2590;
+ add.s32 %r7098, %r7094, %r7097;
+ mul.wide.s32 %rd2305, %r7098, 4;
+ add.s64 %rd2306, %rd3, %rd2305;
+ ld.global.f32 %f1958, [%rd2306];
+ mul.wide.s32 %rd2307, %r2587, 4;
+ add.s64 %rd2308, %rd2304, %rd2307;
+ ld.global.f32 %f1959, [%rd2308];
+ add.s64 %rd2309, %rd2306, %rd2307;
+ ld.global.f32 %f1960, [%rd2309];
+ add.s64 %rd2310, %rd2308, %rd2307;
+ ld.global.f32 %f1961, [%rd2310];
+ add.s64 %rd2311, %rd2309, %rd2307;
+ ld.global.f32 %f1962, [%rd2311];
+ mad.lo.s32 %r7099, %r2586, 3, %r14;
+ add.s32 %r7100, %r7099, %r7077;
+ add.s32 %r7101, %r7100, %r7085;
+ mul.wide.s32 %rd2312, %r7101, 4;
+ add.s64 %rd2313, %rd3, %rd2312;
+ ld.global.f32 %f1963, [%rd2313];
+ add.s32 %r7102, %r7099, %r7093;
+ add.s32 %r7103, %r7102, %r7097;
+ mul.wide.s32 %rd2314, %r7103, 4;
+ add.s64 %rd2315, %rd3, %rd2314;
+ ld.global.f32 %f1964, [%rd2315];
+ mul.hi.s32 %r7105, %r7071, 954437177;
+ shr.u32 %r7106, %r7105, 31;
+ shr.s32 %r7107, %r7105, 1;
+ add.s32 %r7108, %r7107, %r7106;
+ mul.lo.s32 %r7109, %r7108, %r2579;
+ shl.b32 %r7110, %r2576, 2;
+ mad.lo.s32 %r7111, %r2578, %r7785, %r7110;
+ add.s32 %r7112, %r7111, %r7109;
+ mul.lo.s32 %r7113, %r7108, 9;
+ sub.s32 %r7114, %r7071, %r7113;
+ mul.lo.s32 %r7115, %r7114, %r2580;
+ add.s32 %r7116, %r7112, %r7115;
+ mul.wide.s32 %rd2316, %r7116, 4;
+ add.s64 %rd2317, %rd2, %rd2316;
+ ld.global.f32 %f1965, [%rd2317];
+ mul.hi.s32 %r7117, %r7087, 954437177;
+ shr.u32 %r7118, %r7117, 31;
+ shr.s32 %r7119, %r7117, 1;
+ add.s32 %r7120, %r7119, %r7118;
+ mul.lo.s32 %r7121, %r7120, %r2579;
+ add.s32 %r7122, %r7111, %r7121;
+ mul.lo.s32 %r7123, %r7120, 9;
+ sub.s32 %r7124, %r7087, %r7123;
+ mul.lo.s32 %r7125, %r7124, %r2580;
+ add.s32 %r7126, %r7122, %r7125;
+ mul.wide.s32 %rd2318, %r7126, 4;
+ add.s64 %rd2319, %rd2, %rd2318;
+ ld.global.f32 %f1966, [%rd2319];
+ mul.wide.s32 %rd2320, %r2577, 4;
+ add.s64 %rd2321, %rd2317, %rd2320;
+ ld.global.f32 %f1967, [%rd2321];
+ add.s64 %rd2322, %rd2319, %rd2320;
+ ld.global.f32 %f1968, [%rd2322];
+ add.s64 %rd2323, %rd2321, %rd2320;
+ ld.global.f32 %f1969, [%rd2323];
+ add.s64 %rd2324, %rd2322, %rd2320;
+ ld.global.f32 %f1970, [%rd2324];
+ add.s32 %r7127, %r7111, %r2576;
+ add.s32 %r7128, %r7127, %r7109;
+ add.s32 %r7129, %r7128, %r7115;
+ mul.wide.s32 %rd2325, %r7129, 4;
+ add.s64 %rd2326, %rd2, %rd2325;
+ ld.global.f32 %f1971, [%rd2326];
+ add.s32 %r7130, %r7127, %r7121;
+ add.s32 %r7131, %r7130, %r7125;
+ mul.wide.s32 %rd2327, %r7131, 4;
+ add.s64 %rd2328, %rd2, %rd2327;
+ ld.global.f32 %f1972, [%rd2328];
+ mul.f32 %f4731, %f1965, 0f3F22F983;
+ cvt.rni.s32.f32 %r8418, %f4731;
+ cvt.rn.f32.s32 %f4732, %r8418;
+ mov.f32 %f4733, 0fBFC90FDA;
+ fma.rn.f32 %f4734, %f4732, %f4733, %f1965;
+ mov.f32 %f4735, 0fB3A22168;
+ fma.rn.f32 %f4736, %f4732, %f4735, %f4734;
+ mov.f32 %f4737, 0fA7C234C5;
+ fma.rn.f32 %f5942, %f4732, %f4737, %f4736;
+ abs.f32 %f1974, %f1965;
+ setp.ltu.f32 %p1419, %f1974, 0f47CE4780;
+ @%p1419 bra $L__BB0_1680;
+
+ setp.eq.f32 %p1420, %f1974, 0f7F800000;
+ @%p1420 bra $L__BB0_1679;
+ bra.uni $L__BB0_1674;
+
+$L__BB0_1679:
+ mov.f32 %f4740, 0f00000000;
+ mul.rn.f32 %f5942, %f1965, %f4740;
+ mov.u32 %r8418, 0;
+ bra.uni $L__BB0_1680;
+
+$L__BB0_1400:
+ add.s32 %r1872, %r12, 12;
+ setp.gt.s32 %p1194, %r1872, 14;
+ shl.b32 %r6373, %r2586, 1;
+ add.s32 %r6374, %r6373, %r14;
+ shl.b32 %r6375, %r2587, 1;
+ add.s32 %r1873, %r6374, %r6375;
+ @%p1194 bra $L__BB0_1405;
+
+ shl.b32 %r1874, %r12, 5;
+ neg.s32 %r6376, %r1874;
+ setp.ge.s32 %p1195, %r11, %r6376;
+ @%p1195 bra $L__BB0_1403;
+
+ add.s32 %r6377, %r1874, %r1;
+ mul.hi.s32 %r6378, %r6377, -1840700269;
+ add.s32 %r6379, %r6378, %r6377;
+ shr.u32 %r6380, %r6379, 31;
+ shr.s32 %r6381, %r6379, 2;
+ add.s32 %r6382, %r6381, %r6380;
+ mad.lo.s32 %r6383, %r6382, %r2589, %r1873;
+ mul.lo.s32 %r6384, %r6382, 7;
+ sub.s32 %r6385, %r6377, %r6384;
+ mad.lo.s32 %r6386, %r6385, %r2590, %r6383;
+ mul.wide.s32 %rd2111, %r6386, 4;
+ add.s64 %rd2112, %rd3, %rd2111;
+ ld.global.f32 %f5607, [%rd2112];
+
+$L__BB0_1403:
+ mov.u32 %r6387, -32;
+ sub.s32 %r6388, %r6387, %r1874;
+ setp.ge.s32 %p1196, %r11, %r6388;
+ @%p1196 bra $L__BB0_1405;
+
+ add.s32 %r6389, %r1874, %r1;
+ add.s32 %r6390, %r6389, 32;
+ mul.hi.s32 %r6391, %r6390, -1840700269;
+ add.s32 %r6392, %r6391, %r6390;
+ shr.u32 %r6393, %r6392, 31;
+ shr.s32 %r6394, %r6392, 2;
+ add.s32 %r6395, %r6394, %r6393;
+ mad.lo.s32 %r6396, %r6395, %r2589, %r1873;
+ mul.lo.s32 %r6397, %r6395, 7;
+ sub.s32 %r6398, %r6390, %r6397;
+ mad.lo.s32 %r6399, %r6398, %r2590, %r6396;
+ mul.wide.s32 %rd2113, %r6399, 4;
+ add.s64 %rd2114, %rd3, %rd2113;
+ ld.global.f32 %f5606, [%rd2114];
+
+$L__BB0_1405:
+ add.s32 %r1875, %r12, 13;
+ setp.gt.s32 %p1197, %r1875, 14;
+ add.s32 %r1876, %r1873, %r2587;
+ @%p1197 bra $L__BB0_1410;
+
+ shl.b32 %r1877, %r12, 5;
+ neg.s32 %r6400, %r1877;
+ setp.ge.s32 %p1198, %r11, %r6400;
+ @%p1198 bra $L__BB0_1408;
+
+ add.s32 %r6401, %r1877, %r1;
+ mul.hi.s32 %r6402, %r6401, -1840700269;
+ add.s32 %r6403, %r6402, %r6401;
shr.u32 %r6404, %r6403, 31;
- shr.u32 %r6405, %r8595, 30;
- add.s32 %r6406, %r6404, %r6405;
- neg.s32 %r6407, %r6406;
- setp.eq.s32 %p1176, %r6400, 0;
- selp.b32 %r8597, %r6406, %r6407, %p1176;
- setp.ne.s32 %p1177, %r6404, 0;
- xor.b32 %r6408, %r6400, -2147483648;
- selp.b32 %r6409, %r6408, %r6400, %p1177;
- selp.b32 %r6410, -1, 0, %p1177;
- xor.b32 %r6411, %r6403, %r6410;
- shl.b32 %r6412, %r8596, 2;
- xor.b32 %r6413, %r6412, %r6410;
- cvt.u64.u32 %rd2062, %r6411;
- cvt.u64.u32 %rd2063, %r6413;
- bfi.b64 %rd2064, %rd2062, %rd2063, 32, 32;
- cvt.rn.f64.s64 %fd185, %rd2064;
- mul.f64 %fd186, %fd185, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4246, %fd186;
- setp.eq.s32 %p1178, %r6409, 0;
- neg.f32 %f4247, %f4246;
- selp.f32 %f5695, %f4246, %f4247, %p1178;
-
-$L__BB0_1385:
- and.b32 %r1852, %r8597, 1;
- setp.eq.s32 %p1179, %r1852, 0;
- selp.f32 %f1553, %f5695, 0f3F800000, %p1179;
- mul.rn.f32 %f1554, %f5695, %f5695;
- mov.f32 %f5696, 0fB94D4153;
- @%p1179 bra $L__BB0_1387;
-
- mov.f32 %f4250, 0fBAB607ED;
- mov.f32 %f4251, 0f37CBAC00;
- fma.rn.f32 %f5696, %f4251, %f1554, %f4250;
-
-$L__BB0_1387:
- selp.f32 %f4252, 0f3C0885E4, 0f3D2AAABB, %p1179;
- fma.rn.f32 %f4253, %f5696, %f1554, %f4252;
- selp.f32 %f4254, 0fBE2AAAA8, 0fBEFFFFFF, %p1179;
- fma.rn.f32 %f4255, %f4253, %f1554, %f4254;
- mov.f32 %f4256, 0f00000000;
- fma.rn.f32 %f4257, %f1554, %f1553, %f4256;
- fma.rn.f32 %f5697, %f4255, %f4257, %f1553;
- and.b32 %r6415, %r8597, 2;
- setp.eq.s32 %p1181, %r6415, 0;
- @%p1181 bra $L__BB0_1389;
-
- mov.f32 %f4259, 0fBF800000;
- fma.rn.f32 %f5697, %f5697, %f4259, %f4256;
-
-$L__BB0_1389:
- mul.f32 %f4260, %f1388, 0f3F22F983;
- cvt.rni.s32.f32 %r8601, %f4260;
- cvt.rn.f32.s32 %f4261, %r8601;
- mov.f32 %f4262, 0fBFC90FDA;
- fma.rn.f32 %f4263, %f4261, %f4262, %f1388;
- mov.f32 %f4264, 0fB3A22168;
- fma.rn.f32 %f4265, %f4261, %f4264, %f4263;
- mov.f32 %f4266, 0fA7C234C5;
- fma.rn.f32 %f5698, %f4261, %f4266, %f4265;
- abs.f32 %f1561, %f1388;
- setp.ltu.f32 %p1182, %f1561, 0f47CE4780;
- @%p1182 bra $L__BB0_1397;
-
- setp.eq.f32 %p1183, %f1561, 0f7F800000;
- @%p1183 bra $L__BB0_1396;
- bra.uni $L__BB0_1391;
-
-$L__BB0_1396:
- mov.f32 %f4269, 0f00000000;
- mul.rn.f32 %f5698, %f1388, %f4269;
- mov.u32 %r8601, 0;
- bra.uni $L__BB0_1397;
-
-$L__BB0_1391:
- mov.b32 %r1854, %f1388;
- shr.u32 %r6417, %r1854, 23;
- and.b32 %r6418, %r6417, 255;
- add.s32 %r1855, %r6418, -128;
- shl.b32 %r6419, %r1854, 8;
- or.b32 %r1856, %r6419, -2147483648;
- shr.u32 %r1857, %r1855, 5;
- mov.u64 %rd2693, 0;
- mov.u32 %r8598, 0;
- mov.u64 %rd2691, __cudart_i2opi_f;
- mov.u64 %rd2692, %rd1;
-
-$L__BB0_1392:
+ shr.s32 %r6405, %r6403, 2;
+ add.s32 %r6406, %r6405, %r6404;
+ mad.lo.s32 %r6407, %r6406, %r2589, %r1876;
+ mul.lo.s32 %r6408, %r6406, 7;
+ sub.s32 %r6409, %r6401, %r6408;
+ mad.lo.s32 %r6410, %r6409, %r2590, %r6407;
+ mul.wide.s32 %rd2115, %r6410, 4;
+ add.s64 %rd2116, %rd3, %rd2115;
+ ld.global.f32 %f5406, [%rd2116];
+
+$L__BB0_1408:
+ mov.u32 %r6411, -32;
+ sub.s32 %r6412, %r6411, %r1877;
+ setp.ge.s32 %p1199, %r11, %r6412;
+ @%p1199 bra $L__BB0_1410;
+
+ add.s32 %r6413, %r1877, %r1;
+ add.s32 %r6414, %r6413, 32;
+ mul.hi.s32 %r6415, %r6414, -1840700269;
+ add.s32 %r6416, %r6415, %r6414;
+ shr.u32 %r6417, %r6416, 31;
+ shr.s32 %r6418, %r6416, 2;
+ add.s32 %r6419, %r6418, %r6417;
+ mad.lo.s32 %r6420, %r6419, %r2589, %r1876;
+ mul.lo.s32 %r6421, %r6419, 7;
+ sub.s32 %r6422, %r6414, %r6421;
+ mad.lo.s32 %r6423, %r6422, %r2590, %r6420;
+ mul.wide.s32 %rd2117, %r6423, 4;
+ add.s64 %rd2118, %rd3, %rd2117;
+ ld.global.f32 %f5405, [%rd2118];
+
+$L__BB0_1410:
+ add.s32 %r1878, %r12, 14;
+ setp.gt.s32 %p1200, %r1878, 14;
+ add.s32 %r1879, %r1876, %r2587;
+ @%p1200 bra $L__BB0_1415;
+
+ shl.b32 %r1880, %r12, 5;
+ neg.s32 %r6424, %r1880;
+ setp.ge.s32 %p1201, %r11, %r6424;
+ @%p1201 bra $L__BB0_1413;
+
+ add.s32 %r6425, %r1880, %r1;
+ mul.hi.s32 %r6426, %r6425, -1840700269;
+ add.s32 %r6427, %r6426, %r6425;
+ shr.u32 %r6428, %r6427, 31;
+ shr.s32 %r6429, %r6427, 2;
+ add.s32 %r6430, %r6429, %r6428;
+ mad.lo.s32 %r6431, %r6430, %r2589, %r1879;
+ mul.lo.s32 %r6432, %r6430, 7;
+ sub.s32 %r6433, %r6425, %r6432;
+ mad.lo.s32 %r6434, %r6433, %r2590, %r6431;
+ mul.wide.s32 %rd2119, %r6434, 4;
+ add.s64 %rd2120, %rd3, %rd2119;
+ ld.global.f32 %f5404, [%rd2120];
+
+$L__BB0_1413:
+ mov.u32 %r6435, -32;
+ sub.s32 %r6436, %r6435, %r1880;
+ setp.ge.s32 %p1202, %r11, %r6436;
+ @%p1202 bra $L__BB0_1415;
+
+ add.s32 %r6437, %r1880, %r1;
+ add.s32 %r6438, %r6437, 32;
+ mul.hi.s32 %r6439, %r6438, -1840700269;
+ add.s32 %r6440, %r6439, %r6438;
+ shr.u32 %r6441, %r6440, 31;
+ shr.s32 %r6442, %r6440, 2;
+ add.s32 %r6443, %r6442, %r6441;
+ mad.lo.s32 %r6444, %r6443, %r2589, %r1879;
+ mul.lo.s32 %r6445, %r6443, 7;
+ sub.s32 %r6446, %r6438, %r6445;
+ mad.lo.s32 %r6447, %r6446, %r2590, %r6444;
+ mul.wide.s32 %rd2121, %r6447, 4;
+ add.s64 %rd2122, %rd3, %rd2121;
+ ld.global.f32 %f5403, [%rd2122];
+
+$L__BB0_1415:
+ add.s32 %r1881, %r12, 15;
+ setp.gt.s32 %p1203, %r1881, 14;
+ mad.lo.s32 %r1882, %r2586, 3, %r14;
+ @%p1203 bra $L__BB0_1420;
+
+ shl.b32 %r1883, %r12, 5;
+ neg.s32 %r6448, %r1883;
+ setp.ge.s32 %p1204, %r11, %r6448;
+ @%p1204 bra $L__BB0_1418;
+
+ add.s32 %r6449, %r1883, %r1;
+ mul.hi.s32 %r6450, %r6449, -1840700269;
+ add.s32 %r6451, %r6450, %r6449;
+ shr.u32 %r6452, %r6451, 31;
+ shr.s32 %r6453, %r6451, 2;
+ add.s32 %r6454, %r6453, %r6452;
+ mad.lo.s32 %r6455, %r6454, %r2589, %r1882;
+ mul.lo.s32 %r6456, %r6454, 7;
+ sub.s32 %r6457, %r6449, %r6456;
+ mad.lo.s32 %r6458, %r6457, %r2590, %r6455;
+ mul.wide.s32 %rd2123, %r6458, 4;
+ add.s64 %rd2124, %rd3, %rd2123;
+ ld.global.f32 %f5402, [%rd2124];
+
+$L__BB0_1418:
+ mov.u32 %r6459, -32;
+ sub.s32 %r6460, %r6459, %r1883;
+ setp.ge.s32 %p1205, %r11, %r6460;
+ @%p1205 bra $L__BB0_1420;
+
+ add.s32 %r6461, %r1883, %r1;
+ add.s32 %r6462, %r6461, 32;
+ mul.hi.s32 %r6463, %r6462, -1840700269;
+ add.s32 %r6464, %r6463, %r6462;
+ shr.u32 %r6465, %r6464, 31;
+ shr.s32 %r6466, %r6464, 2;
+ add.s32 %r6467, %r6466, %r6465;
+ mad.lo.s32 %r6468, %r6467, %r2589, %r1882;
+ mul.lo.s32 %r6469, %r6467, 7;
+ sub.s32 %r6470, %r6462, %r6469;
+ mad.lo.s32 %r6471, %r6470, %r2590, %r6468;
+ mul.wide.s32 %rd2125, %r6471, 4;
+ add.s64 %rd2126, %rd3, %rd2125;
+ ld.global.f32 %f5401, [%rd2126];
+
+$L__BB0_1420:
+ mov.u32 %r7783, %ctaid.x;
+ mul.lo.s32 %r1884, %r2578, %r7783;
+ shl.b32 %r6473, %r2576, 2;
+ add.s32 %r1885, %r6473, %r1884;
+ @%p1194 bra $L__BB0_1425;
+
+ shl.b32 %r1886, %r12, 5;
+ neg.s32 %r6474, %r1886;
+ setp.ge.s32 %p1207, %r11, %r6474;
+ @%p1207 bra $L__BB0_1423;
+
+ add.s32 %r6475, %r1886, %r1;
+ mul.hi.s32 %r6476, %r6475, 954437177;
+ shr.u32 %r6477, %r6476, 31;
+ shr.s32 %r6478, %r6476, 1;
+ add.s32 %r6479, %r6478, %r6477;
+ mad.lo.s32 %r6480, %r6479, %r2579, %r1885;
+ mul.lo.s32 %r6481, %r6479, 9;
+ sub.s32 %r6482, %r6475, %r6481;
+ mad.lo.s32 %r6483, %r6482, %r2580, %r6480;
+ mul.wide.s32 %rd2127, %r6483, 4;
+ add.s64 %rd2128, %rd2, %rd2127;
+ ld.global.f32 %f5416, [%rd2128];
+
+$L__BB0_1423:
+ mov.u32 %r6484, -32;
+ sub.s32 %r6485, %r6484, %r1886;
+ setp.ge.s32 %p1208, %r11, %r6485;
+ @%p1208 bra $L__BB0_1425;
+
+ add.s32 %r6486, %r1886, %r1;
+ add.s32 %r6487, %r6486, 32;
+ mul.hi.s32 %r6488, %r6487, 954437177;
+ shr.u32 %r6489, %r6488, 31;
+ shr.s32 %r6490, %r6488, 1;
+ add.s32 %r6491, %r6490, %r6489;
+ mad.lo.s32 %r6492, %r6491, %r2579, %r1885;
+ mul.lo.s32 %r6493, %r6491, 9;
+ sub.s32 %r6494, %r6487, %r6493;
+ mad.lo.s32 %r6495, %r6494, %r2580, %r6492;
+ mul.wide.s32 %rd2129, %r6495, 4;
+ add.s64 %rd2130, %rd2, %rd2129;
+ ld.global.f32 %f5415, [%rd2130];
+
+$L__BB0_1425:
+ add.s32 %r1887, %r1885, %r2577;
+ @%p1197 bra $L__BB0_1430;
+
+ shl.b32 %r1888, %r12, 5;
+ neg.s32 %r6496, %r1888;
+ setp.ge.s32 %p1210, %r11, %r6496;
+ @%p1210 bra $L__BB0_1428;
+
+ add.s32 %r6497, %r1888, %r1;
+ mul.hi.s32 %r6498, %r6497, 954437177;
+ shr.u32 %r6499, %r6498, 31;
+ shr.s32 %r6500, %r6498, 1;
+ add.s32 %r6501, %r6500, %r6499;
+ mad.lo.s32 %r6502, %r6501, %r2579, %r1887;
+ mul.lo.s32 %r6503, %r6501, 9;
+ sub.s32 %r6504, %r6497, %r6503;
+ mad.lo.s32 %r6505, %r6504, %r2580, %r6502;
+ mul.wide.s32 %rd2131, %r6505, 4;
+ add.s64 %rd2132, %rd2, %rd2131;
+ ld.global.f32 %f5414, [%rd2132];
+
+$L__BB0_1428:
+ mov.u32 %r6506, -32;
+ sub.s32 %r6507, %r6506, %r1888;
+ setp.ge.s32 %p1211, %r11, %r6507;
+ @%p1211 bra $L__BB0_1430;
+
+ add.s32 %r6508, %r1888, %r1;
+ add.s32 %r6509, %r6508, 32;
+ mul.hi.s32 %r6510, %r6509, 954437177;
+ shr.u32 %r6511, %r6510, 31;
+ shr.s32 %r6512, %r6510, 1;
+ add.s32 %r6513, %r6512, %r6511;
+ mad.lo.s32 %r6514, %r6513, %r2579, %r1887;
+ mul.lo.s32 %r6515, %r6513, 9;
+ sub.s32 %r6516, %r6509, %r6515;
+ mad.lo.s32 %r6517, %r6516, %r2580, %r6514;
+ mul.wide.s32 %rd2133, %r6517, 4;
+ add.s64 %rd2134, %rd2, %rd2133;
+ ld.global.f32 %f5413, [%rd2134];
+
+$L__BB0_1430:
+ add.s32 %r1889, %r1887, %r2577;
+ @%p1200 bra $L__BB0_1435;
+
+ shl.b32 %r1890, %r12, 5;
+ neg.s32 %r6518, %r1890;
+ setp.ge.s32 %p1213, %r11, %r6518;
+ @%p1213 bra $L__BB0_1433;
+
+ add.s32 %r6519, %r1890, %r1;
+ mul.hi.s32 %r6520, %r6519, 954437177;
+ shr.u32 %r6521, %r6520, 31;
+ shr.s32 %r6522, %r6520, 1;
+ add.s32 %r6523, %r6522, %r6521;
+ mad.lo.s32 %r6524, %r6523, %r2579, %r1889;
+ mul.lo.s32 %r6525, %r6523, 9;
+ sub.s32 %r6526, %r6519, %r6525;
+ mad.lo.s32 %r6527, %r6526, %r2580, %r6524;
+ mul.wide.s32 %rd2135, %r6527, 4;
+ add.s64 %rd2136, %rd2, %rd2135;
+ ld.global.f32 %f5412, [%rd2136];
+
+$L__BB0_1433:
+ mov.u32 %r6528, -32;
+ sub.s32 %r6529, %r6528, %r1890;
+ setp.ge.s32 %p1214, %r11, %r6529;
+ @%p1214 bra $L__BB0_1435;
+
+ add.s32 %r6530, %r1890, %r1;
+ add.s32 %r6531, %r6530, 32;
+ mul.hi.s32 %r6532, %r6531, 954437177;
+ shr.u32 %r6533, %r6532, 31;
+ shr.s32 %r6534, %r6532, 1;
+ add.s32 %r6535, %r6534, %r6533;
+ mad.lo.s32 %r6536, %r6535, %r2579, %r1889;
+ mul.lo.s32 %r6537, %r6535, 9;
+ sub.s32 %r6538, %r6531, %r6537;
+ mad.lo.s32 %r6539, %r6538, %r2580, %r6536;
+ mul.wide.s32 %rd2137, %r6539, 4;
+ add.s64 %rd2138, %rd2, %rd2137;
+ ld.global.f32 %f5411, [%rd2138];
+
+$L__BB0_1435:
+ mad.lo.s32 %r1891, %r2576, 5, %r1884;
+ @%p1203 bra $L__BB0_1440;
+
+ shl.b32 %r1892, %r12, 5;
+ neg.s32 %r6540, %r1892;
+ setp.ge.s32 %p1216, %r11, %r6540;
+ @%p1216 bra $L__BB0_1438;
+
+ add.s32 %r6541, %r1892, %r1;
+ mul.hi.s32 %r6542, %r6541, 954437177;
+ shr.u32 %r6543, %r6542, 31;
+ shr.s32 %r6544, %r6542, 1;
+ add.s32 %r6545, %r6544, %r6543;
+ mad.lo.s32 %r6546, %r6545, %r2579, %r1891;
+ mul.lo.s32 %r6547, %r6545, 9;
+ sub.s32 %r6548, %r6541, %r6547;
+ mad.lo.s32 %r6549, %r6548, %r2580, %r6546;
+ mul.wide.s32 %rd2139, %r6549, 4;
+ add.s64 %rd2140, %rd2, %rd2139;
+ ld.global.f32 %f5410, [%rd2140];
+
+$L__BB0_1438:
+ mov.u32 %r6550, -32;
+ sub.s32 %r6551, %r6550, %r1892;
+ setp.ge.s32 %p1217, %r11, %r6551;
+ @%p1217 bra $L__BB0_1440;
+
+ add.s32 %r6552, %r1892, %r1;
+ add.s32 %r6553, %r6552, 32;
+ mul.hi.s32 %r6554, %r6553, 954437177;
+ shr.u32 %r6555, %r6554, 31;
+ shr.s32 %r6556, %r6554, 1;
+ add.s32 %r6557, %r6556, %r6555;
+ mad.lo.s32 %r6558, %r6557, %r2579, %r1891;
+ mul.lo.s32 %r6559, %r6557, 9;
+ sub.s32 %r6560, %r6553, %r6559;
+ mad.lo.s32 %r6561, %r6560, %r2580, %r6558;
+ mul.wide.s32 %rd2141, %r6561, 4;
+ add.s64 %rd2142, %rd2, %rd2141;
+ ld.global.f32 %f5409, [%rd2142];
+
+$L__BB0_1440:
+ @%p1194 bra $L__BB0_1469;
+
+ shl.b32 %r6562, %r12, 5;
+ neg.s32 %r1893, %r6562;
+ setp.ge.s32 %p1219, %r11, %r1893;
+ @%p1219 bra $L__BB0_1454;
+
+ mul.f32 %f4380, %f5416, 0f3F22F983;
+ cvt.rni.s32.f32 %r8354, %f4380;
+ cvt.rn.f32.s32 %f4381, %r8354;
+ mov.f32 %f4382, 0fBFC90FDA;
+ fma.rn.f32 %f4383, %f4381, %f4382, %f5416;
+ mov.f32 %f4384, 0fB3A22168;
+ fma.rn.f32 %f4385, %f4381, %f4384, %f4383;
+ mov.f32 %f4386, 0fA7C234C5;
+ fma.rn.f32 %f5843, %f4381, %f4386, %f4385;
+ abs.f32 %f1691, %f5416;
+ setp.ltu.f32 %p1220, %f1691, 0f47CE4780;
+ @%p1220 bra $L__BB0_1450;
+
+ setp.eq.f32 %p1221, %f1691, 0f7F800000;
+ @%p1221 bra $L__BB0_1449;
+ bra.uni $L__BB0_1444;
+
+$L__BB0_1449:
+ mov.f32 %f4389, 0f00000000;
+ mul.rn.f32 %f5843, %f5416, %f4389;
+ mov.u32 %r8354, 0;
+ bra.uni $L__BB0_1450;
+
+$L__BB0_1674:
+ mov.b32 %r2198, %f1965;
+ shr.u32 %r7133, %r2198, 23;
+ and.b32 %r7134, %r7133, 255;
+ add.s32 %r2199, %r7134, -128;
+ shl.b32 %r7135, %r2198, 8;
+ or.b32 %r2200, %r7135, -2147483648;
+ shr.u32 %r2201, %r2199, 5;
+ mov.u64 %rd2747, 0;
+ mov.u32 %r8415, 0;
+ mov.u64 %rd2745, __cudart_i2opi_f;
+ mov.u64 %rd2746, %rd1;
+
+$L__BB0_1675:
.pragma "nounroll";
- ld.global.nc.u32 %r6420, [%rd2691];
- mad.wide.u32 %rd2067, %r6420, %r1856, %rd2693;
- shr.u64 %rd2693, %rd2067, 32;
- st.local.u32 [%rd2692], %rd2067;
- add.s64 %rd2692, %rd2692, 4;
- add.s64 %rd2691, %rd2691, 4;
- add.s32 %r8598, %r8598, 1;
- setp.ne.s32 %p1184, %r8598, 6;
- @%p1184 bra $L__BB0_1392;
-
- st.local.u32 [%rd5], %rd2693;
- mov.u32 %r6421, 4;
- sub.s32 %r1860, %r6421, %r1857;
- mov.u32 %r6422, 6;
- sub.s32 %r6423, %r6422, %r1857;
- mul.wide.s32 %rd2068, %r6423, 4;
- add.s64 %rd2069, %rd1, %rd2068;
- ld.local.u32 %r8599, [%rd2069];
- ld.local.u32 %r8600, [%rd2069+-4];
- and.b32 %r1863, %r1855, 31;
- setp.eq.s32 %p1185, %r1863, 0;
- @%p1185 bra $L__BB0_1395;
-
- mov.u32 %r6424, 32;
- sub.s32 %r6425, %r6424, %r1863;
- shr.u32 %r6426, %r8600, %r6425;
- shl.b32 %r6427, %r8599, %r1863;
- add.s32 %r8599, %r6426, %r6427;
- mul.wide.s32 %rd2070, %r1860, 4;
- add.s64 %rd2071, %rd1, %rd2070;
- ld.local.u32 %r6428, [%rd2071];
- shr.u32 %r6429, %r6428, %r6425;
- shl.b32 %r6430, %r8600, %r1863;
- add.s32 %r8600, %r6429, %r6430;
-
-$L__BB0_1395:
- and.b32 %r6431, %r1854, -2147483648;
- shr.u32 %r6432, %r8600, 30;
- shl.b32 %r6433, %r8599, 2;
- or.b32 %r6434, %r6432, %r6433;
- shr.u32 %r6435, %r6434, 31;
- shr.u32 %r6436, %r8599, 30;
- add.s32 %r6437, %r6435, %r6436;
- neg.s32 %r6438, %r6437;
- setp.eq.s32 %p1186, %r6431, 0;
- selp.b32 %r8601, %r6437, %r6438, %p1186;
- setp.ne.s32 %p1187, %r6435, 0;
- xor.b32 %r6439, %r6431, -2147483648;
- selp.b32 %r6440, %r6439, %r6431, %p1187;
- selp.b32 %r6441, -1, 0, %p1187;
- xor.b32 %r6442, %r6434, %r6441;
- shl.b32 %r6443, %r8600, 2;
- xor.b32 %r6444, %r6443, %r6441;
- cvt.u64.u32 %rd2072, %r6442;
- cvt.u64.u32 %rd2073, %r6444;
- bfi.b64 %rd2074, %rd2072, %rd2073, 32, 32;
- cvt.rn.f64.s64 %fd187, %rd2074;
- mul.f64 %fd188, %fd187, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4267, %fd188;
- setp.eq.s32 %p1188, %r6440, 0;
- neg.f32 %f4268, %f4267;
- selp.f32 %f5698, %f4267, %f4268, %p1188;
-
-$L__BB0_1397:
- add.s32 %r1870, %r8601, 1;
- and.b32 %r1871, %r1870, 1;
- setp.eq.s32 %p1189, %r1871, 0;
- selp.f32 %f1565, %f5698, 0f3F800000, %p1189;
- mul.rn.f32 %f1566, %f5698, %f5698;
- mov.f32 %f5699, 0fB94D4153;
- @%p1189 bra $L__BB0_1399;
-
- mov.f32 %f4271, 0fBAB607ED;
- mov.f32 %f4272, 0f37CBAC00;
- fma.rn.f32 %f5699, %f4272, %f1566, %f4271;
-
-$L__BB0_1399:
- selp.f32 %f4273, 0f3C0885E4, 0f3D2AAABB, %p1189;
- fma.rn.f32 %f4274, %f5699, %f1566, %f4273;
- selp.f32 %f4275, 0fBE2AAAA8, 0fBEFFFFFF, %p1189;
- fma.rn.f32 %f4276, %f4274, %f1566, %f4275;
- mov.f32 %f4277, 0f00000000;
- fma.rn.f32 %f4278, %f1566, %f1565, %f4277;
- fma.rn.f32 %f5700, %f4276, %f4278, %f1565;
- and.b32 %r6446, %r1870, 2;
- setp.eq.s32 %p1191, %r6446, 0;
- @%p1191 bra $L__BB0_1401;
-
- mov.f32 %f4280, 0fBF800000;
- fma.rn.f32 %f5700, %f5700, %f4280, %f4277;
-
-$L__BB0_1401:
- add.f32 %f5708, %f5697, %f5700;
- mul.f32 %f4281, %f1397, 0f3F22F983;
- cvt.rni.s32.f32 %r8605, %f4281;
- cvt.rn.f32.s32 %f4282, %r8605;
- mov.f32 %f4283, 0fBFC90FDA;
- fma.rn.f32 %f4284, %f4282, %f4283, %f1397;
- mov.f32 %f4285, 0fB3A22168;
- fma.rn.f32 %f4286, %f4282, %f4285, %f4284;
- mov.f32 %f4287, 0fA7C234C5;
- fma.rn.f32 %f5701, %f4282, %f4287, %f4286;
- abs.f32 %f1574, %f1397;
- setp.ltu.f32 %p1192, %f1574, 0f47CE4780;
- @%p1192 bra $L__BB0_1409;
-
- setp.eq.f32 %p1193, %f1574, 0f7F800000;
- @%p1193 bra $L__BB0_1408;
- bra.uni $L__BB0_1403;
-
-$L__BB0_1408:
- mov.f32 %f4290, 0f00000000;
- mul.rn.f32 %f5701, %f1397, %f4290;
- mov.u32 %r8605, 0;
- bra.uni $L__BB0_1409;
-
-$L__BB0_1403:
- mov.b32 %r1873, %f1397;
- shr.u32 %r6448, %r1873, 23;
- and.b32 %r6449, %r6448, 255;
- add.s32 %r1874, %r6449, -128;
- shl.b32 %r6450, %r1873, 8;
- or.b32 %r1875, %r6450, -2147483648;
- shr.u32 %r1876, %r1874, 5;
- mov.u64 %rd2696, 0;
- mov.u32 %r8602, 0;
- mov.u64 %rd2694, __cudart_i2opi_f;
- mov.u64 %rd2695, %rd1;
-
-$L__BB0_1404:
+ ld.global.nc.u32 %r7136, [%rd2745];
+ mad.wide.u32 %rd2331, %r7136, %r2200, %rd2747;
+ shr.u64 %rd2747, %rd2331, 32;
+ st.local.u32 [%rd2746], %rd2331;
+ add.s64 %rd2746, %rd2746, 4;
+ add.s64 %rd2745, %rd2745, 4;
+ add.s32 %r8415, %r8415, 1;
+ setp.ne.s32 %p1421, %r8415, 6;
+ @%p1421 bra $L__BB0_1675;
+
+ st.local.u32 [%rd4], %rd2747;
+ mov.u32 %r7137, 4;
+ sub.s32 %r2204, %r7137, %r2201;
+ mov.u32 %r7138, 6;
+ sub.s32 %r7139, %r7138, %r2201;
+ mul.wide.s32 %rd2332, %r7139, 4;
+ add.s64 %rd2333, %rd1, %rd2332;
+ ld.local.u32 %r8416, [%rd2333];
+ ld.local.u32 %r8417, [%rd2333+-4];
+ and.b32 %r2207, %r2199, 31;
+ setp.eq.s32 %p1422, %r2207, 0;
+ @%p1422 bra $L__BB0_1678;
+
+ mov.u32 %r7140, 32;
+ sub.s32 %r7141, %r7140, %r2207;
+ shr.u32 %r7142, %r8417, %r7141;
+ shl.b32 %r7143, %r8416, %r2207;
+ add.s32 %r8416, %r7142, %r7143;
+ mul.wide.s32 %rd2334, %r2204, 4;
+ add.s64 %rd2335, %rd1, %rd2334;
+ ld.local.u32 %r7144, [%rd2335];
+ shr.u32 %r7145, %r7144, %r7141;
+ shl.b32 %r7146, %r8417, %r2207;
+ add.s32 %r8417, %r7145, %r7146;
+
+$L__BB0_1678:
+ and.b32 %r7147, %r2198, -2147483648;
+ shr.u32 %r7148, %r8417, 30;
+ shl.b32 %r7149, %r8416, 2;
+ or.b32 %r7150, %r7148, %r7149;
+ shr.u32 %r7151, %r7150, 31;
+ shr.u32 %r7152, %r8416, 30;
+ add.s32 %r7153, %r7151, %r7152;
+ neg.s32 %r7154, %r7153;
+ setp.eq.s32 %p1423, %r7147, 0;
+ selp.b32 %r8418, %r7153, %r7154, %p1423;
+ setp.ne.s32 %p1424, %r7151, 0;
+ xor.b32 %r7155, %r7147, -2147483648;
+ selp.b32 %r7156, %r7155, %r7147, %p1424;
+ selp.b32 %r7157, -1, 0, %p1424;
+ xor.b32 %r7158, %r7150, %r7157;
+ shl.b32 %r7159, %r8417, 2;
+ xor.b32 %r7160, %r7159, %r7157;
+ cvt.u64.u32 %rd2336, %r7158;
+ cvt.u64.u32 %rd2337, %r7160;
+ bfi.b64 %rd2338, %rd2336, %rd2337, 32, 32;
+ cvt.rn.f64.s64 %fd225, %rd2338;
+ mul.f64 %fd226, %fd225, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4738, %fd226;
+ setp.eq.s32 %p1425, %r7156, 0;
+ neg.f32 %f4739, %f4738;
+ selp.f32 %f5942, %f4738, %f4739, %p1425;
+
+$L__BB0_1680:
+ and.b32 %r2214, %r8418, 1;
+ setp.eq.s32 %p1426, %r2214, 0;
+ selp.f32 %f1978, %f5942, 0f3F800000, %p1426;
+ mul.rn.f32 %f1979, %f5942, %f5942;
+ mov.f32 %f5943, 0fB94D4153;
+ @%p1426 bra $L__BB0_1682;
+
+ mov.f32 %f4742, 0fBAB607ED;
+ mov.f32 %f4743, 0f37CBAC00;
+ fma.rn.f32 %f5943, %f4743, %f1979, %f4742;
+
+$L__BB0_1682:
+ selp.f32 %f4744, 0f3C0885E4, 0f3D2AAABB, %p1426;
+ fma.rn.f32 %f4745, %f5943, %f1979, %f4744;
+ selp.f32 %f4746, 0fBE2AAAA8, 0fBEFFFFFF, %p1426;
+ fma.rn.f32 %f4747, %f4745, %f1979, %f4746;
+ mov.f32 %f4748, 0f00000000;
+ fma.rn.f32 %f4749, %f1979, %f1978, %f4748;
+ fma.rn.f32 %f5944, %f4747, %f4749, %f1978;
+ and.b32 %r7162, %r8418, 2;
+ setp.eq.s32 %p1428, %r7162, 0;
+ @%p1428 bra $L__BB0_1684;
+
+ mov.f32 %f4751, 0fBF800000;
+ fma.rn.f32 %f5944, %f5944, %f4751, %f4748;
+
+$L__BB0_1684:
+ mul.f32 %f4752, %f1957, 0f3F22F983;
+ cvt.rni.s32.f32 %r8422, %f4752;
+ cvt.rn.f32.s32 %f4753, %r8422;
+ mov.f32 %f4754, 0fBFC90FDA;
+ fma.rn.f32 %f4755, %f4753, %f4754, %f1957;
+ mov.f32 %f4756, 0fB3A22168;
+ fma.rn.f32 %f4757, %f4753, %f4756, %f4755;
+ mov.f32 %f4758, 0fA7C234C5;
+ fma.rn.f32 %f5945, %f4753, %f4758, %f4757;
+ abs.f32 %f1986, %f1957;
+ setp.ltu.f32 %p1429, %f1986, 0f47CE4780;
+ @%p1429 bra $L__BB0_1692;
+
+ setp.eq.f32 %p1430, %f1986, 0f7F800000;
+ @%p1430 bra $L__BB0_1691;
+ bra.uni $L__BB0_1686;
+
+$L__BB0_1691:
+ mov.f32 %f4761, 0f00000000;
+ mul.rn.f32 %f5945, %f1957, %f4761;
+ mov.u32 %r8422, 0;
+ bra.uni $L__BB0_1692;
+
+$L__BB0_1686:
+ mov.b32 %r2216, %f1957;
+ shr.u32 %r7164, %r2216, 23;
+ and.b32 %r7165, %r7164, 255;
+ add.s32 %r2217, %r7165, -128;
+ shl.b32 %r7166, %r2216, 8;
+ or.b32 %r2218, %r7166, -2147483648;
+ shr.u32 %r2219, %r2217, 5;
+ mov.u64 %rd2750, 0;
+ mov.u32 %r8419, 0;
+ mov.u64 %rd2748, __cudart_i2opi_f;
+ mov.u64 %rd2749, %rd1;
+
+$L__BB0_1687:
.pragma "nounroll";
- ld.global.nc.u32 %r6451, [%rd2694];
- mad.wide.u32 %rd2077, %r6451, %r1875, %rd2696;
- shr.u64 %rd2696, %rd2077, 32;
- st.local.u32 [%rd2695], %rd2077;
- add.s64 %rd2695, %rd2695, 4;
- add.s64 %rd2694, %rd2694, 4;
- add.s32 %r8602, %r8602, 1;
- setp.ne.s32 %p1194, %r8602, 6;
- @%p1194 bra $L__BB0_1404;
-
- st.local.u32 [%rd5], %rd2696;
- mov.u32 %r6452, 4;
- sub.s32 %r1879, %r6452, %r1876;
- mov.u32 %r6453, 6;
- sub.s32 %r6454, %r6453, %r1876;
- mul.wide.s32 %rd2078, %r6454, 4;
- add.s64 %rd2079, %rd1, %rd2078;
- ld.local.u32 %r8603, [%rd2079];
- ld.local.u32 %r8604, [%rd2079+-4];
- and.b32 %r1882, %r1874, 31;
- setp.eq.s32 %p1195, %r1882, 0;
- @%p1195 bra $L__BB0_1407;
-
- mov.u32 %r6455, 32;
- sub.s32 %r6456, %r6455, %r1882;
- shr.u32 %r6457, %r8604, %r6456;
- shl.b32 %r6458, %r8603, %r1882;
- add.s32 %r8603, %r6457, %r6458;
- mul.wide.s32 %rd2080, %r1879, 4;
- add.s64 %rd2081, %rd1, %rd2080;
- ld.local.u32 %r6459, [%rd2081];
- shr.u32 %r6460, %r6459, %r6456;
- shl.b32 %r6461, %r8604, %r1882;
- add.s32 %r8604, %r6460, %r6461;
-
-$L__BB0_1407:
- and.b32 %r6462, %r1873, -2147483648;
- shr.u32 %r6463, %r8604, 30;
- shl.b32 %r6464, %r8603, 2;
- or.b32 %r6465, %r6463, %r6464;
- shr.u32 %r6466, %r6465, 31;
- shr.u32 %r6467, %r8603, 30;
- add.s32 %r6468, %r6466, %r6467;
- neg.s32 %r6469, %r6468;
- setp.eq.s32 %p1196, %r6462, 0;
- selp.b32 %r8605, %r6468, %r6469, %p1196;
- setp.ne.s32 %p1197, %r6466, 0;
- xor.b32 %r6470, %r6462, -2147483648;
- selp.b32 %r6471, %r6470, %r6462, %p1197;
- selp.b32 %r6472, -1, 0, %p1197;
- xor.b32 %r6473, %r6465, %r6472;
- shl.b32 %r6474, %r8604, 2;
- xor.b32 %r6475, %r6474, %r6472;
- cvt.u64.u32 %rd2082, %r6473;
- cvt.u64.u32 %rd2083, %r6475;
- bfi.b64 %rd2084, %rd2082, %rd2083, 32, 32;
- cvt.rn.f64.s64 %fd189, %rd2084;
- mul.f64 %fd190, %fd189, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4288, %fd190;
- setp.eq.s32 %p1198, %r6471, 0;
- neg.f32 %f4289, %f4288;
- selp.f32 %f5701, %f4288, %f4289, %p1198;
-
-$L__BB0_1409:
- and.b32 %r1889, %r8605, 1;
- setp.eq.s32 %p1199, %r1889, 0;
- selp.f32 %f1578, %f5701, 0f3F800000, %p1199;
- mul.rn.f32 %f1579, %f5701, %f5701;
- mov.f32 %f5702, 0fB94D4153;
- @%p1199 bra $L__BB0_1411;
-
- mov.f32 %f4292, 0fBAB607ED;
- mov.f32 %f4293, 0f37CBAC00;
- fma.rn.f32 %f5702, %f4293, %f1579, %f4292;
-
-$L__BB0_1411:
- selp.f32 %f4294, 0f3C0885E4, 0f3D2AAABB, %p1199;
- fma.rn.f32 %f4295, %f5702, %f1579, %f4294;
- selp.f32 %f4296, 0fBE2AAAA8, 0fBEFFFFFF, %p1199;
- fma.rn.f32 %f4297, %f4295, %f1579, %f4296;
- mov.f32 %f4298, 0f00000000;
- fma.rn.f32 %f4299, %f1579, %f1578, %f4298;
- fma.rn.f32 %f5703, %f4297, %f4299, %f1578;
- and.b32 %r6477, %r8605, 2;
- setp.eq.s32 %p1201, %r6477, 0;
- @%p1201 bra $L__BB0_1413;
-
- mov.f32 %f4301, 0fBF800000;
- fma.rn.f32 %f5703, %f5703, %f4301, %f4298;
-
-$L__BB0_1413:
- mul.f32 %f4302, %f1389, 0f3F22F983;
- cvt.rni.s32.f32 %r8609, %f4302;
- cvt.rn.f32.s32 %f4303, %r8609;
- mov.f32 %f4304, 0fBFC90FDA;
- fma.rn.f32 %f4305, %f4303, %f4304, %f1389;
- mov.f32 %f4306, 0fB3A22168;
- fma.rn.f32 %f4307, %f4303, %f4306, %f4305;
- mov.f32 %f4308, 0fA7C234C5;
- fma.rn.f32 %f5704, %f4303, %f4308, %f4307;
- abs.f32 %f1586, %f1389;
- setp.ltu.f32 %p1202, %f1586, 0f47CE4780;
- @%p1202 bra $L__BB0_1421;
-
- setp.eq.f32 %p1203, %f1586, 0f7F800000;
- @%p1203 bra $L__BB0_1420;
- bra.uni $L__BB0_1415;
-
-$L__BB0_1420:
- mov.f32 %f4311, 0f00000000;
- mul.rn.f32 %f5704, %f1389, %f4311;
- mov.u32 %r8609, 0;
- bra.uni $L__BB0_1421;
-
-$L__BB0_1415:
- mov.b32 %r1891, %f1389;
- shr.u32 %r6479, %r1891, 23;
- and.b32 %r6480, %r6479, 255;
- add.s32 %r1892, %r6480, -128;
- shl.b32 %r6481, %r1891, 8;
- or.b32 %r1893, %r6481, -2147483648;
- shr.u32 %r1894, %r1892, 5;
+ ld.global.nc.u32 %r7167, [%rd2748];
+ mad.wide.u32 %rd2341, %r7167, %r2218, %rd2750;
+ shr.u64 %rd2750, %rd2341, 32;
+ st.local.u32 [%rd2749], %rd2341;
+ add.s64 %rd2749, %rd2749, 4;
+ add.s64 %rd2748, %rd2748, 4;
+ add.s32 %r8419, %r8419, 1;
+ setp.ne.s32 %p1431, %r8419, 6;
+ @%p1431 bra $L__BB0_1687;
+
+ st.local.u32 [%rd4], %rd2750;
+ mov.u32 %r7168, 4;
+ sub.s32 %r2222, %r7168, %r2219;
+ mov.u32 %r7169, 6;
+ sub.s32 %r7170, %r7169, %r2219;
+ mul.wide.s32 %rd2342, %r7170, 4;
+ add.s64 %rd2343, %rd1, %rd2342;
+ ld.local.u32 %r8420, [%rd2343];
+ ld.local.u32 %r8421, [%rd2343+-4];
+ and.b32 %r2225, %r2217, 31;
+ setp.eq.s32 %p1432, %r2225, 0;
+ @%p1432 bra $L__BB0_1690;
+
+ mov.u32 %r7171, 32;
+ sub.s32 %r7172, %r7171, %r2225;
+ shr.u32 %r7173, %r8421, %r7172;
+ shl.b32 %r7174, %r8420, %r2225;
+ add.s32 %r8420, %r7173, %r7174;
+ mul.wide.s32 %rd2344, %r2222, 4;
+ add.s64 %rd2345, %rd1, %rd2344;
+ ld.local.u32 %r7175, [%rd2345];
+ shr.u32 %r7176, %r7175, %r7172;
+ shl.b32 %r7177, %r8421, %r2225;
+ add.s32 %r8421, %r7176, %r7177;
+
+$L__BB0_1690:
+ and.b32 %r7178, %r2216, -2147483648;
+ shr.u32 %r7179, %r8421, 30;
+ shl.b32 %r7180, %r8420, 2;
+ or.b32 %r7181, %r7179, %r7180;
+ shr.u32 %r7182, %r7181, 31;
+ shr.u32 %r7183, %r8420, 30;
+ add.s32 %r7184, %r7182, %r7183;
+ neg.s32 %r7185, %r7184;
+ setp.eq.s32 %p1433, %r7178, 0;
+ selp.b32 %r8422, %r7184, %r7185, %p1433;
+ setp.ne.s32 %p1434, %r7182, 0;
+ xor.b32 %r7186, %r7178, -2147483648;
+ selp.b32 %r7187, %r7186, %r7178, %p1434;
+ selp.b32 %r7188, -1, 0, %p1434;
+ xor.b32 %r7189, %r7181, %r7188;
+ shl.b32 %r7190, %r8421, 2;
+ xor.b32 %r7191, %r7190, %r7188;
+ cvt.u64.u32 %rd2346, %r7189;
+ cvt.u64.u32 %rd2347, %r7191;
+ bfi.b64 %rd2348, %rd2346, %rd2347, 32, 32;
+ cvt.rn.f64.s64 %fd227, %rd2348;
+ mul.f64 %fd228, %fd227, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4759, %fd228;
+ setp.eq.s32 %p1435, %r7187, 0;
+ neg.f32 %f4760, %f4759;
+ selp.f32 %f5945, %f4759, %f4760, %p1435;
+
+$L__BB0_1692:
+ add.s32 %r2232, %r8422, 1;
+ and.b32 %r2233, %r2232, 1;
+ setp.eq.s32 %p1436, %r2233, 0;
+ selp.f32 %f1990, %f5945, 0f3F800000, %p1436;
+ mul.rn.f32 %f1991, %f5945, %f5945;
+ mov.f32 %f5946, 0fB94D4153;
+ @%p1436 bra $L__BB0_1694;
+
+ mov.f32 %f4763, 0fBAB607ED;
+ mov.f32 %f4764, 0f37CBAC00;
+ fma.rn.f32 %f5946, %f4764, %f1991, %f4763;
+
+$L__BB0_1694:
+ selp.f32 %f4765, 0f3C0885E4, 0f3D2AAABB, %p1436;
+ fma.rn.f32 %f4766, %f5946, %f1991, %f4765;
+ selp.f32 %f4767, 0fBE2AAAA8, 0fBEFFFFFF, %p1436;
+ fma.rn.f32 %f4768, %f4766, %f1991, %f4767;
+ mov.f32 %f4769, 0f00000000;
+ fma.rn.f32 %f4770, %f1991, %f1990, %f4769;
+ fma.rn.f32 %f5947, %f4768, %f4770, %f1990;
+ and.b32 %r7193, %r2232, 2;
+ setp.eq.s32 %p1438, %r7193, 0;
+ @%p1438 bra $L__BB0_1696;
+
+ mov.f32 %f4772, 0fBF800000;
+ fma.rn.f32 %f5947, %f5947, %f4772, %f4769;
+
+$L__BB0_1696:
+ add.f32 %f5997, %f5944, %f5947;
+ mul.f32 %f4773, %f1966, 0f3F22F983;
+ cvt.rni.s32.f32 %r8426, %f4773;
+ cvt.rn.f32.s32 %f4774, %r8426;
+ mov.f32 %f4775, 0fBFC90FDA;
+ fma.rn.f32 %f4776, %f4774, %f4775, %f1966;
+ mov.f32 %f4777, 0fB3A22168;
+ fma.rn.f32 %f4778, %f4774, %f4777, %f4776;
+ mov.f32 %f4779, 0fA7C234C5;
+ fma.rn.f32 %f5948, %f4774, %f4779, %f4778;
+ abs.f32 %f1999, %f1966;
+ setp.ltu.f32 %p1439, %f1999, 0f47CE4780;
+ @%p1439 bra $L__BB0_1704;
+
+ setp.eq.f32 %p1440, %f1999, 0f7F800000;
+ @%p1440 bra $L__BB0_1703;
+ bra.uni $L__BB0_1698;
+
+$L__BB0_1703:
+ mov.f32 %f4782, 0f00000000;
+ mul.rn.f32 %f5948, %f1966, %f4782;
+ mov.u32 %r8426, 0;
+ bra.uni $L__BB0_1704;
+
+$L__BB0_1698:
+ mov.b32 %r2235, %f1966;
+ shr.u32 %r7195, %r2235, 23;
+ and.b32 %r7196, %r7195, 255;
+ add.s32 %r2236, %r7196, -128;
+ shl.b32 %r7197, %r2235, 8;
+ or.b32 %r2237, %r7197, -2147483648;
+ shr.u32 %r2238, %r2236, 5;
+ mov.u64 %rd2753, 0;
+ mov.u32 %r8423, 0;
+ mov.u64 %rd2751, __cudart_i2opi_f;
+ mov.u64 %rd2752, %rd1;
+
+$L__BB0_1699:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7198, [%rd2751];
+ mad.wide.u32 %rd2351, %r7198, %r2237, %rd2753;
+ shr.u64 %rd2753, %rd2351, 32;
+ st.local.u32 [%rd2752], %rd2351;
+ add.s64 %rd2752, %rd2752, 4;
+ add.s64 %rd2751, %rd2751, 4;
+ add.s32 %r8423, %r8423, 1;
+ setp.ne.s32 %p1441, %r8423, 6;
+ @%p1441 bra $L__BB0_1699;
+
+ st.local.u32 [%rd4], %rd2753;
+ mov.u32 %r7199, 4;
+ sub.s32 %r2241, %r7199, %r2238;
+ mov.u32 %r7200, 6;
+ sub.s32 %r7201, %r7200, %r2238;
+ mul.wide.s32 %rd2352, %r7201, 4;
+ add.s64 %rd2353, %rd1, %rd2352;
+ ld.local.u32 %r8424, [%rd2353];
+ ld.local.u32 %r8425, [%rd2353+-4];
+ and.b32 %r2244, %r2236, 31;
+ setp.eq.s32 %p1442, %r2244, 0;
+ @%p1442 bra $L__BB0_1702;
+
+ mov.u32 %r7202, 32;
+ sub.s32 %r7203, %r7202, %r2244;
+ shr.u32 %r7204, %r8425, %r7203;
+ shl.b32 %r7205, %r8424, %r2244;
+ add.s32 %r8424, %r7204, %r7205;
+ mul.wide.s32 %rd2354, %r2241, 4;
+ add.s64 %rd2355, %rd1, %rd2354;
+ ld.local.u32 %r7206, [%rd2355];
+ shr.u32 %r7207, %r7206, %r7203;
+ shl.b32 %r7208, %r8425, %r2244;
+ add.s32 %r8425, %r7207, %r7208;
+
+$L__BB0_1702:
+ and.b32 %r7209, %r2235, -2147483648;
+ shr.u32 %r7210, %r8425, 30;
+ shl.b32 %r7211, %r8424, 2;
+ or.b32 %r7212, %r7210, %r7211;
+ shr.u32 %r7213, %r7212, 31;
+ shr.u32 %r7214, %r8424, 30;
+ add.s32 %r7215, %r7213, %r7214;
+ neg.s32 %r7216, %r7215;
+ setp.eq.s32 %p1443, %r7209, 0;
+ selp.b32 %r8426, %r7215, %r7216, %p1443;
+ setp.ne.s32 %p1444, %r7213, 0;
+ xor.b32 %r7217, %r7209, -2147483648;
+ selp.b32 %r7218, %r7217, %r7209, %p1444;
+ selp.b32 %r7219, -1, 0, %p1444;
+ xor.b32 %r7220, %r7212, %r7219;
+ shl.b32 %r7221, %r8425, 2;
+ xor.b32 %r7222, %r7221, %r7219;
+ cvt.u64.u32 %rd2356, %r7220;
+ cvt.u64.u32 %rd2357, %r7222;
+ bfi.b64 %rd2358, %rd2356, %rd2357, 32, 32;
+ cvt.rn.f64.s64 %fd229, %rd2358;
+ mul.f64 %fd230, %fd229, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4780, %fd230;
+ setp.eq.s32 %p1445, %r7218, 0;
+ neg.f32 %f4781, %f4780;
+ selp.f32 %f5948, %f4780, %f4781, %p1445;
+
+$L__BB0_1704:
+ and.b32 %r2251, %r8426, 1;
+ setp.eq.s32 %p1446, %r2251, 0;
+ selp.f32 %f2003, %f5948, 0f3F800000, %p1446;
+ mul.rn.f32 %f2004, %f5948, %f5948;
+ mov.f32 %f5949, 0fB94D4153;
+ @%p1446 bra $L__BB0_1706;
+
+ mov.f32 %f4784, 0fBAB607ED;
+ mov.f32 %f4785, 0f37CBAC00;
+ fma.rn.f32 %f5949, %f4785, %f2004, %f4784;
+
+$L__BB0_1706:
+ selp.f32 %f4786, 0f3C0885E4, 0f3D2AAABB, %p1446;
+ fma.rn.f32 %f4787, %f5949, %f2004, %f4786;
+ selp.f32 %f4788, 0fBE2AAAA8, 0fBEFFFFFF, %p1446;
+ fma.rn.f32 %f4789, %f4787, %f2004, %f4788;
+ mov.f32 %f4790, 0f00000000;
+ fma.rn.f32 %f4791, %f2004, %f2003, %f4790;
+ fma.rn.f32 %f5950, %f4789, %f4791, %f2003;
+ and.b32 %r7224, %r8426, 2;
+ setp.eq.s32 %p1448, %r7224, 0;
+ @%p1448 bra $L__BB0_1708;
+
+ mov.f32 %f4793, 0fBF800000;
+ fma.rn.f32 %f5950, %f5950, %f4793, %f4790;
+
+$L__BB0_1708:
+ mul.f32 %f4794, %f1958, 0f3F22F983;
+ cvt.rni.s32.f32 %r8430, %f4794;
+ cvt.rn.f32.s32 %f4795, %r8430;
+ mov.f32 %f4796, 0fBFC90FDA;
+ fma.rn.f32 %f4797, %f4795, %f4796, %f1958;
+ mov.f32 %f4798, 0fB3A22168;
+ fma.rn.f32 %f4799, %f4795, %f4798, %f4797;
+ mov.f32 %f4800, 0fA7C234C5;
+ fma.rn.f32 %f5951, %f4795, %f4800, %f4799;
+ abs.f32 %f2011, %f1958;
+ setp.ltu.f32 %p1449, %f2011, 0f47CE4780;
+ @%p1449 bra $L__BB0_1716;
+
+ setp.eq.f32 %p1450, %f2011, 0f7F800000;
+ @%p1450 bra $L__BB0_1715;
+ bra.uni $L__BB0_1710;
+
+$L__BB0_1715:
+ mov.f32 %f4803, 0f00000000;
+ mul.rn.f32 %f5951, %f1958, %f4803;
+ mov.u32 %r8430, 0;
+ bra.uni $L__BB0_1716;
+
+$L__BB0_1710:
+ mov.b32 %r2253, %f1958;
+ shr.u32 %r7226, %r2253, 23;
+ and.b32 %r7227, %r7226, 255;
+ add.s32 %r2254, %r7227, -128;
+ shl.b32 %r7228, %r2253, 8;
+ or.b32 %r2255, %r7228, -2147483648;
+ shr.u32 %r2256, %r2254, 5;
+ mov.u64 %rd2756, 0;
+ mov.u32 %r8427, 0;
+ mov.u64 %rd2754, __cudart_i2opi_f;
+ mov.u64 %rd2755, %rd1;
+
+$L__BB0_1711:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7229, [%rd2754];
+ mad.wide.u32 %rd2361, %r7229, %r2255, %rd2756;
+ shr.u64 %rd2756, %rd2361, 32;
+ st.local.u32 [%rd2755], %rd2361;
+ add.s64 %rd2755, %rd2755, 4;
+ add.s64 %rd2754, %rd2754, 4;
+ add.s32 %r8427, %r8427, 1;
+ setp.ne.s32 %p1451, %r8427, 6;
+ @%p1451 bra $L__BB0_1711;
+
+ st.local.u32 [%rd4], %rd2756;
+ mov.u32 %r7230, 4;
+ sub.s32 %r2259, %r7230, %r2256;
+ mov.u32 %r7231, 6;
+ sub.s32 %r7232, %r7231, %r2256;
+ mul.wide.s32 %rd2362, %r7232, 4;
+ add.s64 %rd2363, %rd1, %rd2362;
+ ld.local.u32 %r8428, [%rd2363];
+ ld.local.u32 %r8429, [%rd2363+-4];
+ and.b32 %r2262, %r2254, 31;
+ setp.eq.s32 %p1452, %r2262, 0;
+ @%p1452 bra $L__BB0_1714;
+
+ mov.u32 %r7233, 32;
+ sub.s32 %r7234, %r7233, %r2262;
+ shr.u32 %r7235, %r8429, %r7234;
+ shl.b32 %r7236, %r8428, %r2262;
+ add.s32 %r8428, %r7235, %r7236;
+ mul.wide.s32 %rd2364, %r2259, 4;
+ add.s64 %rd2365, %rd1, %rd2364;
+ ld.local.u32 %r7237, [%rd2365];
+ shr.u32 %r7238, %r7237, %r7234;
+ shl.b32 %r7239, %r8429, %r2262;
+ add.s32 %r8429, %r7238, %r7239;
+
+$L__BB0_1714:
+ and.b32 %r7240, %r2253, -2147483648;
+ shr.u32 %r7241, %r8429, 30;
+ shl.b32 %r7242, %r8428, 2;
+ or.b32 %r7243, %r7241, %r7242;
+ shr.u32 %r7244, %r7243, 31;
+ shr.u32 %r7245, %r8428, 30;
+ add.s32 %r7246, %r7244, %r7245;
+ neg.s32 %r7247, %r7246;
+ setp.eq.s32 %p1453, %r7240, 0;
+ selp.b32 %r8430, %r7246, %r7247, %p1453;
+ setp.ne.s32 %p1454, %r7244, 0;
+ xor.b32 %r7248, %r7240, -2147483648;
+ selp.b32 %r7249, %r7248, %r7240, %p1454;
+ selp.b32 %r7250, -1, 0, %p1454;
+ xor.b32 %r7251, %r7243, %r7250;
+ shl.b32 %r7252, %r8429, 2;
+ xor.b32 %r7253, %r7252, %r7250;
+ cvt.u64.u32 %rd2366, %r7251;
+ cvt.u64.u32 %rd2367, %r7253;
+ bfi.b64 %rd2368, %rd2366, %rd2367, 32, 32;
+ cvt.rn.f64.s64 %fd231, %rd2368;
+ mul.f64 %fd232, %fd231, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4801, %fd232;
+ setp.eq.s32 %p1455, %r7249, 0;
+ neg.f32 %f4802, %f4801;
+ selp.f32 %f5951, %f4801, %f4802, %p1455;
+
+$L__BB0_1716:
+ add.s32 %r2269, %r8430, 1;
+ and.b32 %r2270, %r2269, 1;
+ setp.eq.s32 %p1456, %r2270, 0;
+ selp.f32 %f2015, %f5951, 0f3F800000, %p1456;
+ mul.rn.f32 %f2016, %f5951, %f5951;
+ mov.f32 %f5952, 0fB94D4153;
+ @%p1456 bra $L__BB0_1718;
+
+ mov.f32 %f4805, 0fBAB607ED;
+ mov.f32 %f4806, 0f37CBAC00;
+ fma.rn.f32 %f5952, %f4806, %f2016, %f4805;
+
+$L__BB0_1718:
+ selp.f32 %f4807, 0f3C0885E4, 0f3D2AAABB, %p1456;
+ fma.rn.f32 %f4808, %f5952, %f2016, %f4807;
+ selp.f32 %f4809, 0fBE2AAAA8, 0fBEFFFFFF, %p1456;
+ fma.rn.f32 %f4810, %f4808, %f2016, %f4809;
+ mov.f32 %f4811, 0f00000000;
+ fma.rn.f32 %f4812, %f2016, %f2015, %f4811;
+ fma.rn.f32 %f5953, %f4810, %f4812, %f2015;
+ and.b32 %r7255, %r2269, 2;
+ setp.eq.s32 %p1458, %r7255, 0;
+ @%p1458 bra $L__BB0_1720;
+
+ mov.f32 %f4814, 0fBF800000;
+ fma.rn.f32 %f5953, %f5953, %f4814, %f4811;
+
+$L__BB0_1720:
+ add.f32 %f5996, %f5950, %f5953;
+ mul.f32 %f4815, %f1967, 0f3F22F983;
+ cvt.rni.s32.f32 %r8434, %f4815;
+ cvt.rn.f32.s32 %f4816, %r8434;
+ mov.f32 %f4817, 0fBFC90FDA;
+ fma.rn.f32 %f4818, %f4816, %f4817, %f1967;
+ mov.f32 %f4819, 0fB3A22168;
+ fma.rn.f32 %f4820, %f4816, %f4819, %f4818;
+ mov.f32 %f4821, 0fA7C234C5;
+ fma.rn.f32 %f5954, %f4816, %f4821, %f4820;
+ abs.f32 %f2024, %f1967;
+ setp.ltu.f32 %p1459, %f2024, 0f47CE4780;
+ @%p1459 bra $L__BB0_1728;
+
+ setp.eq.f32 %p1460, %f2024, 0f7F800000;
+ @%p1460 bra $L__BB0_1727;
+ bra.uni $L__BB0_1722;
+
+$L__BB0_1727:
+ mov.f32 %f4824, 0f00000000;
+ mul.rn.f32 %f5954, %f1967, %f4824;
+ mov.u32 %r8434, 0;
+ bra.uni $L__BB0_1728;
+
+$L__BB0_1722:
+ mov.b32 %r2272, %f1967;
+ shr.u32 %r7257, %r2272, 23;
+ and.b32 %r7258, %r7257, 255;
+ add.s32 %r2273, %r7258, -128;
+ shl.b32 %r7259, %r2272, 8;
+ or.b32 %r2274, %r7259, -2147483648;
+ shr.u32 %r2275, %r2273, 5;
+ mov.u64 %rd2759, 0;
+ mov.u32 %r8431, 0;
+ mov.u64 %rd2757, __cudart_i2opi_f;
+ mov.u64 %rd2758, %rd1;
+
+$L__BB0_1723:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7260, [%rd2757];
+ mad.wide.u32 %rd2371, %r7260, %r2274, %rd2759;
+ shr.u64 %rd2759, %rd2371, 32;
+ st.local.u32 [%rd2758], %rd2371;
+ add.s64 %rd2758, %rd2758, 4;
+ add.s64 %rd2757, %rd2757, 4;
+ add.s32 %r8431, %r8431, 1;
+ setp.ne.s32 %p1461, %r8431, 6;
+ @%p1461 bra $L__BB0_1723;
+
+ st.local.u32 [%rd4], %rd2759;
+ mov.u32 %r7261, 4;
+ sub.s32 %r2278, %r7261, %r2275;
+ mov.u32 %r7262, 6;
+ sub.s32 %r7263, %r7262, %r2275;
+ mul.wide.s32 %rd2372, %r7263, 4;
+ add.s64 %rd2373, %rd1, %rd2372;
+ ld.local.u32 %r8432, [%rd2373];
+ ld.local.u32 %r8433, [%rd2373+-4];
+ and.b32 %r2281, %r2273, 31;
+ setp.eq.s32 %p1462, %r2281, 0;
+ @%p1462 bra $L__BB0_1726;
+
+ mov.u32 %r7264, 32;
+ sub.s32 %r7265, %r7264, %r2281;
+ shr.u32 %r7266, %r8433, %r7265;
+ shl.b32 %r7267, %r8432, %r2281;
+ add.s32 %r8432, %r7266, %r7267;
+ mul.wide.s32 %rd2374, %r2278, 4;
+ add.s64 %rd2375, %rd1, %rd2374;
+ ld.local.u32 %r7268, [%rd2375];
+ shr.u32 %r7269, %r7268, %r7265;
+ shl.b32 %r7270, %r8433, %r2281;
+ add.s32 %r8433, %r7269, %r7270;
+
+$L__BB0_1726:
+ and.b32 %r7271, %r2272, -2147483648;
+ shr.u32 %r7272, %r8433, 30;
+ shl.b32 %r7273, %r8432, 2;
+ or.b32 %r7274, %r7272, %r7273;
+ shr.u32 %r7275, %r7274, 31;
+ shr.u32 %r7276, %r8432, 30;
+ add.s32 %r7277, %r7275, %r7276;
+ neg.s32 %r7278, %r7277;
+ setp.eq.s32 %p1463, %r7271, 0;
+ selp.b32 %r8434, %r7277, %r7278, %p1463;
+ setp.ne.s32 %p1464, %r7275, 0;
+ xor.b32 %r7279, %r7271, -2147483648;
+ selp.b32 %r7280, %r7279, %r7271, %p1464;
+ selp.b32 %r7281, -1, 0, %p1464;
+ xor.b32 %r7282, %r7274, %r7281;
+ shl.b32 %r7283, %r8433, 2;
+ xor.b32 %r7284, %r7283, %r7281;
+ cvt.u64.u32 %rd2376, %r7282;
+ cvt.u64.u32 %rd2377, %r7284;
+ bfi.b64 %rd2378, %rd2376, %rd2377, 32, 32;
+ cvt.rn.f64.s64 %fd233, %rd2378;
+ mul.f64 %fd234, %fd233, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4822, %fd234;
+ setp.eq.s32 %p1465, %r7280, 0;
+ neg.f32 %f4823, %f4822;
+ selp.f32 %f5954, %f4822, %f4823, %p1465;
+
+$L__BB0_1728:
+ and.b32 %r2288, %r8434, 1;
+ setp.eq.s32 %p1466, %r2288, 0;
+ selp.f32 %f2028, %f5954, 0f3F800000, %p1466;
+ mul.rn.f32 %f2029, %f5954, %f5954;
+ mov.f32 %f5955, 0fB94D4153;
+ @%p1466 bra $L__BB0_1730;
+
+ mov.f32 %f4826, 0fBAB607ED;
+ mov.f32 %f4827, 0f37CBAC00;
+ fma.rn.f32 %f5955, %f4827, %f2029, %f4826;
+
+$L__BB0_1730:
+ selp.f32 %f4828, 0f3C0885E4, 0f3D2AAABB, %p1466;
+ fma.rn.f32 %f4829, %f5955, %f2029, %f4828;
+ selp.f32 %f4830, 0fBE2AAAA8, 0fBEFFFFFF, %p1466;
+ fma.rn.f32 %f4831, %f4829, %f2029, %f4830;
+ mov.f32 %f4832, 0f00000000;
+ fma.rn.f32 %f4833, %f2029, %f2028, %f4832;
+ fma.rn.f32 %f5956, %f4831, %f4833, %f2028;
+ and.b32 %r7286, %r8434, 2;
+ setp.eq.s32 %p1468, %r7286, 0;
+ @%p1468 bra $L__BB0_1732;
+
+ mov.f32 %f4835, 0fBF800000;
+ fma.rn.f32 %f5956, %f5956, %f4835, %f4832;
+
+$L__BB0_1732:
+ mul.f32 %f4836, %f1959, 0f3F22F983;
+ cvt.rni.s32.f32 %r8438, %f4836;
+ cvt.rn.f32.s32 %f4837, %r8438;
+ mov.f32 %f4838, 0fBFC90FDA;
+ fma.rn.f32 %f4839, %f4837, %f4838, %f1959;
+ mov.f32 %f4840, 0fB3A22168;
+ fma.rn.f32 %f4841, %f4837, %f4840, %f4839;
+ mov.f32 %f4842, 0fA7C234C5;
+ fma.rn.f32 %f5957, %f4837, %f4842, %f4841;
+ abs.f32 %f2036, %f1959;
+ setp.ltu.f32 %p1469, %f2036, 0f47CE4780;
+ @%p1469 bra $L__BB0_1740;
+
+ setp.eq.f32 %p1470, %f2036, 0f7F800000;
+ @%p1470 bra $L__BB0_1739;
+ bra.uni $L__BB0_1734;
+
+$L__BB0_1739:
+ mov.f32 %f4845, 0f00000000;
+ mul.rn.f32 %f5957, %f1959, %f4845;
+ mov.u32 %r8438, 0;
+ bra.uni $L__BB0_1740;
+
+$L__BB0_1734:
+ mov.b32 %r2290, %f1959;
+ shr.u32 %r7288, %r2290, 23;
+ and.b32 %r7289, %r7288, 255;
+ add.s32 %r2291, %r7289, -128;
+ shl.b32 %r7290, %r2290, 8;
+ or.b32 %r2292, %r7290, -2147483648;
+ shr.u32 %r2293, %r2291, 5;
+ mov.u64 %rd2762, 0;
+ mov.u32 %r8435, 0;
+ mov.u64 %rd2760, __cudart_i2opi_f;
+ mov.u64 %rd2761, %rd1;
+
+$L__BB0_1735:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7291, [%rd2760];
+ mad.wide.u32 %rd2381, %r7291, %r2292, %rd2762;
+ shr.u64 %rd2762, %rd2381, 32;
+ st.local.u32 [%rd2761], %rd2381;
+ add.s64 %rd2761, %rd2761, 4;
+ add.s64 %rd2760, %rd2760, 4;
+ add.s32 %r8435, %r8435, 1;
+ setp.ne.s32 %p1471, %r8435, 6;
+ @%p1471 bra $L__BB0_1735;
+
+ st.local.u32 [%rd4], %rd2762;
+ mov.u32 %r7292, 4;
+ sub.s32 %r2296, %r7292, %r2293;
+ mov.u32 %r7293, 6;
+ sub.s32 %r7294, %r7293, %r2293;
+ mul.wide.s32 %rd2382, %r7294, 4;
+ add.s64 %rd2383, %rd1, %rd2382;
+ ld.local.u32 %r8436, [%rd2383];
+ ld.local.u32 %r8437, [%rd2383+-4];
+ and.b32 %r2299, %r2291, 31;
+ setp.eq.s32 %p1472, %r2299, 0;
+ @%p1472 bra $L__BB0_1738;
+
+ mov.u32 %r7295, 32;
+ sub.s32 %r7296, %r7295, %r2299;
+ shr.u32 %r7297, %r8437, %r7296;
+ shl.b32 %r7298, %r8436, %r2299;
+ add.s32 %r8436, %r7297, %r7298;
+ mul.wide.s32 %rd2384, %r2296, 4;
+ add.s64 %rd2385, %rd1, %rd2384;
+ ld.local.u32 %r7299, [%rd2385];
+ shr.u32 %r7300, %r7299, %r7296;
+ shl.b32 %r7301, %r8437, %r2299;
+ add.s32 %r8437, %r7300, %r7301;
+
+$L__BB0_1738:
+ and.b32 %r7302, %r2290, -2147483648;
+ shr.u32 %r7303, %r8437, 30;
+ shl.b32 %r7304, %r8436, 2;
+ or.b32 %r7305, %r7303, %r7304;
+ shr.u32 %r7306, %r7305, 31;
+ shr.u32 %r7307, %r8436, 30;
+ add.s32 %r7308, %r7306, %r7307;
+ neg.s32 %r7309, %r7308;
+ setp.eq.s32 %p1473, %r7302, 0;
+ selp.b32 %r8438, %r7308, %r7309, %p1473;
+ setp.ne.s32 %p1474, %r7306, 0;
+ xor.b32 %r7310, %r7302, -2147483648;
+ selp.b32 %r7311, %r7310, %r7302, %p1474;
+ selp.b32 %r7312, -1, 0, %p1474;
+ xor.b32 %r7313, %r7305, %r7312;
+ shl.b32 %r7314, %r8437, 2;
+ xor.b32 %r7315, %r7314, %r7312;
+ cvt.u64.u32 %rd2386, %r7313;
+ cvt.u64.u32 %rd2387, %r7315;
+ bfi.b64 %rd2388, %rd2386, %rd2387, 32, 32;
+ cvt.rn.f64.s64 %fd235, %rd2388;
+ mul.f64 %fd236, %fd235, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4843, %fd236;
+ setp.eq.s32 %p1475, %r7311, 0;
+ neg.f32 %f4844, %f4843;
+ selp.f32 %f5957, %f4843, %f4844, %p1475;
+
+$L__BB0_1740:
+ add.s32 %r2306, %r8438, 1;
+ and.b32 %r2307, %r2306, 1;
+ setp.eq.s32 %p1476, %r2307, 0;
+ selp.f32 %f2040, %f5957, 0f3F800000, %p1476;
+ mul.rn.f32 %f2041, %f5957, %f5957;
+ mov.f32 %f5958, 0fB94D4153;
+ @%p1476 bra $L__BB0_1742;
+
+ mov.f32 %f4847, 0fBAB607ED;
+ mov.f32 %f4848, 0f37CBAC00;
+ fma.rn.f32 %f5958, %f4848, %f2041, %f4847;
+
+$L__BB0_1742:
+ selp.f32 %f4849, 0f3C0885E4, 0f3D2AAABB, %p1476;
+ fma.rn.f32 %f4850, %f5958, %f2041, %f4849;
+ selp.f32 %f4851, 0fBE2AAAA8, 0fBEFFFFFF, %p1476;
+ fma.rn.f32 %f4852, %f4850, %f2041, %f4851;
+ mov.f32 %f4853, 0f00000000;
+ fma.rn.f32 %f4854, %f2041, %f2040, %f4853;
+ fma.rn.f32 %f5959, %f4852, %f4854, %f2040;
+ and.b32 %r7317, %r2306, 2;
+ setp.eq.s32 %p1478, %r7317, 0;
+ @%p1478 bra $L__BB0_1744;
+
+ mov.f32 %f4856, 0fBF800000;
+ fma.rn.f32 %f5959, %f5959, %f4856, %f4853;
+
+$L__BB0_1744:
+ add.f32 %f5995, %f5956, %f5959;
+ mul.f32 %f4857, %f1968, 0f3F22F983;
+ cvt.rni.s32.f32 %r8442, %f4857;
+ cvt.rn.f32.s32 %f4858, %r8442;
+ mov.f32 %f4859, 0fBFC90FDA;
+ fma.rn.f32 %f4860, %f4858, %f4859, %f1968;
+ mov.f32 %f4861, 0fB3A22168;
+ fma.rn.f32 %f4862, %f4858, %f4861, %f4860;
+ mov.f32 %f4863, 0fA7C234C5;
+ fma.rn.f32 %f5960, %f4858, %f4863, %f4862;
+ abs.f32 %f2049, %f1968;
+ setp.ltu.f32 %p1479, %f2049, 0f47CE4780;
+ @%p1479 bra $L__BB0_1752;
+
+ setp.eq.f32 %p1480, %f2049, 0f7F800000;
+ @%p1480 bra $L__BB0_1751;
+ bra.uni $L__BB0_1746;
+
+$L__BB0_1751:
+ mov.f32 %f4866, 0f00000000;
+ mul.rn.f32 %f5960, %f1968, %f4866;
+ mov.u32 %r8442, 0;
+ bra.uni $L__BB0_1752;
+
+$L__BB0_1746:
+ mov.b32 %r2309, %f1968;
+ shr.u32 %r7319, %r2309, 23;
+ and.b32 %r7320, %r7319, 255;
+ add.s32 %r2310, %r7320, -128;
+ shl.b32 %r7321, %r2309, 8;
+ or.b32 %r2311, %r7321, -2147483648;
+ shr.u32 %r2312, %r2310, 5;
+ mov.u64 %rd2765, 0;
+ mov.u32 %r8439, 0;
+ mov.u64 %rd2763, __cudart_i2opi_f;
+ mov.u64 %rd2764, %rd1;
+
+$L__BB0_1747:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7322, [%rd2763];
+ mad.wide.u32 %rd2391, %r7322, %r2311, %rd2765;
+ shr.u64 %rd2765, %rd2391, 32;
+ st.local.u32 [%rd2764], %rd2391;
+ add.s64 %rd2764, %rd2764, 4;
+ add.s64 %rd2763, %rd2763, 4;
+ add.s32 %r8439, %r8439, 1;
+ setp.ne.s32 %p1481, %r8439, 6;
+ @%p1481 bra $L__BB0_1747;
+
+ st.local.u32 [%rd4], %rd2765;
+ mov.u32 %r7323, 4;
+ sub.s32 %r2315, %r7323, %r2312;
+ mov.u32 %r7324, 6;
+ sub.s32 %r7325, %r7324, %r2312;
+ mul.wide.s32 %rd2392, %r7325, 4;
+ add.s64 %rd2393, %rd1, %rd2392;
+ ld.local.u32 %r8440, [%rd2393];
+ ld.local.u32 %r8441, [%rd2393+-4];
+ and.b32 %r2318, %r2310, 31;
+ setp.eq.s32 %p1482, %r2318, 0;
+ @%p1482 bra $L__BB0_1750;
+
+ mov.u32 %r7326, 32;
+ sub.s32 %r7327, %r7326, %r2318;
+ shr.u32 %r7328, %r8441, %r7327;
+ shl.b32 %r7329, %r8440, %r2318;
+ add.s32 %r8440, %r7328, %r7329;
+ mul.wide.s32 %rd2394, %r2315, 4;
+ add.s64 %rd2395, %rd1, %rd2394;
+ ld.local.u32 %r7330, [%rd2395];
+ shr.u32 %r7331, %r7330, %r7327;
+ shl.b32 %r7332, %r8441, %r2318;
+ add.s32 %r8441, %r7331, %r7332;
+
+$L__BB0_1750:
+ and.b32 %r7333, %r2309, -2147483648;
+ shr.u32 %r7334, %r8441, 30;
+ shl.b32 %r7335, %r8440, 2;
+ or.b32 %r7336, %r7334, %r7335;
+ shr.u32 %r7337, %r7336, 31;
+ shr.u32 %r7338, %r8440, 30;
+ add.s32 %r7339, %r7337, %r7338;
+ neg.s32 %r7340, %r7339;
+ setp.eq.s32 %p1483, %r7333, 0;
+ selp.b32 %r8442, %r7339, %r7340, %p1483;
+ setp.ne.s32 %p1484, %r7337, 0;
+ xor.b32 %r7341, %r7333, -2147483648;
+ selp.b32 %r7342, %r7341, %r7333, %p1484;
+ selp.b32 %r7343, -1, 0, %p1484;
+ xor.b32 %r7344, %r7336, %r7343;
+ shl.b32 %r7345, %r8441, 2;
+ xor.b32 %r7346, %r7345, %r7343;
+ cvt.u64.u32 %rd2396, %r7344;
+ cvt.u64.u32 %rd2397, %r7346;
+ bfi.b64 %rd2398, %rd2396, %rd2397, 32, 32;
+ cvt.rn.f64.s64 %fd237, %rd2398;
+ mul.f64 %fd238, %fd237, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4864, %fd238;
+ setp.eq.s32 %p1485, %r7342, 0;
+ neg.f32 %f4865, %f4864;
+ selp.f32 %f5960, %f4864, %f4865, %p1485;
+
+$L__BB0_1752:
+ and.b32 %r2325, %r8442, 1;
+ setp.eq.s32 %p1486, %r2325, 0;
+ selp.f32 %f2053, %f5960, 0f3F800000, %p1486;
+ mul.rn.f32 %f2054, %f5960, %f5960;
+ mov.f32 %f5961, 0fB94D4153;
+ @%p1486 bra $L__BB0_1754;
+
+ mov.f32 %f4868, 0fBAB607ED;
+ mov.f32 %f4869, 0f37CBAC00;
+ fma.rn.f32 %f5961, %f4869, %f2054, %f4868;
+
+$L__BB0_1754:
+ selp.f32 %f4870, 0f3C0885E4, 0f3D2AAABB, %p1486;
+ fma.rn.f32 %f4871, %f5961, %f2054, %f4870;
+ selp.f32 %f4872, 0fBE2AAAA8, 0fBEFFFFFF, %p1486;
+ fma.rn.f32 %f4873, %f4871, %f2054, %f4872;
+ mov.f32 %f4874, 0f00000000;
+ fma.rn.f32 %f4875, %f2054, %f2053, %f4874;
+ fma.rn.f32 %f5962, %f4873, %f4875, %f2053;
+ and.b32 %r7348, %r8442, 2;
+ setp.eq.s32 %p1488, %r7348, 0;
+ @%p1488 bra $L__BB0_1756;
+
+ mov.f32 %f4877, 0fBF800000;
+ fma.rn.f32 %f5962, %f5962, %f4877, %f4874;
+
+$L__BB0_1756:
+ mul.f32 %f4878, %f1960, 0f3F22F983;
+ cvt.rni.s32.f32 %r8446, %f4878;
+ cvt.rn.f32.s32 %f4879, %r8446;
+ mov.f32 %f4880, 0fBFC90FDA;
+ fma.rn.f32 %f4881, %f4879, %f4880, %f1960;
+ mov.f32 %f4882, 0fB3A22168;
+ fma.rn.f32 %f4883, %f4879, %f4882, %f4881;
+ mov.f32 %f4884, 0fA7C234C5;
+ fma.rn.f32 %f5963, %f4879, %f4884, %f4883;
+ abs.f32 %f2061, %f1960;
+ setp.ltu.f32 %p1489, %f2061, 0f47CE4780;
+ @%p1489 bra $L__BB0_1764;
+
+ setp.eq.f32 %p1490, %f2061, 0f7F800000;
+ @%p1490 bra $L__BB0_1763;
+ bra.uni $L__BB0_1758;
+
+$L__BB0_1763:
+ mov.f32 %f4887, 0f00000000;
+ mul.rn.f32 %f5963, %f1960, %f4887;
+ mov.u32 %r8446, 0;
+ bra.uni $L__BB0_1764;
+
+$L__BB0_1758:
+ mov.b32 %r2327, %f1960;
+ shr.u32 %r7350, %r2327, 23;
+ and.b32 %r7351, %r7350, 255;
+ add.s32 %r2328, %r7351, -128;
+ shl.b32 %r7352, %r2327, 8;
+ or.b32 %r2329, %r7352, -2147483648;
+ shr.u32 %r2330, %r2328, 5;
+ mov.u64 %rd2768, 0;
+ mov.u32 %r8443, 0;
+ mov.u64 %rd2766, __cudart_i2opi_f;
+ mov.u64 %rd2767, %rd1;
+
+$L__BB0_1759:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7353, [%rd2766];
+ mad.wide.u32 %rd2401, %r7353, %r2329, %rd2768;
+ shr.u64 %rd2768, %rd2401, 32;
+ st.local.u32 [%rd2767], %rd2401;
+ add.s64 %rd2767, %rd2767, 4;
+ add.s64 %rd2766, %rd2766, 4;
+ add.s32 %r8443, %r8443, 1;
+ setp.ne.s32 %p1491, %r8443, 6;
+ @%p1491 bra $L__BB0_1759;
+
+ st.local.u32 [%rd4], %rd2768;
+ mov.u32 %r7354, 4;
+ sub.s32 %r2333, %r7354, %r2330;
+ mov.u32 %r7355, 6;
+ sub.s32 %r7356, %r7355, %r2330;
+ mul.wide.s32 %rd2402, %r7356, 4;
+ add.s64 %rd2403, %rd1, %rd2402;
+ ld.local.u32 %r8444, [%rd2403];
+ ld.local.u32 %r8445, [%rd2403+-4];
+ and.b32 %r2336, %r2328, 31;
+ setp.eq.s32 %p1492, %r2336, 0;
+ @%p1492 bra $L__BB0_1762;
+
+ mov.u32 %r7357, 32;
+ sub.s32 %r7358, %r7357, %r2336;
+ shr.u32 %r7359, %r8445, %r7358;
+ shl.b32 %r7360, %r8444, %r2336;
+ add.s32 %r8444, %r7359, %r7360;
+ mul.wide.s32 %rd2404, %r2333, 4;
+ add.s64 %rd2405, %rd1, %rd2404;
+ ld.local.u32 %r7361, [%rd2405];
+ shr.u32 %r7362, %r7361, %r7358;
+ shl.b32 %r7363, %r8445, %r2336;
+ add.s32 %r8445, %r7362, %r7363;
+
+$L__BB0_1762:
+ and.b32 %r7364, %r2327, -2147483648;
+ shr.u32 %r7365, %r8445, 30;
+ shl.b32 %r7366, %r8444, 2;
+ or.b32 %r7367, %r7365, %r7366;
+ shr.u32 %r7368, %r7367, 31;
+ shr.u32 %r7369, %r8444, 30;
+ add.s32 %r7370, %r7368, %r7369;
+ neg.s32 %r7371, %r7370;
+ setp.eq.s32 %p1493, %r7364, 0;
+ selp.b32 %r8446, %r7370, %r7371, %p1493;
+ setp.ne.s32 %p1494, %r7368, 0;
+ xor.b32 %r7372, %r7364, -2147483648;
+ selp.b32 %r7373, %r7372, %r7364, %p1494;
+ selp.b32 %r7374, -1, 0, %p1494;
+ xor.b32 %r7375, %r7367, %r7374;
+ shl.b32 %r7376, %r8445, 2;
+ xor.b32 %r7377, %r7376, %r7374;
+ cvt.u64.u32 %rd2406, %r7375;
+ cvt.u64.u32 %rd2407, %r7377;
+ bfi.b64 %rd2408, %rd2406, %rd2407, 32, 32;
+ cvt.rn.f64.s64 %fd239, %rd2408;
+ mul.f64 %fd240, %fd239, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4885, %fd240;
+ setp.eq.s32 %p1495, %r7373, 0;
+ neg.f32 %f4886, %f4885;
+ selp.f32 %f5963, %f4885, %f4886, %p1495;
+
+$L__BB0_1764:
+ add.s32 %r2343, %r8446, 1;
+ and.b32 %r2344, %r2343, 1;
+ setp.eq.s32 %p1496, %r2344, 0;
+ selp.f32 %f2065, %f5963, 0f3F800000, %p1496;
+ mul.rn.f32 %f2066, %f5963, %f5963;
+ mov.f32 %f5964, 0fB94D4153;
+ @%p1496 bra $L__BB0_1766;
+
+ mov.f32 %f4889, 0fBAB607ED;
+ mov.f32 %f4890, 0f37CBAC00;
+ fma.rn.f32 %f5964, %f4890, %f2066, %f4889;
+
+$L__BB0_1766:
+ selp.f32 %f4891, 0f3C0885E4, 0f3D2AAABB, %p1496;
+ fma.rn.f32 %f4892, %f5964, %f2066, %f4891;
+ selp.f32 %f4893, 0fBE2AAAA8, 0fBEFFFFFF, %p1496;
+ fma.rn.f32 %f4894, %f4892, %f2066, %f4893;
+ mov.f32 %f4895, 0f00000000;
+ fma.rn.f32 %f4896, %f2066, %f2065, %f4895;
+ fma.rn.f32 %f5965, %f4894, %f4896, %f2065;
+ and.b32 %r7379, %r2343, 2;
+ setp.eq.s32 %p1498, %r7379, 0;
+ @%p1498 bra $L__BB0_1768;
+
+ mov.f32 %f4898, 0fBF800000;
+ fma.rn.f32 %f5965, %f5965, %f4898, %f4895;
+
+$L__BB0_1768:
+ add.f32 %f5994, %f5962, %f5965;
+ mul.f32 %f4899, %f1969, 0f3F22F983;
+ cvt.rni.s32.f32 %r8450, %f4899;
+ cvt.rn.f32.s32 %f4900, %r8450;
+ mov.f32 %f4901, 0fBFC90FDA;
+ fma.rn.f32 %f4902, %f4900, %f4901, %f1969;
+ mov.f32 %f4903, 0fB3A22168;
+ fma.rn.f32 %f4904, %f4900, %f4903, %f4902;
+ mov.f32 %f4905, 0fA7C234C5;
+ fma.rn.f32 %f5966, %f4900, %f4905, %f4904;
+ abs.f32 %f2074, %f1969;
+ setp.ltu.f32 %p1499, %f2074, 0f47CE4780;
+ @%p1499 bra $L__BB0_1776;
+
+ setp.eq.f32 %p1500, %f2074, 0f7F800000;
+ @%p1500 bra $L__BB0_1775;
+ bra.uni $L__BB0_1770;
+
+$L__BB0_1775:
+ mov.f32 %f4908, 0f00000000;
+ mul.rn.f32 %f5966, %f1969, %f4908;
+ mov.u32 %r8450, 0;
+ bra.uni $L__BB0_1776;
+
+$L__BB0_1770:
+ mov.b32 %r2346, %f1969;
+ shr.u32 %r7381, %r2346, 23;
+ and.b32 %r7382, %r7381, 255;
+ add.s32 %r2347, %r7382, -128;
+ shl.b32 %r7383, %r2346, 8;
+ or.b32 %r2348, %r7383, -2147483648;
+ shr.u32 %r2349, %r2347, 5;
+ mov.u64 %rd2771, 0;
+ mov.u32 %r8447, 0;
+ mov.u64 %rd2769, __cudart_i2opi_f;
+ mov.u64 %rd2770, %rd1;
+
+$L__BB0_1771:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7384, [%rd2769];
+ mad.wide.u32 %rd2411, %r7384, %r2348, %rd2771;
+ shr.u64 %rd2771, %rd2411, 32;
+ st.local.u32 [%rd2770], %rd2411;
+ add.s64 %rd2770, %rd2770, 4;
+ add.s64 %rd2769, %rd2769, 4;
+ add.s32 %r8447, %r8447, 1;
+ setp.ne.s32 %p1501, %r8447, 6;
+ @%p1501 bra $L__BB0_1771;
+
+ st.local.u32 [%rd4], %rd2771;
+ mov.u32 %r7385, 4;
+ sub.s32 %r2352, %r7385, %r2349;
+ mov.u32 %r7386, 6;
+ sub.s32 %r7387, %r7386, %r2349;
+ mul.wide.s32 %rd2412, %r7387, 4;
+ add.s64 %rd2413, %rd1, %rd2412;
+ ld.local.u32 %r8448, [%rd2413];
+ ld.local.u32 %r8449, [%rd2413+-4];
+ and.b32 %r2355, %r2347, 31;
+ setp.eq.s32 %p1502, %r2355, 0;
+ @%p1502 bra $L__BB0_1774;
+
+ mov.u32 %r7388, 32;
+ sub.s32 %r7389, %r7388, %r2355;
+ shr.u32 %r7390, %r8449, %r7389;
+ shl.b32 %r7391, %r8448, %r2355;
+ add.s32 %r8448, %r7390, %r7391;
+ mul.wide.s32 %rd2414, %r2352, 4;
+ add.s64 %rd2415, %rd1, %rd2414;
+ ld.local.u32 %r7392, [%rd2415];
+ shr.u32 %r7393, %r7392, %r7389;
+ shl.b32 %r7394, %r8449, %r2355;
+ add.s32 %r8449, %r7393, %r7394;
+
+$L__BB0_1774:
+ and.b32 %r7395, %r2346, -2147483648;
+ shr.u32 %r7396, %r8449, 30;
+ shl.b32 %r7397, %r8448, 2;
+ or.b32 %r7398, %r7396, %r7397;
+ shr.u32 %r7399, %r7398, 31;
+ shr.u32 %r7400, %r8448, 30;
+ add.s32 %r7401, %r7399, %r7400;
+ neg.s32 %r7402, %r7401;
+ setp.eq.s32 %p1503, %r7395, 0;
+ selp.b32 %r8450, %r7401, %r7402, %p1503;
+ setp.ne.s32 %p1504, %r7399, 0;
+ xor.b32 %r7403, %r7395, -2147483648;
+ selp.b32 %r7404, %r7403, %r7395, %p1504;
+ selp.b32 %r7405, -1, 0, %p1504;
+ xor.b32 %r7406, %r7398, %r7405;
+ shl.b32 %r7407, %r8449, 2;
+ xor.b32 %r7408, %r7407, %r7405;
+ cvt.u64.u32 %rd2416, %r7406;
+ cvt.u64.u32 %rd2417, %r7408;
+ bfi.b64 %rd2418, %rd2416, %rd2417, 32, 32;
+ cvt.rn.f64.s64 %fd241, %rd2418;
+ mul.f64 %fd242, %fd241, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4906, %fd242;
+ setp.eq.s32 %p1505, %r7404, 0;
+ neg.f32 %f4907, %f4906;
+ selp.f32 %f5966, %f4906, %f4907, %p1505;
+
+$L__BB0_1776:
+ and.b32 %r2362, %r8450, 1;
+ setp.eq.s32 %p1506, %r2362, 0;
+ selp.f32 %f2078, %f5966, 0f3F800000, %p1506;
+ mul.rn.f32 %f2079, %f5966, %f5966;
+ mov.f32 %f5967, 0fB94D4153;
+ @%p1506 bra $L__BB0_1778;
+
+ mov.f32 %f4910, 0fBAB607ED;
+ mov.f32 %f4911, 0f37CBAC00;
+ fma.rn.f32 %f5967, %f4911, %f2079, %f4910;
+
+$L__BB0_1778:
+ selp.f32 %f4912, 0f3C0885E4, 0f3D2AAABB, %p1506;
+ fma.rn.f32 %f4913, %f5967, %f2079, %f4912;
+ selp.f32 %f4914, 0fBE2AAAA8, 0fBEFFFFFF, %p1506;
+ fma.rn.f32 %f4915, %f4913, %f2079, %f4914;
+ mov.f32 %f4916, 0f00000000;
+ fma.rn.f32 %f4917, %f2079, %f2078, %f4916;
+ fma.rn.f32 %f5968, %f4915, %f4917, %f2078;
+ and.b32 %r7410, %r8450, 2;
+ setp.eq.s32 %p1508, %r7410, 0;
+ @%p1508 bra $L__BB0_1780;
+
+ mov.f32 %f4919, 0fBF800000;
+ fma.rn.f32 %f5968, %f5968, %f4919, %f4916;
+
+$L__BB0_1780:
+ mul.f32 %f4920, %f1961, 0f3F22F983;
+ cvt.rni.s32.f32 %r8454, %f4920;
+ cvt.rn.f32.s32 %f4921, %r8454;
+ mov.f32 %f4922, 0fBFC90FDA;
+ fma.rn.f32 %f4923, %f4921, %f4922, %f1961;
+ mov.f32 %f4924, 0fB3A22168;
+ fma.rn.f32 %f4925, %f4921, %f4924, %f4923;
+ mov.f32 %f4926, 0fA7C234C5;
+ fma.rn.f32 %f5969, %f4921, %f4926, %f4925;
+ abs.f32 %f2086, %f1961;
+ setp.ltu.f32 %p1509, %f2086, 0f47CE4780;
+ @%p1509 bra $L__BB0_1788;
+
+ setp.eq.f32 %p1510, %f2086, 0f7F800000;
+ @%p1510 bra $L__BB0_1787;
+ bra.uni $L__BB0_1782;
+
+$L__BB0_1787:
+ mov.f32 %f4929, 0f00000000;
+ mul.rn.f32 %f5969, %f1961, %f4929;
+ mov.u32 %r8454, 0;
+ bra.uni $L__BB0_1788;
+
+$L__BB0_1782:
+ mov.b32 %r2364, %f1961;
+ shr.u32 %r7412, %r2364, 23;
+ and.b32 %r7413, %r7412, 255;
+ add.s32 %r2365, %r7413, -128;
+ shl.b32 %r7414, %r2364, 8;
+ or.b32 %r2366, %r7414, -2147483648;
+ shr.u32 %r2367, %r2365, 5;
+ mov.u64 %rd2774, 0;
+ mov.u32 %r8451, 0;
+ mov.u64 %rd2772, __cudart_i2opi_f;
+ mov.u64 %rd2773, %rd1;
+
+$L__BB0_1783:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7415, [%rd2772];
+ mad.wide.u32 %rd2421, %r7415, %r2366, %rd2774;
+ shr.u64 %rd2774, %rd2421, 32;
+ st.local.u32 [%rd2773], %rd2421;
+ add.s64 %rd2773, %rd2773, 4;
+ add.s64 %rd2772, %rd2772, 4;
+ add.s32 %r8451, %r8451, 1;
+ setp.ne.s32 %p1511, %r8451, 6;
+ @%p1511 bra $L__BB0_1783;
+
+ st.local.u32 [%rd4], %rd2774;
+ mov.u32 %r7416, 4;
+ sub.s32 %r2370, %r7416, %r2367;
+ mov.u32 %r7417, 6;
+ sub.s32 %r7418, %r7417, %r2367;
+ mul.wide.s32 %rd2422, %r7418, 4;
+ add.s64 %rd2423, %rd1, %rd2422;
+ ld.local.u32 %r8452, [%rd2423];
+ ld.local.u32 %r8453, [%rd2423+-4];
+ and.b32 %r2373, %r2365, 31;
+ setp.eq.s32 %p1512, %r2373, 0;
+ @%p1512 bra $L__BB0_1786;
+
+ mov.u32 %r7419, 32;
+ sub.s32 %r7420, %r7419, %r2373;
+ shr.u32 %r7421, %r8453, %r7420;
+ shl.b32 %r7422, %r8452, %r2373;
+ add.s32 %r8452, %r7421, %r7422;
+ mul.wide.s32 %rd2424, %r2370, 4;
+ add.s64 %rd2425, %rd1, %rd2424;
+ ld.local.u32 %r7423, [%rd2425];
+ shr.u32 %r7424, %r7423, %r7420;
+ shl.b32 %r7425, %r8453, %r2373;
+ add.s32 %r8453, %r7424, %r7425;
+
+$L__BB0_1786:
+ and.b32 %r7426, %r2364, -2147483648;
+ shr.u32 %r7427, %r8453, 30;
+ shl.b32 %r7428, %r8452, 2;
+ or.b32 %r7429, %r7427, %r7428;
+ shr.u32 %r7430, %r7429, 31;
+ shr.u32 %r7431, %r8452, 30;
+ add.s32 %r7432, %r7430, %r7431;
+ neg.s32 %r7433, %r7432;
+ setp.eq.s32 %p1513, %r7426, 0;
+ selp.b32 %r8454, %r7432, %r7433, %p1513;
+ setp.ne.s32 %p1514, %r7430, 0;
+ xor.b32 %r7434, %r7426, -2147483648;
+ selp.b32 %r7435, %r7434, %r7426, %p1514;
+ selp.b32 %r7436, -1, 0, %p1514;
+ xor.b32 %r7437, %r7429, %r7436;
+ shl.b32 %r7438, %r8453, 2;
+ xor.b32 %r7439, %r7438, %r7436;
+ cvt.u64.u32 %rd2426, %r7437;
+ cvt.u64.u32 %rd2427, %r7439;
+ bfi.b64 %rd2428, %rd2426, %rd2427, 32, 32;
+ cvt.rn.f64.s64 %fd243, %rd2428;
+ mul.f64 %fd244, %fd243, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4927, %fd244;
+ setp.eq.s32 %p1515, %r7435, 0;
+ neg.f32 %f4928, %f4927;
+ selp.f32 %f5969, %f4927, %f4928, %p1515;
+
+$L__BB0_1788:
+ add.s32 %r2380, %r8454, 1;
+ and.b32 %r2381, %r2380, 1;
+ setp.eq.s32 %p1516, %r2381, 0;
+ selp.f32 %f2090, %f5969, 0f3F800000, %p1516;
+ mul.rn.f32 %f2091, %f5969, %f5969;
+ mov.f32 %f5970, 0fB94D4153;
+ @%p1516 bra $L__BB0_1790;
+
+ mov.f32 %f4931, 0fBAB607ED;
+ mov.f32 %f4932, 0f37CBAC00;
+ fma.rn.f32 %f5970, %f4932, %f2091, %f4931;
+
+$L__BB0_1790:
+ selp.f32 %f4933, 0f3C0885E4, 0f3D2AAABB, %p1516;
+ fma.rn.f32 %f4934, %f5970, %f2091, %f4933;
+ selp.f32 %f4935, 0fBE2AAAA8, 0fBEFFFFFF, %p1516;
+ fma.rn.f32 %f4936, %f4934, %f2091, %f4935;
+ mov.f32 %f4937, 0f00000000;
+ fma.rn.f32 %f4938, %f2091, %f2090, %f4937;
+ fma.rn.f32 %f5971, %f4936, %f4938, %f2090;
+ and.b32 %r7441, %r2380, 2;
+ setp.eq.s32 %p1518, %r7441, 0;
+ @%p1518 bra $L__BB0_1792;
+
+ mov.f32 %f4940, 0fBF800000;
+ fma.rn.f32 %f5971, %f5971, %f4940, %f4937;
+
+$L__BB0_1792:
+ add.f32 %f5993, %f5968, %f5971;
+ mul.f32 %f4941, %f1970, 0f3F22F983;
+ cvt.rni.s32.f32 %r8458, %f4941;
+ cvt.rn.f32.s32 %f4942, %r8458;
+ mov.f32 %f4943, 0fBFC90FDA;
+ fma.rn.f32 %f4944, %f4942, %f4943, %f1970;
+ mov.f32 %f4945, 0fB3A22168;
+ fma.rn.f32 %f4946, %f4942, %f4945, %f4944;
+ mov.f32 %f4947, 0fA7C234C5;
+ fma.rn.f32 %f5972, %f4942, %f4947, %f4946;
+ abs.f32 %f2099, %f1970;
+ setp.ltu.f32 %p1519, %f2099, 0f47CE4780;
+ @%p1519 bra $L__BB0_1800;
+
+ setp.eq.f32 %p1520, %f2099, 0f7F800000;
+ @%p1520 bra $L__BB0_1799;
+ bra.uni $L__BB0_1794;
+
+$L__BB0_1799:
+ mov.f32 %f4950, 0f00000000;
+ mul.rn.f32 %f5972, %f1970, %f4950;
+ mov.u32 %r8458, 0;
+ bra.uni $L__BB0_1800;
+
+$L__BB0_1794:
+ mov.b32 %r2383, %f1970;
+ shr.u32 %r7443, %r2383, 23;
+ and.b32 %r7444, %r7443, 255;
+ add.s32 %r2384, %r7444, -128;
+ shl.b32 %r7445, %r2383, 8;
+ or.b32 %r2385, %r7445, -2147483648;
+ shr.u32 %r2386, %r2384, 5;
+ mov.u64 %rd2777, 0;
+ mov.u32 %r8455, 0;
+ mov.u64 %rd2775, __cudart_i2opi_f;
+ mov.u64 %rd2776, %rd1;
+
+$L__BB0_1795:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7446, [%rd2775];
+ mad.wide.u32 %rd2431, %r7446, %r2385, %rd2777;
+ shr.u64 %rd2777, %rd2431, 32;
+ st.local.u32 [%rd2776], %rd2431;
+ add.s64 %rd2776, %rd2776, 4;
+ add.s64 %rd2775, %rd2775, 4;
+ add.s32 %r8455, %r8455, 1;
+ setp.ne.s32 %p1521, %r8455, 6;
+ @%p1521 bra $L__BB0_1795;
+
+ st.local.u32 [%rd4], %rd2777;
+ mov.u32 %r7447, 4;
+ sub.s32 %r2389, %r7447, %r2386;
+ mov.u32 %r7448, 6;
+ sub.s32 %r7449, %r7448, %r2386;
+ mul.wide.s32 %rd2432, %r7449, 4;
+ add.s64 %rd2433, %rd1, %rd2432;
+ ld.local.u32 %r8456, [%rd2433];
+ ld.local.u32 %r8457, [%rd2433+-4];
+ and.b32 %r2392, %r2384, 31;
+ setp.eq.s32 %p1522, %r2392, 0;
+ @%p1522 bra $L__BB0_1798;
+
+ mov.u32 %r7450, 32;
+ sub.s32 %r7451, %r7450, %r2392;
+ shr.u32 %r7452, %r8457, %r7451;
+ shl.b32 %r7453, %r8456, %r2392;
+ add.s32 %r8456, %r7452, %r7453;
+ mul.wide.s32 %rd2434, %r2389, 4;
+ add.s64 %rd2435, %rd1, %rd2434;
+ ld.local.u32 %r7454, [%rd2435];
+ shr.u32 %r7455, %r7454, %r7451;
+ shl.b32 %r7456, %r8457, %r2392;
+ add.s32 %r8457, %r7455, %r7456;
+
+$L__BB0_1798:
+ and.b32 %r7457, %r2383, -2147483648;
+ shr.u32 %r7458, %r8457, 30;
+ shl.b32 %r7459, %r8456, 2;
+ or.b32 %r7460, %r7458, %r7459;
+ shr.u32 %r7461, %r7460, 31;
+ shr.u32 %r7462, %r8456, 30;
+ add.s32 %r7463, %r7461, %r7462;
+ neg.s32 %r7464, %r7463;
+ setp.eq.s32 %p1523, %r7457, 0;
+ selp.b32 %r8458, %r7463, %r7464, %p1523;
+ setp.ne.s32 %p1524, %r7461, 0;
+ xor.b32 %r7465, %r7457, -2147483648;
+ selp.b32 %r7466, %r7465, %r7457, %p1524;
+ selp.b32 %r7467, -1, 0, %p1524;
+ xor.b32 %r7468, %r7460, %r7467;
+ shl.b32 %r7469, %r8457, 2;
+ xor.b32 %r7470, %r7469, %r7467;
+ cvt.u64.u32 %rd2436, %r7468;
+ cvt.u64.u32 %rd2437, %r7470;
+ bfi.b64 %rd2438, %rd2436, %rd2437, 32, 32;
+ cvt.rn.f64.s64 %fd245, %rd2438;
+ mul.f64 %fd246, %fd245, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4948, %fd246;
+ setp.eq.s32 %p1525, %r7466, 0;
+ neg.f32 %f4949, %f4948;
+ selp.f32 %f5972, %f4948, %f4949, %p1525;
+
+$L__BB0_1800:
+ and.b32 %r2399, %r8458, 1;
+ setp.eq.s32 %p1526, %r2399, 0;
+ selp.f32 %f2103, %f5972, 0f3F800000, %p1526;
+ mul.rn.f32 %f2104, %f5972, %f5972;
+ mov.f32 %f5973, 0fB94D4153;
+ @%p1526 bra $L__BB0_1802;
+
+ mov.f32 %f4952, 0fBAB607ED;
+ mov.f32 %f4953, 0f37CBAC00;
+ fma.rn.f32 %f5973, %f4953, %f2104, %f4952;
+
+$L__BB0_1802:
+ selp.f32 %f4954, 0f3C0885E4, 0f3D2AAABB, %p1526;
+ fma.rn.f32 %f4955, %f5973, %f2104, %f4954;
+ selp.f32 %f4956, 0fBE2AAAA8, 0fBEFFFFFF, %p1526;
+ fma.rn.f32 %f4957, %f4955, %f2104, %f4956;
+ mov.f32 %f4958, 0f00000000;
+ fma.rn.f32 %f4959, %f2104, %f2103, %f4958;
+ fma.rn.f32 %f5974, %f4957, %f4959, %f2103;
+ and.b32 %r7472, %r8458, 2;
+ setp.eq.s32 %p1528, %r7472, 0;
+ @%p1528 bra $L__BB0_1804;
+
+ mov.f32 %f4961, 0fBF800000;
+ fma.rn.f32 %f5974, %f5974, %f4961, %f4958;
+
+$L__BB0_1804:
+ mul.f32 %f4962, %f1962, 0f3F22F983;
+ cvt.rni.s32.f32 %r8462, %f4962;
+ cvt.rn.f32.s32 %f4963, %r8462;
+ mov.f32 %f4964, 0fBFC90FDA;
+ fma.rn.f32 %f4965, %f4963, %f4964, %f1962;
+ mov.f32 %f4966, 0fB3A22168;
+ fma.rn.f32 %f4967, %f4963, %f4966, %f4965;
+ mov.f32 %f4968, 0fA7C234C5;
+ fma.rn.f32 %f5975, %f4963, %f4968, %f4967;
+ abs.f32 %f2111, %f1962;
+ setp.ltu.f32 %p1529, %f2111, 0f47CE4780;
+ @%p1529 bra $L__BB0_1812;
+
+ setp.eq.f32 %p1530, %f2111, 0f7F800000;
+ @%p1530 bra $L__BB0_1811;
+ bra.uni $L__BB0_1806;
+
+$L__BB0_1811:
+ mov.f32 %f4971, 0f00000000;
+ mul.rn.f32 %f5975, %f1962, %f4971;
+ mov.u32 %r8462, 0;
+ bra.uni $L__BB0_1812;
+
+$L__BB0_1806:
+ mov.b32 %r2401, %f1962;
+ shr.u32 %r7474, %r2401, 23;
+ and.b32 %r7475, %r7474, 255;
+ add.s32 %r2402, %r7475, -128;
+ shl.b32 %r7476, %r2401, 8;
+ or.b32 %r2403, %r7476, -2147483648;
+ shr.u32 %r2404, %r2402, 5;
+ mov.u64 %rd2780, 0;
+ mov.u32 %r8459, 0;
+ mov.u64 %rd2778, __cudart_i2opi_f;
+ mov.u64 %rd2779, %rd1;
+
+$L__BB0_1807:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7477, [%rd2778];
+ mad.wide.u32 %rd2441, %r7477, %r2403, %rd2780;
+ shr.u64 %rd2780, %rd2441, 32;
+ st.local.u32 [%rd2779], %rd2441;
+ add.s64 %rd2779, %rd2779, 4;
+ add.s64 %rd2778, %rd2778, 4;
+ add.s32 %r8459, %r8459, 1;
+ setp.ne.s32 %p1531, %r8459, 6;
+ @%p1531 bra $L__BB0_1807;
+
+ st.local.u32 [%rd4], %rd2780;
+ mov.u32 %r7478, 4;
+ sub.s32 %r2407, %r7478, %r2404;
+ mov.u32 %r7479, 6;
+ sub.s32 %r7480, %r7479, %r2404;
+ mul.wide.s32 %rd2442, %r7480, 4;
+ add.s64 %rd2443, %rd1, %rd2442;
+ ld.local.u32 %r8460, [%rd2443];
+ ld.local.u32 %r8461, [%rd2443+-4];
+ and.b32 %r2410, %r2402, 31;
+ setp.eq.s32 %p1532, %r2410, 0;
+ @%p1532 bra $L__BB0_1810;
+
+ mov.u32 %r7481, 32;
+ sub.s32 %r7482, %r7481, %r2410;
+ shr.u32 %r7483, %r8461, %r7482;
+ shl.b32 %r7484, %r8460, %r2410;
+ add.s32 %r8460, %r7483, %r7484;
+ mul.wide.s32 %rd2444, %r2407, 4;
+ add.s64 %rd2445, %rd1, %rd2444;
+ ld.local.u32 %r7485, [%rd2445];
+ shr.u32 %r7486, %r7485, %r7482;
+ shl.b32 %r7487, %r8461, %r2410;
+ add.s32 %r8461, %r7486, %r7487;
+
+$L__BB0_1810:
+ and.b32 %r7488, %r2401, -2147483648;
+ shr.u32 %r7489, %r8461, 30;
+ shl.b32 %r7490, %r8460, 2;
+ or.b32 %r7491, %r7489, %r7490;
+ shr.u32 %r7492, %r7491, 31;
+ shr.u32 %r7493, %r8460, 30;
+ add.s32 %r7494, %r7492, %r7493;
+ neg.s32 %r7495, %r7494;
+ setp.eq.s32 %p1533, %r7488, 0;
+ selp.b32 %r8462, %r7494, %r7495, %p1533;
+ setp.ne.s32 %p1534, %r7492, 0;
+ xor.b32 %r7496, %r7488, -2147483648;
+ selp.b32 %r7497, %r7496, %r7488, %p1534;
+ selp.b32 %r7498, -1, 0, %p1534;
+ xor.b32 %r7499, %r7491, %r7498;
+ shl.b32 %r7500, %r8461, 2;
+ xor.b32 %r7501, %r7500, %r7498;
+ cvt.u64.u32 %rd2446, %r7499;
+ cvt.u64.u32 %rd2447, %r7501;
+ bfi.b64 %rd2448, %rd2446, %rd2447, 32, 32;
+ cvt.rn.f64.s64 %fd247, %rd2448;
+ mul.f64 %fd248, %fd247, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4969, %fd248;
+ setp.eq.s32 %p1535, %r7497, 0;
+ neg.f32 %f4970, %f4969;
+ selp.f32 %f5975, %f4969, %f4970, %p1535;
+
+$L__BB0_1812:
+ add.s32 %r2417, %r8462, 1;
+ and.b32 %r2418, %r2417, 1;
+ setp.eq.s32 %p1536, %r2418, 0;
+ selp.f32 %f2115, %f5975, 0f3F800000, %p1536;
+ mul.rn.f32 %f2116, %f5975, %f5975;
+ mov.f32 %f5976, 0fB94D4153;
+ @%p1536 bra $L__BB0_1814;
+
+ mov.f32 %f4973, 0fBAB607ED;
+ mov.f32 %f4974, 0f37CBAC00;
+ fma.rn.f32 %f5976, %f4974, %f2116, %f4973;
+
+$L__BB0_1814:
+ selp.f32 %f4975, 0f3C0885E4, 0f3D2AAABB, %p1536;
+ fma.rn.f32 %f4976, %f5976, %f2116, %f4975;
+ selp.f32 %f4977, 0fBE2AAAA8, 0fBEFFFFFF, %p1536;
+ fma.rn.f32 %f4978, %f4976, %f2116, %f4977;
+ mov.f32 %f4979, 0f00000000;
+ fma.rn.f32 %f4980, %f2116, %f2115, %f4979;
+ fma.rn.f32 %f5977, %f4978, %f4980, %f2115;
+ and.b32 %r7503, %r2417, 2;
+ setp.eq.s32 %p1538, %r7503, 0;
+ @%p1538 bra $L__BB0_1816;
+
+ mov.f32 %f4982, 0fBF800000;
+ fma.rn.f32 %f5977, %f5977, %f4982, %f4979;
+
+$L__BB0_1816:
+ add.f32 %f5992, %f5974, %f5977;
+ mul.f32 %f4983, %f1971, 0f3F22F983;
+ cvt.rni.s32.f32 %r8466, %f4983;
+ cvt.rn.f32.s32 %f4984, %r8466;
+ mov.f32 %f4985, 0fBFC90FDA;
+ fma.rn.f32 %f4986, %f4984, %f4985, %f1971;
+ mov.f32 %f4987, 0fB3A22168;
+ fma.rn.f32 %f4988, %f4984, %f4987, %f4986;
+ mov.f32 %f4989, 0fA7C234C5;
+ fma.rn.f32 %f5978, %f4984, %f4989, %f4988;
+ abs.f32 %f2124, %f1971;
+ setp.ltu.f32 %p1539, %f2124, 0f47CE4780;
+ @%p1539 bra $L__BB0_1824;
+
+ setp.eq.f32 %p1540, %f2124, 0f7F800000;
+ @%p1540 bra $L__BB0_1823;
+ bra.uni $L__BB0_1818;
+
+$L__BB0_1823:
+ mov.f32 %f4992, 0f00000000;
+ mul.rn.f32 %f5978, %f1971, %f4992;
+ mov.u32 %r8466, 0;
+ bra.uni $L__BB0_1824;
+
+$L__BB0_1818:
+ mov.b32 %r2420, %f1971;
+ shr.u32 %r7505, %r2420, 23;
+ and.b32 %r7506, %r7505, 255;
+ add.s32 %r2421, %r7506, -128;
+ shl.b32 %r7507, %r2420, 8;
+ or.b32 %r2422, %r7507, -2147483648;
+ shr.u32 %r2423, %r2421, 5;
+ mov.u64 %rd2783, 0;
+ mov.u32 %r8463, 0;
+ mov.u64 %rd2781, __cudart_i2opi_f;
+ mov.u64 %rd2782, %rd1;
+
+$L__BB0_1819:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7508, [%rd2781];
+ mad.wide.u32 %rd2451, %r7508, %r2422, %rd2783;
+ shr.u64 %rd2783, %rd2451, 32;
+ st.local.u32 [%rd2782], %rd2451;
+ add.s64 %rd2782, %rd2782, 4;
+ add.s64 %rd2781, %rd2781, 4;
+ add.s32 %r8463, %r8463, 1;
+ setp.ne.s32 %p1541, %r8463, 6;
+ @%p1541 bra $L__BB0_1819;
+
+ st.local.u32 [%rd4], %rd2783;
+ mov.u32 %r7509, 4;
+ sub.s32 %r2426, %r7509, %r2423;
+ mov.u32 %r7510, 6;
+ sub.s32 %r7511, %r7510, %r2423;
+ mul.wide.s32 %rd2452, %r7511, 4;
+ add.s64 %rd2453, %rd1, %rd2452;
+ ld.local.u32 %r8464, [%rd2453];
+ ld.local.u32 %r8465, [%rd2453+-4];
+ and.b32 %r2429, %r2421, 31;
+ setp.eq.s32 %p1542, %r2429, 0;
+ @%p1542 bra $L__BB0_1822;
+
+ mov.u32 %r7512, 32;
+ sub.s32 %r7513, %r7512, %r2429;
+ shr.u32 %r7514, %r8465, %r7513;
+ shl.b32 %r7515, %r8464, %r2429;
+ add.s32 %r8464, %r7514, %r7515;
+ mul.wide.s32 %rd2454, %r2426, 4;
+ add.s64 %rd2455, %rd1, %rd2454;
+ ld.local.u32 %r7516, [%rd2455];
+ shr.u32 %r7517, %r7516, %r7513;
+ shl.b32 %r7518, %r8465, %r2429;
+ add.s32 %r8465, %r7517, %r7518;
+
+$L__BB0_1822:
+ and.b32 %r7519, %r2420, -2147483648;
+ shr.u32 %r7520, %r8465, 30;
+ shl.b32 %r7521, %r8464, 2;
+ or.b32 %r7522, %r7520, %r7521;
+ shr.u32 %r7523, %r7522, 31;
+ shr.u32 %r7524, %r8464, 30;
+ add.s32 %r7525, %r7523, %r7524;
+ neg.s32 %r7526, %r7525;
+ setp.eq.s32 %p1543, %r7519, 0;
+ selp.b32 %r8466, %r7525, %r7526, %p1543;
+ setp.ne.s32 %p1544, %r7523, 0;
+ xor.b32 %r7527, %r7519, -2147483648;
+ selp.b32 %r7528, %r7527, %r7519, %p1544;
+ selp.b32 %r7529, -1, 0, %p1544;
+ xor.b32 %r7530, %r7522, %r7529;
+ shl.b32 %r7531, %r8465, 2;
+ xor.b32 %r7532, %r7531, %r7529;
+ cvt.u64.u32 %rd2456, %r7530;
+ cvt.u64.u32 %rd2457, %r7532;
+ bfi.b64 %rd2458, %rd2456, %rd2457, 32, 32;
+ cvt.rn.f64.s64 %fd249, %rd2458;
+ mul.f64 %fd250, %fd249, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4990, %fd250;
+ setp.eq.s32 %p1545, %r7528, 0;
+ neg.f32 %f4991, %f4990;
+ selp.f32 %f5978, %f4990, %f4991, %p1545;
+
+$L__BB0_1824:
+ and.b32 %r2436, %r8466, 1;
+ setp.eq.s32 %p1546, %r2436, 0;
+ selp.f32 %f2128, %f5978, 0f3F800000, %p1546;
+ mul.rn.f32 %f2129, %f5978, %f5978;
+ mov.f32 %f5979, 0fB94D4153;
+ @%p1546 bra $L__BB0_1826;
+
+ mov.f32 %f4994, 0fBAB607ED;
+ mov.f32 %f4995, 0f37CBAC00;
+ fma.rn.f32 %f5979, %f4995, %f2129, %f4994;
+
+$L__BB0_1826:
+ selp.f32 %f4996, 0f3C0885E4, 0f3D2AAABB, %p1546;
+ fma.rn.f32 %f4997, %f5979, %f2129, %f4996;
+ selp.f32 %f4998, 0fBE2AAAA8, 0fBEFFFFFF, %p1546;
+ fma.rn.f32 %f4999, %f4997, %f2129, %f4998;
+ mov.f32 %f5000, 0f00000000;
+ fma.rn.f32 %f5001, %f2129, %f2128, %f5000;
+ fma.rn.f32 %f5980, %f4999, %f5001, %f2128;
+ and.b32 %r7534, %r8466, 2;
+ setp.eq.s32 %p1548, %r7534, 0;
+ @%p1548 bra $L__BB0_1828;
+
+ mov.f32 %f5003, 0fBF800000;
+ fma.rn.f32 %f5980, %f5980, %f5003, %f5000;
+
+$L__BB0_1828:
+ mul.f32 %f5004, %f1963, 0f3F22F983;
+ cvt.rni.s32.f32 %r8470, %f5004;
+ cvt.rn.f32.s32 %f5005, %r8470;
+ mov.f32 %f5006, 0fBFC90FDA;
+ fma.rn.f32 %f5007, %f5005, %f5006, %f1963;
+ mov.f32 %f5008, 0fB3A22168;
+ fma.rn.f32 %f5009, %f5005, %f5008, %f5007;
+ mov.f32 %f5010, 0fA7C234C5;
+ fma.rn.f32 %f5981, %f5005, %f5010, %f5009;
+ abs.f32 %f2136, %f1963;
+ setp.ltu.f32 %p1549, %f2136, 0f47CE4780;
+ @%p1549 bra $L__BB0_1836;
+
+ setp.eq.f32 %p1550, %f2136, 0f7F800000;
+ @%p1550 bra $L__BB0_1835;
+ bra.uni $L__BB0_1830;
+
+$L__BB0_1835:
+ mov.f32 %f5013, 0f00000000;
+ mul.rn.f32 %f5981, %f1963, %f5013;
+ mov.u32 %r8470, 0;
+ bra.uni $L__BB0_1836;
+
+$L__BB0_1830:
+ mov.b32 %r2438, %f1963;
+ shr.u32 %r7536, %r2438, 23;
+ and.b32 %r7537, %r7536, 255;
+ add.s32 %r2439, %r7537, -128;
+ shl.b32 %r7538, %r2438, 8;
+ or.b32 %r2440, %r7538, -2147483648;
+ shr.u32 %r2441, %r2439, 5;
+ mov.u64 %rd2786, 0;
+ mov.u32 %r8467, 0;
+ mov.u64 %rd2784, __cudart_i2opi_f;
+ mov.u64 %rd2785, %rd1;
+
+$L__BB0_1831:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7539, [%rd2784];
+ mad.wide.u32 %rd2461, %r7539, %r2440, %rd2786;
+ shr.u64 %rd2786, %rd2461, 32;
+ st.local.u32 [%rd2785], %rd2461;
+ add.s64 %rd2785, %rd2785, 4;
+ add.s64 %rd2784, %rd2784, 4;
+ add.s32 %r8467, %r8467, 1;
+ setp.ne.s32 %p1551, %r8467, 6;
+ @%p1551 bra $L__BB0_1831;
+
+ st.local.u32 [%rd4], %rd2786;
+ mov.u32 %r7540, 4;
+ sub.s32 %r2444, %r7540, %r2441;
+ mov.u32 %r7541, 6;
+ sub.s32 %r7542, %r7541, %r2441;
+ mul.wide.s32 %rd2462, %r7542, 4;
+ add.s64 %rd2463, %rd1, %rd2462;
+ ld.local.u32 %r8468, [%rd2463];
+ ld.local.u32 %r8469, [%rd2463+-4];
+ and.b32 %r2447, %r2439, 31;
+ setp.eq.s32 %p1552, %r2447, 0;
+ @%p1552 bra $L__BB0_1834;
+
+ mov.u32 %r7543, 32;
+ sub.s32 %r7544, %r7543, %r2447;
+ shr.u32 %r7545, %r8469, %r7544;
+ shl.b32 %r7546, %r8468, %r2447;
+ add.s32 %r8468, %r7545, %r7546;
+ mul.wide.s32 %rd2464, %r2444, 4;
+ add.s64 %rd2465, %rd1, %rd2464;
+ ld.local.u32 %r7547, [%rd2465];
+ shr.u32 %r7548, %r7547, %r7544;
+ shl.b32 %r7549, %r8469, %r2447;
+ add.s32 %r8469, %r7548, %r7549;
+
+$L__BB0_1834:
+ and.b32 %r7550, %r2438, -2147483648;
+ shr.u32 %r7551, %r8469, 30;
+ shl.b32 %r7552, %r8468, 2;
+ or.b32 %r7553, %r7551, %r7552;
+ shr.u32 %r7554, %r7553, 31;
+ shr.u32 %r7555, %r8468, 30;
+ add.s32 %r7556, %r7554, %r7555;
+ neg.s32 %r7557, %r7556;
+ setp.eq.s32 %p1553, %r7550, 0;
+ selp.b32 %r8470, %r7556, %r7557, %p1553;
+ setp.ne.s32 %p1554, %r7554, 0;
+ xor.b32 %r7558, %r7550, -2147483648;
+ selp.b32 %r7559, %r7558, %r7550, %p1554;
+ selp.b32 %r7560, -1, 0, %p1554;
+ xor.b32 %r7561, %r7553, %r7560;
+ shl.b32 %r7562, %r8469, 2;
+ xor.b32 %r7563, %r7562, %r7560;
+ cvt.u64.u32 %rd2466, %r7561;
+ cvt.u64.u32 %rd2467, %r7563;
+ bfi.b64 %rd2468, %rd2466, %rd2467, 32, 32;
+ cvt.rn.f64.s64 %fd251, %rd2468;
+ mul.f64 %fd252, %fd251, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f5011, %fd252;
+ setp.eq.s32 %p1555, %r7559, 0;
+ neg.f32 %f5012, %f5011;
+ selp.f32 %f5981, %f5011, %f5012, %p1555;
+
+$L__BB0_1836:
+ add.s32 %r2454, %r8470, 1;
+ and.b32 %r2455, %r2454, 1;
+ setp.eq.s32 %p1556, %r2455, 0;
+ selp.f32 %f2140, %f5981, 0f3F800000, %p1556;
+ mul.rn.f32 %f2141, %f5981, %f5981;
+ mov.f32 %f5982, 0fB94D4153;
+ @%p1556 bra $L__BB0_1838;
+
+ mov.f32 %f5015, 0fBAB607ED;
+ mov.f32 %f5016, 0f37CBAC00;
+ fma.rn.f32 %f5982, %f5016, %f2141, %f5015;
+
+$L__BB0_1838:
+ selp.f32 %f5017, 0f3C0885E4, 0f3D2AAABB, %p1556;
+ fma.rn.f32 %f5018, %f5982, %f2141, %f5017;
+ selp.f32 %f5019, 0fBE2AAAA8, 0fBEFFFFFF, %p1556;
+ fma.rn.f32 %f5020, %f5018, %f2141, %f5019;
+ mov.f32 %f5021, 0f00000000;
+ fma.rn.f32 %f5022, %f2141, %f2140, %f5021;
+ fma.rn.f32 %f5983, %f5020, %f5022, %f2140;
+ and.b32 %r7565, %r2454, 2;
+ setp.eq.s32 %p1558, %r7565, 0;
+ @%p1558 bra $L__BB0_1840;
+
+ mov.f32 %f5024, 0fBF800000;
+ fma.rn.f32 %f5983, %f5983, %f5024, %f5021;
+
+$L__BB0_1840:
+ add.f32 %f5991, %f5980, %f5983;
+ mul.f32 %f5025, %f1972, 0f3F22F983;
+ cvt.rni.s32.f32 %r8474, %f5025;
+ cvt.rn.f32.s32 %f5026, %r8474;
+ mov.f32 %f5027, 0fBFC90FDA;
+ fma.rn.f32 %f5028, %f5026, %f5027, %f1972;
+ mov.f32 %f5029, 0fB3A22168;
+ fma.rn.f32 %f5030, %f5026, %f5029, %f5028;
+ mov.f32 %f5031, 0fA7C234C5;
+ fma.rn.f32 %f5984, %f5026, %f5031, %f5030;
+ abs.f32 %f2149, %f1972;
+ setp.ltu.f32 %p1559, %f2149, 0f47CE4780;
+ @%p1559 bra $L__BB0_1848;
+
+ setp.eq.f32 %p1560, %f2149, 0f7F800000;
+ @%p1560 bra $L__BB0_1847;
+ bra.uni $L__BB0_1842;
+
+$L__BB0_1847:
+ mov.f32 %f5034, 0f00000000;
+ mul.rn.f32 %f5984, %f1972, %f5034;
+ mov.u32 %r8474, 0;
+ bra.uni $L__BB0_1848;
+
+$L__BB0_1842:
+ mov.b32 %r2457, %f1972;
+ shr.u32 %r7567, %r2457, 23;
+ and.b32 %r7568, %r7567, 255;
+ add.s32 %r2458, %r7568, -128;
+ shl.b32 %r7569, %r2457, 8;
+ or.b32 %r2459, %r7569, -2147483648;
+ shr.u32 %r2460, %r2458, 5;
+ mov.u64 %rd2789, 0;
+ mov.u32 %r8471, 0;
+ mov.u64 %rd2787, __cudart_i2opi_f;
+ mov.u64 %rd2788, %rd1;
+
+$L__BB0_1843:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7570, [%rd2787];
+ mad.wide.u32 %rd2471, %r7570, %r2459, %rd2789;
+ shr.u64 %rd2789, %rd2471, 32;
+ st.local.u32 [%rd2788], %rd2471;
+ add.s64 %rd2788, %rd2788, 4;
+ add.s64 %rd2787, %rd2787, 4;
+ add.s32 %r8471, %r8471, 1;
+ setp.ne.s32 %p1561, %r8471, 6;
+ @%p1561 bra $L__BB0_1843;
+
+ st.local.u32 [%rd4], %rd2789;
+ mov.u32 %r7571, 4;
+ sub.s32 %r2463, %r7571, %r2460;
+ mov.u32 %r7572, 6;
+ sub.s32 %r7573, %r7572, %r2460;
+ mul.wide.s32 %rd2472, %r7573, 4;
+ add.s64 %rd2473, %rd1, %rd2472;
+ ld.local.u32 %r8472, [%rd2473];
+ ld.local.u32 %r8473, [%rd2473+-4];
+ and.b32 %r2466, %r2458, 31;
+ setp.eq.s32 %p1562, %r2466, 0;
+ @%p1562 bra $L__BB0_1846;
+
+ mov.u32 %r7574, 32;
+ sub.s32 %r7575, %r7574, %r2466;
+ shr.u32 %r7576, %r8473, %r7575;
+ shl.b32 %r7577, %r8472, %r2466;
+ add.s32 %r8472, %r7576, %r7577;
+ mul.wide.s32 %rd2474, %r2463, 4;
+ add.s64 %rd2475, %rd1, %rd2474;
+ ld.local.u32 %r7578, [%rd2475];
+ shr.u32 %r7579, %r7578, %r7575;
+ shl.b32 %r7580, %r8473, %r2466;
+ add.s32 %r8473, %r7579, %r7580;
+
+$L__BB0_1846:
+ and.b32 %r7581, %r2457, -2147483648;
+ shr.u32 %r7582, %r8473, 30;
+ shl.b32 %r7583, %r8472, 2;
+ or.b32 %r7584, %r7582, %r7583;
+ shr.u32 %r7585, %r7584, 31;
+ shr.u32 %r7586, %r8472, 30;
+ add.s32 %r7587, %r7585, %r7586;
+ neg.s32 %r7588, %r7587;
+ setp.eq.s32 %p1563, %r7581, 0;
+ selp.b32 %r8474, %r7587, %r7588, %p1563;
+ setp.ne.s32 %p1564, %r7585, 0;
+ xor.b32 %r7589, %r7581, -2147483648;
+ selp.b32 %r7590, %r7589, %r7581, %p1564;
+ selp.b32 %r7591, -1, 0, %p1564;
+ xor.b32 %r7592, %r7584, %r7591;
+ shl.b32 %r7593, %r8473, 2;
+ xor.b32 %r7594, %r7593, %r7591;
+ cvt.u64.u32 %rd2476, %r7592;
+ cvt.u64.u32 %rd2477, %r7594;
+ bfi.b64 %rd2478, %rd2476, %rd2477, 32, 32;
+ cvt.rn.f64.s64 %fd253, %rd2478;
+ mul.f64 %fd254, %fd253, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f5032, %fd254;
+ setp.eq.s32 %p1565, %r7590, 0;
+ neg.f32 %f5033, %f5032;
+ selp.f32 %f5984, %f5032, %f5033, %p1565;
+
+$L__BB0_1848:
+ and.b32 %r2473, %r8474, 1;
+ setp.eq.s32 %p1566, %r2473, 0;
+ selp.f32 %f2153, %f5984, 0f3F800000, %p1566;
+ mul.rn.f32 %f2154, %f5984, %f5984;
+ mov.f32 %f5985, 0fB94D4153;
+ @%p1566 bra $L__BB0_1850;
+
+ mov.f32 %f5036, 0fBAB607ED;
+ mov.f32 %f5037, 0f37CBAC00;
+ fma.rn.f32 %f5985, %f5037, %f2154, %f5036;
+
+$L__BB0_1850:
+ selp.f32 %f5038, 0f3C0885E4, 0f3D2AAABB, %p1566;
+ fma.rn.f32 %f5039, %f5985, %f2154, %f5038;
+ selp.f32 %f5040, 0fBE2AAAA8, 0fBEFFFFFF, %p1566;
+ fma.rn.f32 %f5041, %f5039, %f2154, %f5040;
+ mov.f32 %f5042, 0f00000000;
+ fma.rn.f32 %f5043, %f2154, %f2153, %f5042;
+ fma.rn.f32 %f5986, %f5041, %f5043, %f2153;
+ and.b32 %r7596, %r8474, 2;
+ setp.eq.s32 %p1568, %r7596, 0;
+ @%p1568 bra $L__BB0_1852;
+
+ mov.f32 %f5045, 0fBF800000;
+ fma.rn.f32 %f5986, %f5986, %f5045, %f5042;
+
+$L__BB0_1852:
+ mul.f32 %f5046, %f1964, 0f3F22F983;
+ cvt.rni.s32.f32 %r8478, %f5046;
+ cvt.rn.f32.s32 %f5047, %r8478;
+ mov.f32 %f5048, 0fBFC90FDA;
+ fma.rn.f32 %f5049, %f5047, %f5048, %f1964;
+ mov.f32 %f5050, 0fB3A22168;
+ fma.rn.f32 %f5051, %f5047, %f5050, %f5049;
+ mov.f32 %f5052, 0fA7C234C5;
+ fma.rn.f32 %f5987, %f5047, %f5052, %f5051;
+ abs.f32 %f2161, %f1964;
+ setp.ltu.f32 %p1569, %f2161, 0f47CE4780;
+ @%p1569 bra $L__BB0_1860;
+
+ setp.eq.f32 %p1570, %f2161, 0f7F800000;
+ @%p1570 bra $L__BB0_1859;
+ bra.uni $L__BB0_1854;
+
+$L__BB0_1859:
+ mov.f32 %f5055, 0f00000000;
+ mul.rn.f32 %f5987, %f1964, %f5055;
+ mov.u32 %r8478, 0;
+ bra.uni $L__BB0_1860;
+
+$L__BB0_1854:
+ mov.b32 %r2475, %f1964;
+ shr.u32 %r7598, %r2475, 23;
+ and.b32 %r7599, %r7598, 255;
+ add.s32 %r2476, %r7599, -128;
+ shl.b32 %r7600, %r2475, 8;
+ or.b32 %r2477, %r7600, -2147483648;
+ shr.u32 %r2478, %r2476, 5;
+ mov.u64 %rd2792, 0;
+ mov.u32 %r8475, 0;
+ mov.u64 %rd2790, __cudart_i2opi_f;
+ mov.u64 %rd2791, %rd1;
+
+$L__BB0_1855:
+ .pragma "nounroll";
+ ld.global.nc.u32 %r7601, [%rd2790];
+ mad.wide.u32 %rd2481, %r7601, %r2477, %rd2792;
+ shr.u64 %rd2792, %rd2481, 32;
+ st.local.u32 [%rd2791], %rd2481;
+ add.s64 %rd2791, %rd2791, 4;
+ add.s64 %rd2790, %rd2790, 4;
+ add.s32 %r8475, %r8475, 1;
+ setp.ne.s32 %p1571, %r8475, 6;
+ @%p1571 bra $L__BB0_1855;
+
+ st.local.u32 [%rd4], %rd2792;
+ mov.u32 %r7602, 4;
+ sub.s32 %r2481, %r7602, %r2478;
+ mov.u32 %r7603, 6;
+ sub.s32 %r7604, %r7603, %r2478;
+ mul.wide.s32 %rd2482, %r7604, 4;
+ add.s64 %rd2483, %rd1, %rd2482;
+ ld.local.u32 %r8476, [%rd2483];
+ ld.local.u32 %r8477, [%rd2483+-4];
+ and.b32 %r2484, %r2476, 31;
+ setp.eq.s32 %p1572, %r2484, 0;
+ @%p1572 bra $L__BB0_1858;
+
+ mov.u32 %r7605, 32;
+ sub.s32 %r7606, %r7605, %r2484;
+ shr.u32 %r7607, %r8477, %r7606;
+ shl.b32 %r7608, %r8476, %r2484;
+ add.s32 %r8476, %r7607, %r7608;
+ mul.wide.s32 %rd2484, %r2481, 4;
+ add.s64 %rd2485, %rd1, %rd2484;
+ ld.local.u32 %r7609, [%rd2485];
+ shr.u32 %r7610, %r7609, %r7606;
+ shl.b32 %r7611, %r8477, %r2484;
+ add.s32 %r8477, %r7610, %r7611;
+
+$L__BB0_1858:
+ and.b32 %r7612, %r2475, -2147483648;
+ shr.u32 %r7613, %r8477, 30;
+ shl.b32 %r7614, %r8476, 2;
+ or.b32 %r7615, %r7613, %r7614;
+ shr.u32 %r7616, %r7615, 31;
+ shr.u32 %r7617, %r8476, 30;
+ add.s32 %r7618, %r7616, %r7617;
+ neg.s32 %r7619, %r7618;
+ setp.eq.s32 %p1573, %r7612, 0;
+ selp.b32 %r8478, %r7618, %r7619, %p1573;
+ setp.ne.s32 %p1574, %r7616, 0;
+ xor.b32 %r7620, %r7612, -2147483648;
+ selp.b32 %r7621, %r7620, %r7612, %p1574;
+ selp.b32 %r7622, -1, 0, %p1574;
+ xor.b32 %r7623, %r7615, %r7622;
+ shl.b32 %r7624, %r8477, 2;
+ xor.b32 %r7625, %r7624, %r7622;
+ cvt.u64.u32 %rd2486, %r7623;
+ cvt.u64.u32 %rd2487, %r7625;
+ bfi.b64 %rd2488, %rd2486, %rd2487, 32, 32;
+ cvt.rn.f64.s64 %fd255, %rd2488;
+ mul.f64 %fd256, %fd255, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f5053, %fd256;
+ setp.eq.s32 %p1575, %r7621, 0;
+ neg.f32 %f5054, %f5053;
+ selp.f32 %f5987, %f5053, %f5054, %p1575;
+
+$L__BB0_1860:
+ add.s32 %r2491, %r8478, 1;
+ and.b32 %r2492, %r2491, 1;
+ setp.eq.s32 %p1576, %r2492, 0;
+ selp.f32 %f2165, %f5987, 0f3F800000, %p1576;
+ mul.rn.f32 %f2166, %f5987, %f5987;
+ mov.f32 %f5988, 0fB94D4153;
+ @%p1576 bra $L__BB0_1862;
+
+ mov.f32 %f5057, 0fBAB607ED;
+ mov.f32 %f5058, 0f37CBAC00;
+ fma.rn.f32 %f5988, %f5058, %f2166, %f5057;
+
+$L__BB0_1862:
+ selp.f32 %f5059, 0f3C0885E4, 0f3D2AAABB, %p1576;
+ fma.rn.f32 %f5060, %f5988, %f2166, %f5059;
+ selp.f32 %f5061, 0fBE2AAAA8, 0fBEFFFFFF, %p1576;
+ fma.rn.f32 %f5062, %f5060, %f2166, %f5061;
+ mov.f32 %f5063, 0f00000000;
+ fma.rn.f32 %f5064, %f2166, %f2165, %f5063;
+ fma.rn.f32 %f5989, %f5062, %f5064, %f2165;
+ and.b32 %r7627, %r2491, 2;
+ setp.eq.s32 %p1578, %r7627, 0;
+ @%p1578 bra $L__BB0_1864;
+
+ mov.f32 %f5066, 0fBF800000;
+ fma.rn.f32 %f5989, %f5989, %f5066, %f5063;
+
+$L__BB0_1864:
+ add.f32 %f5990, %f5986, %f5989;
+ bra.uni $L__BB0_1865;
+
+$L__BB0_1444:
+ mov.b32 %r1895, %f5416;
+ shr.u32 %r6564, %r1895, 23;
+ and.b32 %r6565, %r6564, 255;
+ add.s32 %r1896, %r6565, -128;
+ shl.b32 %r6566, %r1895, 8;
+ or.b32 %r1897, %r6566, -2147483648;
+ shr.u32 %r1898, %r1896, 5;
mov.u64 %rd2699, 0;
- mov.u32 %r8606, 0;
+ mov.u32 %r8351, 0;
mov.u64 %rd2697, __cudart_i2opi_f;
mov.u64 %rd2698, %rd1;
-$L__BB0_1416:
+$L__BB0_1445:
.pragma "nounroll";
- ld.global.nc.u32 %r6482, [%rd2697];
- mad.wide.u32 %rd2087, %r6482, %r1893, %rd2699;
- shr.u64 %rd2699, %rd2087, 32;
- st.local.u32 [%rd2698], %rd2087;
+ ld.global.nc.u32 %r6567, [%rd2697];
+ mad.wide.u32 %rd2145, %r6567, %r1897, %rd2699;
+ shr.u64 %rd2699, %rd2145, 32;
+ st.local.u32 [%rd2698], %rd2145;
add.s64 %rd2698, %rd2698, 4;
add.s64 %rd2697, %rd2697, 4;
- add.s32 %r8606, %r8606, 1;
- setp.ne.s32 %p1204, %r8606, 6;
- @%p1204 bra $L__BB0_1416;
-
- st.local.u32 [%rd5], %rd2699;
- mov.u32 %r6483, 4;
- sub.s32 %r1897, %r6483, %r1894;
- mov.u32 %r6484, 6;
- sub.s32 %r6485, %r6484, %r1894;
- mul.wide.s32 %rd2088, %r6485, 4;
- add.s64 %rd2089, %rd1, %rd2088;
- ld.local.u32 %r8607, [%rd2089];
- ld.local.u32 %r8608, [%rd2089+-4];
- and.b32 %r1900, %r1892, 31;
- setp.eq.s32 %p1205, %r1900, 0;
- @%p1205 bra $L__BB0_1419;
-
- mov.u32 %r6486, 32;
- sub.s32 %r6487, %r6486, %r1900;
- shr.u32 %r6488, %r8608, %r6487;
- shl.b32 %r6489, %r8607, %r1900;
- add.s32 %r8607, %r6488, %r6489;
- mul.wide.s32 %rd2090, %r1897, 4;
- add.s64 %rd2091, %rd1, %rd2090;
- ld.local.u32 %r6490, [%rd2091];
- shr.u32 %r6491, %r6490, %r6487;
- shl.b32 %r6492, %r8608, %r1900;
- add.s32 %r8608, %r6491, %r6492;
-
-$L__BB0_1419:
- and.b32 %r6493, %r1891, -2147483648;
- shr.u32 %r6494, %r8608, 30;
- shl.b32 %r6495, %r8607, 2;
- or.b32 %r6496, %r6494, %r6495;
- shr.u32 %r6497, %r6496, 31;
- shr.u32 %r6498, %r8607, 30;
- add.s32 %r6499, %r6497, %r6498;
- neg.s32 %r6500, %r6499;
- setp.eq.s32 %p1206, %r6493, 0;
- selp.b32 %r8609, %r6499, %r6500, %p1206;
- setp.ne.s32 %p1207, %r6497, 0;
- xor.b32 %r6501, %r6493, -2147483648;
- selp.b32 %r6502, %r6501, %r6493, %p1207;
- selp.b32 %r6503, -1, 0, %p1207;
- xor.b32 %r6504, %r6496, %r6503;
- shl.b32 %r6505, %r8608, 2;
- xor.b32 %r6506, %r6505, %r6503;
- cvt.u64.u32 %rd2092, %r6504;
- cvt.u64.u32 %rd2093, %r6506;
- bfi.b64 %rd2094, %rd2092, %rd2093, 32, 32;
- cvt.rn.f64.s64 %fd191, %rd2094;
- mul.f64 %fd192, %fd191, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4309, %fd192;
- setp.eq.s32 %p1208, %r6502, 0;
- neg.f32 %f4310, %f4309;
- selp.f32 %f5704, %f4309, %f4310, %p1208;
-
-$L__BB0_1421:
- add.s32 %r1907, %r8609, 1;
- and.b32 %r1908, %r1907, 1;
- setp.eq.s32 %p1209, %r1908, 0;
- selp.f32 %f1590, %f5704, 0f3F800000, %p1209;
- mul.rn.f32 %f1591, %f5704, %f5704;
- mov.f32 %f5705, 0fB94D4153;
- @%p1209 bra $L__BB0_1423;
-
- mov.f32 %f4313, 0fBAB607ED;
- mov.f32 %f4314, 0f37CBAC00;
- fma.rn.f32 %f5705, %f4314, %f1591, %f4313;
-
-$L__BB0_1423:
- selp.f32 %f4315, 0f3C0885E4, 0f3D2AAABB, %p1209;
- fma.rn.f32 %f4316, %f5705, %f1591, %f4315;
- selp.f32 %f4317, 0fBE2AAAA8, 0fBEFFFFFF, %p1209;
- fma.rn.f32 %f4318, %f4316, %f1591, %f4317;
- mov.f32 %f4319, 0f00000000;
- fma.rn.f32 %f4320, %f1591, %f1590, %f4319;
- fma.rn.f32 %f5706, %f4318, %f4320, %f1590;
- and.b32 %r6508, %r1907, 2;
- setp.eq.s32 %p1211, %r6508, 0;
- @%p1211 bra $L__BB0_1425;
-
- mov.f32 %f4322, 0fBF800000;
- fma.rn.f32 %f5706, %f5706, %f4322, %f4319;
-
-$L__BB0_1425:
- add.f32 %f5707, %f5703, %f5706;
- bra.uni $L__BB0_1426;
-
-$L__BB0_1005:
- mov.b32 %r1311, %f5348;
- shr.u32 %r5425, %r1311, 23;
- and.b32 %r5426, %r5425, 255;
- add.s32 %r1312, %r5426, -128;
- shl.b32 %r5427, %r1311, 8;
- or.b32 %r1313, %r5427, -2147483648;
- shr.u32 %r1314, %r1312, 5;
- mov.u64 %rd2626, 0;
- mov.u32 %r8482, 0;
- mov.u64 %rd1661, __cudart_i2opi_f;
- mov.u64 %rd2627, %rd2626;
-
-$L__BB0_1006:
- .pragma "nounroll";
- shl.b64 %rd1660, %rd2626, 2;
- add.s64 %rd1662, %rd1661, %rd1660;
- ld.global.nc.u32 %r5428, [%rd1662];
- mad.wide.u32 %rd1663, %r5428, %r1313, %rd2627;
- shr.u64 %rd2627, %rd1663, 32;
- add.s64 %rd1664, %rd1, %rd1660;
- st.local.u32 [%rd1664], %rd1663;
- add.s32 %r8482, %r8482, 1;
- cvt.s64.s32 %rd2626, %r8482;
- setp.ne.s32 %p856, %r8482, 6;
- @%p856 bra $L__BB0_1006;
-
- st.local.u32 [%rd5], %rd2627;
- mov.u32 %r5429, 4;
- sub.s32 %r1317, %r5429, %r1314;
- mov.u32 %r5430, 6;
- sub.s32 %r5431, %r5430, %r1314;
- mul.wide.s32 %rd1665, %r5431, 4;
- add.s64 %rd1666, %rd1, %rd1665;
- ld.local.u32 %r8483, [%rd1666];
- ld.local.u32 %r8484, [%rd1666+-4];
- and.b32 %r1320, %r1312, 31;
- setp.eq.s32 %p857, %r1320, 0;
- @%p857 bra $L__BB0_1009;
-
- mov.u32 %r5432, 32;
- sub.s32 %r5433, %r5432, %r1320;
- shr.u32 %r5434, %r8484, %r5433;
- shl.b32 %r5435, %r8483, %r1320;
- add.s32 %r8483, %r5434, %r5435;
- mul.wide.s32 %rd1667, %r1317, 4;
- add.s64 %rd1668, %rd1, %rd1667;
- ld.local.u32 %r5436, [%rd1668];
- shr.u32 %r5437, %r5436, %r5433;
- shl.b32 %r5438, %r8484, %r1320;
- add.s32 %r8484, %r5437, %r5438;
-
-$L__BB0_1009:
- and.b32 %r5439, %r1311, -2147483648;
- shr.u32 %r5440, %r8484, 30;
- shl.b32 %r5441, %r8483, 2;
- or.b32 %r5442, %r5440, %r5441;
- shr.u32 %r5443, %r5442, 31;
- shr.u32 %r5444, %r8483, 30;
- add.s32 %r5445, %r5443, %r5444;
- neg.s32 %r5446, %r5445;
- setp.eq.s32 %p858, %r5439, 0;
- selp.b32 %r8485, %r5445, %r5446, %p858;
- setp.ne.s32 %p859, %r5443, 0;
- xor.b32 %r5447, %r5439, -2147483648;
- selp.b32 %r5448, %r5447, %r5439, %p859;
- selp.b32 %r5449, -1, 0, %p859;
- xor.b32 %r5450, %r5442, %r5449;
- shl.b32 %r5451, %r8484, 2;
- xor.b32 %r5452, %r5451, %r5449;
- cvt.u64.u32 %rd1669, %r5450;
- cvt.u64.u32 %rd1670, %r5452;
- bfi.b64 %rd1671, %rd1669, %rd1670, 32, 32;
- cvt.rn.f64.s64 %fd129, %rd1671;
- mul.f64 %fd130, %fd129, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3643, %fd130;
- setp.eq.s32 %p860, %r5448, 0;
- neg.f32 %f3644, %f3643;
- selp.f32 %f5560, %f3643, %f3644, %p860;
-
-$L__BB0_1011:
- and.b32 %r1327, %r8485, 1;
- setp.eq.s32 %p861, %r1327, 0;
- selp.f32 %f1120, %f5560, 0f3F800000, %p861;
- mul.rn.f32 %f1121, %f5560, %f5560;
- mov.f32 %f5561, 0fB94D4153;
- @%p861 bra $L__BB0_1013;
-
- mov.f32 %f3647, 0fBAB607ED;
- mov.f32 %f3648, 0f37CBAC00;
- fma.rn.f32 %f5561, %f3648, %f1121, %f3647;
-
-$L__BB0_1013:
- selp.f32 %f3649, 0f3C0885E4, 0f3D2AAABB, %p861;
- fma.rn.f32 %f3650, %f5561, %f1121, %f3649;
- selp.f32 %f3651, 0fBE2AAAA8, 0fBEFFFFFF, %p861;
- fma.rn.f32 %f3652, %f3650, %f1121, %f3651;
- mov.f32 %f3653, 0f00000000;
- fma.rn.f32 %f3654, %f1121, %f1120, %f3653;
- fma.rn.f32 %f5213, %f3652, %f3654, %f1120;
- and.b32 %r5454, %r8485, 2;
- setp.eq.s32 %p863, %r5454, 0;
- @%p863 bra $L__BB0_1015;
-
- mov.f32 %f3656, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3656, %f3653;
-
-$L__BB0_1015:
- setp.lt.s32 %p16, %r14, %r1309;
- @%p853 bra $L__BB0_1028;
-
- mul.f32 %f3657, %f5531, 0f3F22F983;
- cvt.rni.s32.f32 %r8489, %f3657;
- cvt.rn.f32.s32 %f3658, %r8489;
- mov.f32 %f3659, 0fBFC90FDA;
- fma.rn.f32 %f3660, %f3658, %f3659, %f5531;
- mov.f32 %f3661, 0fB3A22168;
- fma.rn.f32 %f3662, %f3658, %f3661, %f3660;
- mov.f32 %f3663, 0fA7C234C5;
- fma.rn.f32 %f5564, %f3658, %f3663, %f3662;
- abs.f32 %f1129, %f5531;
- setp.ltu.f32 %p865, %f1129, 0f47CE4780;
- @%p865 bra $L__BB0_1024;
-
- setp.eq.f32 %p866, %f1129, 0f7F800000;
- @%p866 bra $L__BB0_1023;
- bra.uni $L__BB0_1018;
-
-$L__BB0_1023:
- mov.f32 %f3666, 0f00000000;
- mul.rn.f32 %f5564, %f5531, %f3666;
- mov.u32 %r8489, 0;
- bra.uni $L__BB0_1024;
-
-$L__BB0_1018:
- mov.b32 %r1329, %f5531;
- shr.u32 %r5456, %r1329, 23;
- and.b32 %r5457, %r5456, 255;
- add.s32 %r1330, %r5457, -128;
- shl.b32 %r5458, %r1329, 8;
- or.b32 %r1331, %r5458, -2147483648;
- shr.u32 %r1332, %r1330, 5;
- mov.u64 %rd2628, 0;
- mov.u32 %r8486, 0;
- mov.u64 %rd1675, __cudart_i2opi_f;
- mov.u64 %rd2629, %rd2628;
-
-$L__BB0_1019:
- .pragma "nounroll";
- shl.b64 %rd1674, %rd2628, 2;
- add.s64 %rd1676, %rd1675, %rd1674;
- ld.global.nc.u32 %r5459, [%rd1676];
- mad.wide.u32 %rd1677, %r5459, %r1331, %rd2629;
- shr.u64 %rd2629, %rd1677, 32;
- add.s64 %rd1678, %rd1, %rd1674;
- st.local.u32 [%rd1678], %rd1677;
- add.s32 %r8486, %r8486, 1;
- cvt.s64.s32 %rd2628, %r8486;
- setp.ne.s32 %p867, %r8486, 6;
- @%p867 bra $L__BB0_1019;
-
- st.local.u32 [%rd5], %rd2629;
- mov.u32 %r5460, 4;
- sub.s32 %r1335, %r5460, %r1332;
- mov.u32 %r5461, 6;
- sub.s32 %r5462, %r5461, %r1332;
- mul.wide.s32 %rd1679, %r5462, 4;
- add.s64 %rd1680, %rd1, %rd1679;
- ld.local.u32 %r8487, [%rd1680];
- ld.local.u32 %r8488, [%rd1680+-4];
- and.b32 %r1338, %r1330, 31;
- setp.eq.s32 %p868, %r1338, 0;
- @%p868 bra $L__BB0_1022;
-
- mov.u32 %r5463, 32;
- sub.s32 %r5464, %r5463, %r1338;
- shr.u32 %r5465, %r8488, %r5464;
- shl.b32 %r5466, %r8487, %r1338;
- add.s32 %r8487, %r5465, %r5466;
- mul.wide.s32 %rd1681, %r1335, 4;
- add.s64 %rd1682, %rd1, %rd1681;
- ld.local.u32 %r5467, [%rd1682];
- shr.u32 %r5468, %r5467, %r5464;
- shl.b32 %r5469, %r8488, %r1338;
- add.s32 %r8488, %r5468, %r5469;
-
-$L__BB0_1022:
- and.b32 %r5470, %r1329, -2147483648;
- shr.u32 %r5471, %r8488, 30;
- shl.b32 %r5472, %r8487, 2;
- or.b32 %r5473, %r5471, %r5472;
- shr.u32 %r5474, %r5473, 31;
- shr.u32 %r5475, %r8487, 30;
- add.s32 %r5476, %r5474, %r5475;
- neg.s32 %r5477, %r5476;
- setp.eq.s32 %p869, %r5470, 0;
- selp.b32 %r8489, %r5476, %r5477, %p869;
- setp.ne.s32 %p870, %r5474, 0;
- xor.b32 %r5478, %r5470, -2147483648;
- selp.b32 %r5479, %r5478, %r5470, %p870;
- selp.b32 %r5480, -1, 0, %p870;
- xor.b32 %r5481, %r5473, %r5480;
- shl.b32 %r5482, %r8488, 2;
- xor.b32 %r5483, %r5482, %r5480;
- cvt.u64.u32 %rd1683, %r5481;
- cvt.u64.u32 %rd1684, %r5483;
- bfi.b64 %rd1685, %rd1683, %rd1684, 32, 32;
- cvt.rn.f64.s64 %fd131, %rd1685;
- mul.f64 %fd132, %fd131, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3664, %fd132;
- setp.eq.s32 %p871, %r5479, 0;
- neg.f32 %f3665, %f3664;
- selp.f32 %f5564, %f3664, %f3665, %p871;
-
-$L__BB0_1024:
- add.s32 %r1345, %r8489, 1;
- and.b32 %r1346, %r1345, 1;
- setp.eq.s32 %p872, %r1346, 0;
- selp.f32 %f1133, %f5564, 0f3F800000, %p872;
- mul.rn.f32 %f1134, %f5564, %f5564;
- mov.f32 %f5565, 0fB94D4153;
- @%p872 bra $L__BB0_1026;
-
- mov.f32 %f3668, 0fBAB607ED;
- mov.f32 %f3669, 0f37CBAC00;
- fma.rn.f32 %f5565, %f3669, %f1134, %f3668;
-
-$L__BB0_1026:
- selp.f32 %f3670, 0f3C0885E4, 0f3D2AAABB, %p872;
- fma.rn.f32 %f3671, %f5565, %f1134, %f3670;
- selp.f32 %f3672, 0fBE2AAAA8, 0fBEFFFFFF, %p872;
- fma.rn.f32 %f3673, %f3671, %f1134, %f3672;
- mov.f32 %f3674, 0f00000000;
- fma.rn.f32 %f3675, %f1134, %f1133, %f3674;
- fma.rn.f32 %f5215, %f3673, %f3675, %f1133;
- and.b32 %r5485, %r1345, 2;
- setp.eq.s32 %p874, %r5485, 0;
- @%p874 bra $L__BB0_1028;
-
- mov.f32 %f3677, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3677, %f3674;
-
-$L__BB0_1028:
- selp.f32 %f1141, %f5215, %f5216, %p16;
- selp.f32 %f1142, %f5213, %f5214, %p16;
- @%p853 bra $L__BB0_1030;
-
- add.f32 %f5714, %f1142, %f1141;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1030:
- @%p820 bra $L__BB0_1059;
-
- shl.b32 %r5487, %r12, 5;
- mov.u32 %r5488, -32;
- sub.s32 %r1347, %r5488, %r5487;
- setp.ge.s32 %p878, %r14, %r1347;
- @%p878 bra $L__BB0_1044;
-
- mul.f32 %f3680, %f5347, 0f3F22F983;
- cvt.rni.s32.f32 %r8493, %f3680;
- cvt.rn.f32.s32 %f3681, %r8493;
- mov.f32 %f3682, 0fBFC90FDA;
- fma.rn.f32 %f3683, %f3681, %f3682, %f5347;
- mov.f32 %f3684, 0fB3A22168;
- fma.rn.f32 %f3685, %f3681, %f3684, %f3683;
- mov.f32 %f3686, 0fA7C234C5;
- fma.rn.f32 %f5573, %f3681, %f3686, %f3685;
- abs.f32 %f1150, %f5347;
- setp.ltu.f32 %p879, %f1150, 0f47CE4780;
- @%p879 bra $L__BB0_1040;
-
- setp.eq.f32 %p880, %f1150, 0f7F800000;
- @%p880 bra $L__BB0_1039;
- bra.uni $L__BB0_1034;
-
-$L__BB0_1039:
- mov.f32 %f3689, 0f00000000;
- mul.rn.f32 %f5573, %f5347, %f3689;
- mov.u32 %r8493, 0;
- bra.uni $L__BB0_1040;
-
-$L__BB0_1034:
- mov.b32 %r1349, %f5347;
- shr.u32 %r5490, %r1349, 23;
- and.b32 %r5491, %r5490, 255;
- add.s32 %r1350, %r5491, -128;
- shl.b32 %r5492, %r1349, 8;
- or.b32 %r1351, %r5492, -2147483648;
- shr.u32 %r1352, %r1350, 5;
- mov.u64 %rd2630, 0;
- mov.u32 %r8490, 0;
- mov.u64 %rd1689, __cudart_i2opi_f;
- mov.u64 %rd2631, %rd2630;
-
-$L__BB0_1035:
- .pragma "nounroll";
- shl.b64 %rd1688, %rd2630, 2;
- add.s64 %rd1690, %rd1689, %rd1688;
- ld.global.nc.u32 %r5493, [%rd1690];
- mad.wide.u32 %rd1691, %r5493, %r1351, %rd2631;
- shr.u64 %rd2631, %rd1691, 32;
- add.s64 %rd1692, %rd1, %rd1688;
- st.local.u32 [%rd1692], %rd1691;
- add.s32 %r8490, %r8490, 1;
- cvt.s64.s32 %rd2630, %r8490;
- setp.ne.s32 %p881, %r8490, 6;
- @%p881 bra $L__BB0_1035;
-
- st.local.u32 [%rd5], %rd2631;
- mov.u32 %r5494, 4;
- sub.s32 %r1355, %r5494, %r1352;
- mov.u32 %r5495, 6;
- sub.s32 %r5496, %r5495, %r1352;
- mul.wide.s32 %rd1693, %r5496, 4;
- add.s64 %rd1694, %rd1, %rd1693;
- ld.local.u32 %r8491, [%rd1694];
- ld.local.u32 %r8492, [%rd1694+-4];
- and.b32 %r1358, %r1350, 31;
- setp.eq.s32 %p882, %r1358, 0;
- @%p882 bra $L__BB0_1038;
-
- mov.u32 %r5497, 32;
- sub.s32 %r5498, %r5497, %r1358;
- shr.u32 %r5499, %r8492, %r5498;
- shl.b32 %r5500, %r8491, %r1358;
- add.s32 %r8491, %r5499, %r5500;
- mul.wide.s32 %rd1695, %r1355, 4;
- add.s64 %rd1696, %rd1, %rd1695;
- ld.local.u32 %r5501, [%rd1696];
- shr.u32 %r5502, %r5501, %r5498;
- shl.b32 %r5503, %r8492, %r1358;
- add.s32 %r8492, %r5502, %r5503;
-
-$L__BB0_1038:
- and.b32 %r5504, %r1349, -2147483648;
- shr.u32 %r5505, %r8492, 30;
- shl.b32 %r5506, %r8491, 2;
- or.b32 %r5507, %r5505, %r5506;
- shr.u32 %r5508, %r5507, 31;
- shr.u32 %r5509, %r8491, 30;
- add.s32 %r5510, %r5508, %r5509;
- neg.s32 %r5511, %r5510;
- setp.eq.s32 %p883, %r5504, 0;
- selp.b32 %r8493, %r5510, %r5511, %p883;
- setp.ne.s32 %p884, %r5508, 0;
- xor.b32 %r5512, %r5504, -2147483648;
- selp.b32 %r5513, %r5512, %r5504, %p884;
- selp.b32 %r5514, -1, 0, %p884;
- xor.b32 %r5515, %r5507, %r5514;
- shl.b32 %r5516, %r8492, 2;
- xor.b32 %r5517, %r5516, %r5514;
- cvt.u64.u32 %rd1697, %r5515;
- cvt.u64.u32 %rd1698, %r5517;
- bfi.b64 %rd1699, %rd1697, %rd1698, 32, 32;
- cvt.rn.f64.s64 %fd133, %rd1699;
- mul.f64 %fd134, %fd133, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3687, %fd134;
- setp.eq.s32 %p885, %r5513, 0;
- neg.f32 %f3688, %f3687;
- selp.f32 %f5573, %f3687, %f3688, %p885;
-
-$L__BB0_1040:
- and.b32 %r1365, %r8493, 1;
- setp.eq.s32 %p886, %r1365, 0;
- selp.f32 %f1154, %f5573, 0f3F800000, %p886;
- mul.rn.f32 %f1155, %f5573, %f5573;
- mov.f32 %f5574, 0fB94D4153;
- @%p886 bra $L__BB0_1042;
-
- mov.f32 %f3691, 0fBAB607ED;
- mov.f32 %f3692, 0f37CBAC00;
- fma.rn.f32 %f5574, %f3692, %f1155, %f3691;
-
-$L__BB0_1042:
- selp.f32 %f3693, 0f3C0885E4, 0f3D2AAABB, %p886;
- fma.rn.f32 %f3694, %f5574, %f1155, %f3693;
- selp.f32 %f3695, 0fBE2AAAA8, 0fBEFFFFFF, %p886;
- fma.rn.f32 %f3696, %f3694, %f1155, %f3695;
- mov.f32 %f3697, 0f00000000;
- fma.rn.f32 %f3698, %f1155, %f1154, %f3697;
- fma.rn.f32 %f5213, %f3696, %f3698, %f1154;
- and.b32 %r5519, %r8493, 2;
- setp.eq.s32 %p888, %r5519, 0;
- @%p888 bra $L__BB0_1044;
-
- mov.f32 %f3700, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3700, %f3697;
-
-$L__BB0_1044:
- setp.lt.s32 %p17, %r14, %r1347;
- @%p878 bra $L__BB0_1057;
-
- mul.f32 %f3701, %f5339, 0f3F22F983;
- cvt.rni.s32.f32 %r8497, %f3701;
- cvt.rn.f32.s32 %f3702, %r8497;
- mov.f32 %f3703, 0fBFC90FDA;
- fma.rn.f32 %f3704, %f3702, %f3703, %f5339;
- mov.f32 %f3705, 0fB3A22168;
- fma.rn.f32 %f3706, %f3702, %f3705, %f3704;
- mov.f32 %f3707, 0fA7C234C5;
- fma.rn.f32 %f5577, %f3702, %f3707, %f3706;
- abs.f32 %f1163, %f5339;
- setp.ltu.f32 %p890, %f1163, 0f47CE4780;
- @%p890 bra $L__BB0_1053;
-
- setp.eq.f32 %p891, %f1163, 0f7F800000;
- @%p891 bra $L__BB0_1052;
- bra.uni $L__BB0_1047;
-
-$L__BB0_1052:
- mov.f32 %f3710, 0f00000000;
- mul.rn.f32 %f5577, %f5339, %f3710;
- mov.u32 %r8497, 0;
- bra.uni $L__BB0_1053;
-
-$L__BB0_1047:
- mov.b32 %r1367, %f5339;
- shr.u32 %r5521, %r1367, 23;
- and.b32 %r5522, %r5521, 255;
- add.s32 %r1368, %r5522, -128;
- shl.b32 %r5523, %r1367, 8;
- or.b32 %r1369, %r5523, -2147483648;
- shr.u32 %r1370, %r1368, 5;
- mov.u64 %rd2632, 0;
- mov.u32 %r8494, 0;
- mov.u64 %rd1703, __cudart_i2opi_f;
- mov.u64 %rd2633, %rd2632;
-
-$L__BB0_1048:
- .pragma "nounroll";
- shl.b64 %rd1702, %rd2632, 2;
- add.s64 %rd1704, %rd1703, %rd1702;
- ld.global.nc.u32 %r5524, [%rd1704];
- mad.wide.u32 %rd1705, %r5524, %r1369, %rd2633;
- shr.u64 %rd2633, %rd1705, 32;
- add.s64 %rd1706, %rd1, %rd1702;
- st.local.u32 [%rd1706], %rd1705;
- add.s32 %r8494, %r8494, 1;
- cvt.s64.s32 %rd2632, %r8494;
- setp.ne.s32 %p892, %r8494, 6;
- @%p892 bra $L__BB0_1048;
-
- st.local.u32 [%rd5], %rd2633;
- mov.u32 %r5525, 4;
- sub.s32 %r1373, %r5525, %r1370;
- mov.u32 %r5526, 6;
- sub.s32 %r5527, %r5526, %r1370;
- mul.wide.s32 %rd1707, %r5527, 4;
- add.s64 %rd1708, %rd1, %rd1707;
- ld.local.u32 %r8495, [%rd1708];
- ld.local.u32 %r8496, [%rd1708+-4];
- and.b32 %r1376, %r1368, 31;
- setp.eq.s32 %p893, %r1376, 0;
- @%p893 bra $L__BB0_1051;
-
- mov.u32 %r5528, 32;
- sub.s32 %r5529, %r5528, %r1376;
- shr.u32 %r5530, %r8496, %r5529;
- shl.b32 %r5531, %r8495, %r1376;
- add.s32 %r8495, %r5530, %r5531;
- mul.wide.s32 %rd1709, %r1373, 4;
- add.s64 %rd1710, %rd1, %rd1709;
- ld.local.u32 %r5532, [%rd1710];
- shr.u32 %r5533, %r5532, %r5529;
- shl.b32 %r5534, %r8496, %r1376;
- add.s32 %r8496, %r5533, %r5534;
-
-$L__BB0_1051:
- and.b32 %r5535, %r1367, -2147483648;
- shr.u32 %r5536, %r8496, 30;
- shl.b32 %r5537, %r8495, 2;
- or.b32 %r5538, %r5536, %r5537;
- shr.u32 %r5539, %r5538, 31;
- shr.u32 %r5540, %r8495, 30;
- add.s32 %r5541, %r5539, %r5540;
- neg.s32 %r5542, %r5541;
- setp.eq.s32 %p894, %r5535, 0;
- selp.b32 %r8497, %r5541, %r5542, %p894;
- setp.ne.s32 %p895, %r5539, 0;
- xor.b32 %r5543, %r5535, -2147483648;
- selp.b32 %r5544, %r5543, %r5535, %p895;
- selp.b32 %r5545, -1, 0, %p895;
- xor.b32 %r5546, %r5538, %r5545;
- shl.b32 %r5547, %r8496, 2;
- xor.b32 %r5548, %r5547, %r5545;
- cvt.u64.u32 %rd1711, %r5546;
- cvt.u64.u32 %rd1712, %r5548;
- bfi.b64 %rd1713, %rd1711, %rd1712, 32, 32;
- cvt.rn.f64.s64 %fd135, %rd1713;
- mul.f64 %fd136, %fd135, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3708, %fd136;
- setp.eq.s32 %p896, %r5544, 0;
- neg.f32 %f3709, %f3708;
- selp.f32 %f5577, %f3708, %f3709, %p896;
-
-$L__BB0_1053:
- add.s32 %r1383, %r8497, 1;
- and.b32 %r1384, %r1383, 1;
- setp.eq.s32 %p897, %r1384, 0;
- selp.f32 %f1167, %f5577, 0f3F800000, %p897;
- mul.rn.f32 %f1168, %f5577, %f5577;
- mov.f32 %f5578, 0fB94D4153;
- @%p897 bra $L__BB0_1055;
-
- mov.f32 %f3712, 0fBAB607ED;
- mov.f32 %f3713, 0f37CBAC00;
- fma.rn.f32 %f5578, %f3713, %f1168, %f3712;
-
-$L__BB0_1055:
- selp.f32 %f3714, 0f3C0885E4, 0f3D2AAABB, %p897;
- fma.rn.f32 %f3715, %f5578, %f1168, %f3714;
- selp.f32 %f3716, 0fBE2AAAA8, 0fBEFFFFFF, %p897;
- fma.rn.f32 %f3717, %f3715, %f1168, %f3716;
- mov.f32 %f3718, 0f00000000;
- fma.rn.f32 %f3719, %f1168, %f1167, %f3718;
- fma.rn.f32 %f5215, %f3717, %f3719, %f1167;
- and.b32 %r5550, %r1383, 2;
- setp.eq.s32 %p899, %r5550, 0;
- @%p899 bra $L__BB0_1057;
-
- mov.f32 %f3721, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3721, %f3718;
-
-$L__BB0_1057:
- selp.f32 %f1175, %f5215, %f5216, %p17;
- selp.f32 %f1176, %f5213, %f5214, %p17;
- @%p878 bra $L__BB0_1059;
-
- add.f32 %f5713, %f1176, %f1175;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1059:
- @%p824 bra $L__BB0_1088;
-
- shl.b32 %r5552, %r12, 5;
- neg.s32 %r1385, %r5552;
- setp.ge.s32 %p903, %r14, %r1385;
- @%p903 bra $L__BB0_1073;
-
- mul.f32 %f3724, %f5346, 0f3F22F983;
- cvt.rni.s32.f32 %r8501, %f3724;
- cvt.rn.f32.s32 %f3725, %r8501;
- mov.f32 %f3726, 0fBFC90FDA;
- fma.rn.f32 %f3727, %f3725, %f3726, %f5346;
- mov.f32 %f3728, 0fB3A22168;
- fma.rn.f32 %f3729, %f3725, %f3728, %f3727;
- mov.f32 %f3730, 0fA7C234C5;
- fma.rn.f32 %f5586, %f3725, %f3730, %f3729;
- abs.f32 %f1184, %f5346;
- setp.ltu.f32 %p904, %f1184, 0f47CE4780;
- @%p904 bra $L__BB0_1069;
-
- setp.eq.f32 %p905, %f1184, 0f7F800000;
- @%p905 bra $L__BB0_1068;
- bra.uni $L__BB0_1063;
-
-$L__BB0_1068:
- mov.f32 %f3733, 0f00000000;
- mul.rn.f32 %f5586, %f5346, %f3733;
- mov.u32 %r8501, 0;
- bra.uni $L__BB0_1069;
-
-$L__BB0_1063:
- mov.b32 %r1387, %f5346;
- shr.u32 %r5554, %r1387, 23;
- and.b32 %r5555, %r5554, 255;
- add.s32 %r1388, %r5555, -128;
- shl.b32 %r5556, %r1387, 8;
- or.b32 %r1389, %r5556, -2147483648;
- shr.u32 %r1390, %r1388, 5;
- mov.u64 %rd2634, 0;
- mov.u32 %r8498, 0;
- mov.u64 %rd1717, __cudart_i2opi_f;
- mov.u64 %rd2635, %rd2634;
-
-$L__BB0_1064:
- .pragma "nounroll";
- shl.b64 %rd1716, %rd2634, 2;
- add.s64 %rd1718, %rd1717, %rd1716;
- ld.global.nc.u32 %r5557, [%rd1718];
- mad.wide.u32 %rd1719, %r5557, %r1389, %rd2635;
- shr.u64 %rd2635, %rd1719, 32;
- add.s64 %rd1720, %rd1, %rd1716;
- st.local.u32 [%rd1720], %rd1719;
- add.s32 %r8498, %r8498, 1;
- cvt.s64.s32 %rd2634, %r8498;
- setp.ne.s32 %p906, %r8498, 6;
- @%p906 bra $L__BB0_1064;
-
- st.local.u32 [%rd5], %rd2635;
- mov.u32 %r5558, 4;
- sub.s32 %r1393, %r5558, %r1390;
- mov.u32 %r5559, 6;
- sub.s32 %r5560, %r5559, %r1390;
- mul.wide.s32 %rd1721, %r5560, 4;
- add.s64 %rd1722, %rd1, %rd1721;
- ld.local.u32 %r8499, [%rd1722];
- ld.local.u32 %r8500, [%rd1722+-4];
- and.b32 %r1396, %r1388, 31;
- setp.eq.s32 %p907, %r1396, 0;
- @%p907 bra $L__BB0_1067;
-
- mov.u32 %r5561, 32;
- sub.s32 %r5562, %r5561, %r1396;
- shr.u32 %r5563, %r8500, %r5562;
- shl.b32 %r5564, %r8499, %r1396;
- add.s32 %r8499, %r5563, %r5564;
- mul.wide.s32 %rd1723, %r1393, 4;
- add.s64 %rd1724, %rd1, %rd1723;
- ld.local.u32 %r5565, [%rd1724];
- shr.u32 %r5566, %r5565, %r5562;
- shl.b32 %r5567, %r8500, %r1396;
- add.s32 %r8500, %r5566, %r5567;
-
-$L__BB0_1067:
- and.b32 %r5568, %r1387, -2147483648;
- shr.u32 %r5569, %r8500, 30;
- shl.b32 %r5570, %r8499, 2;
- or.b32 %r5571, %r5569, %r5570;
- shr.u32 %r5572, %r5571, 31;
- shr.u32 %r5573, %r8499, 30;
- add.s32 %r5574, %r5572, %r5573;
- neg.s32 %r5575, %r5574;
- setp.eq.s32 %p908, %r5568, 0;
- selp.b32 %r8501, %r5574, %r5575, %p908;
- setp.ne.s32 %p909, %r5572, 0;
- xor.b32 %r5576, %r5568, -2147483648;
- selp.b32 %r5577, %r5576, %r5568, %p909;
- selp.b32 %r5578, -1, 0, %p909;
- xor.b32 %r5579, %r5571, %r5578;
- shl.b32 %r5580, %r8500, 2;
- xor.b32 %r5581, %r5580, %r5578;
- cvt.u64.u32 %rd1725, %r5579;
- cvt.u64.u32 %rd1726, %r5581;
- bfi.b64 %rd1727, %rd1725, %rd1726, 32, 32;
- cvt.rn.f64.s64 %fd137, %rd1727;
- mul.f64 %fd138, %fd137, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3731, %fd138;
- setp.eq.s32 %p910, %r5577, 0;
- neg.f32 %f3732, %f3731;
- selp.f32 %f5586, %f3731, %f3732, %p910;
-
-$L__BB0_1069:
- and.b32 %r1403, %r8501, 1;
- setp.eq.s32 %p911, %r1403, 0;
- selp.f32 %f1188, %f5586, 0f3F800000, %p911;
- mul.rn.f32 %f1189, %f5586, %f5586;
- mov.f32 %f5587, 0fB94D4153;
- @%p911 bra $L__BB0_1071;
-
- mov.f32 %f3735, 0fBAB607ED;
- mov.f32 %f3736, 0f37CBAC00;
- fma.rn.f32 %f5587, %f3736, %f1189, %f3735;
-
-$L__BB0_1071:
- selp.f32 %f3737, 0f3C0885E4, 0f3D2AAABB, %p911;
- fma.rn.f32 %f3738, %f5587, %f1189, %f3737;
- selp.f32 %f3739, 0fBE2AAAA8, 0fBEFFFFFF, %p911;
- fma.rn.f32 %f3740, %f3738, %f1189, %f3739;
- mov.f32 %f3741, 0f00000000;
- fma.rn.f32 %f3742, %f1189, %f1188, %f3741;
- fma.rn.f32 %f5213, %f3740, %f3742, %f1188;
- and.b32 %r5583, %r8501, 2;
- setp.eq.s32 %p913, %r5583, 0;
- @%p913 bra $L__BB0_1073;
-
- mov.f32 %f3744, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3744, %f3741;
-
-$L__BB0_1073:
- setp.lt.s32 %p18, %r14, %r1385;
- @%p903 bra $L__BB0_1086;
-
- mul.f32 %f3745, %f5338, 0f3F22F983;
- cvt.rni.s32.f32 %r8505, %f3745;
- cvt.rn.f32.s32 %f3746, %r8505;
- mov.f32 %f3747, 0fBFC90FDA;
- fma.rn.f32 %f3748, %f3746, %f3747, %f5338;
- mov.f32 %f3749, 0fB3A22168;
- fma.rn.f32 %f3750, %f3746, %f3749, %f3748;
- mov.f32 %f3751, 0fA7C234C5;
- fma.rn.f32 %f5590, %f3746, %f3751, %f3750;
- abs.f32 %f1197, %f5338;
- setp.ltu.f32 %p915, %f1197, 0f47CE4780;
- @%p915 bra $L__BB0_1082;
-
- setp.eq.f32 %p916, %f1197, 0f7F800000;
- @%p916 bra $L__BB0_1081;
- bra.uni $L__BB0_1076;
-
-$L__BB0_1081:
- mov.f32 %f3754, 0f00000000;
- mul.rn.f32 %f5590, %f5338, %f3754;
- mov.u32 %r8505, 0;
- bra.uni $L__BB0_1082;
-
-$L__BB0_1076:
- mov.b32 %r1405, %f5338;
- shr.u32 %r5585, %r1405, 23;
- and.b32 %r5586, %r5585, 255;
- add.s32 %r1406, %r5586, -128;
- shl.b32 %r5587, %r1405, 8;
- or.b32 %r1407, %r5587, -2147483648;
- shr.u32 %r1408, %r1406, 5;
- mov.u64 %rd2636, 0;
- mov.u32 %r8502, 0;
- mov.u64 %rd1731, __cudart_i2opi_f;
- mov.u64 %rd2637, %rd2636;
-
-$L__BB0_1077:
- .pragma "nounroll";
- shl.b64 %rd1730, %rd2636, 2;
- add.s64 %rd1732, %rd1731, %rd1730;
- ld.global.nc.u32 %r5588, [%rd1732];
- mad.wide.u32 %rd1733, %r5588, %r1407, %rd2637;
- shr.u64 %rd2637, %rd1733, 32;
- add.s64 %rd1734, %rd1, %rd1730;
- st.local.u32 [%rd1734], %rd1733;
- add.s32 %r8502, %r8502, 1;
- cvt.s64.s32 %rd2636, %r8502;
- setp.ne.s32 %p917, %r8502, 6;
- @%p917 bra $L__BB0_1077;
-
- st.local.u32 [%rd5], %rd2637;
- mov.u32 %r5589, 4;
- sub.s32 %r1411, %r5589, %r1408;
- mov.u32 %r5590, 6;
- sub.s32 %r5591, %r5590, %r1408;
- mul.wide.s32 %rd1735, %r5591, 4;
- add.s64 %rd1736, %rd1, %rd1735;
- ld.local.u32 %r8503, [%rd1736];
- ld.local.u32 %r8504, [%rd1736+-4];
- and.b32 %r1414, %r1406, 31;
- setp.eq.s32 %p918, %r1414, 0;
- @%p918 bra $L__BB0_1080;
-
- mov.u32 %r5592, 32;
- sub.s32 %r5593, %r5592, %r1414;
- shr.u32 %r5594, %r8504, %r5593;
- shl.b32 %r5595, %r8503, %r1414;
- add.s32 %r8503, %r5594, %r5595;
- mul.wide.s32 %rd1737, %r1411, 4;
- add.s64 %rd1738, %rd1, %rd1737;
- ld.local.u32 %r5596, [%rd1738];
- shr.u32 %r5597, %r5596, %r5593;
- shl.b32 %r5598, %r8504, %r1414;
- add.s32 %r8504, %r5597, %r5598;
-
-$L__BB0_1080:
- and.b32 %r5599, %r1405, -2147483648;
- shr.u32 %r5600, %r8504, 30;
- shl.b32 %r5601, %r8503, 2;
- or.b32 %r5602, %r5600, %r5601;
- shr.u32 %r5603, %r5602, 31;
- shr.u32 %r5604, %r8503, 30;
- add.s32 %r5605, %r5603, %r5604;
- neg.s32 %r5606, %r5605;
- setp.eq.s32 %p919, %r5599, 0;
- selp.b32 %r8505, %r5605, %r5606, %p919;
- setp.ne.s32 %p920, %r5603, 0;
- xor.b32 %r5607, %r5599, -2147483648;
- selp.b32 %r5608, %r5607, %r5599, %p920;
- selp.b32 %r5609, -1, 0, %p920;
- xor.b32 %r5610, %r5602, %r5609;
- shl.b32 %r5611, %r8504, 2;
- xor.b32 %r5612, %r5611, %r5609;
- cvt.u64.u32 %rd1739, %r5610;
- cvt.u64.u32 %rd1740, %r5612;
- bfi.b64 %rd1741, %rd1739, %rd1740, 32, 32;
- cvt.rn.f64.s64 %fd139, %rd1741;
- mul.f64 %fd140, %fd139, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3752, %fd140;
- setp.eq.s32 %p921, %r5608, 0;
- neg.f32 %f3753, %f3752;
- selp.f32 %f5590, %f3752, %f3753, %p921;
-
-$L__BB0_1082:
- add.s32 %r1421, %r8505, 1;
- and.b32 %r1422, %r1421, 1;
- setp.eq.s32 %p922, %r1422, 0;
- selp.f32 %f1201, %f5590, 0f3F800000, %p922;
- mul.rn.f32 %f1202, %f5590, %f5590;
- mov.f32 %f5591, 0fB94D4153;
- @%p922 bra $L__BB0_1084;
-
- mov.f32 %f3756, 0fBAB607ED;
- mov.f32 %f3757, 0f37CBAC00;
- fma.rn.f32 %f5591, %f3757, %f1202, %f3756;
-
-$L__BB0_1084:
- selp.f32 %f3758, 0f3C0885E4, 0f3D2AAABB, %p922;
- fma.rn.f32 %f3759, %f5591, %f1202, %f3758;
- selp.f32 %f3760, 0fBE2AAAA8, 0fBEFFFFFF, %p922;
- fma.rn.f32 %f3761, %f3759, %f1202, %f3760;
- mov.f32 %f3762, 0f00000000;
- fma.rn.f32 %f3763, %f1202, %f1201, %f3762;
- fma.rn.f32 %f5215, %f3761, %f3763, %f1201;
- and.b32 %r5614, %r1421, 2;
- setp.eq.s32 %p924, %r5614, 0;
- @%p924 bra $L__BB0_1086;
-
- mov.f32 %f3765, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3765, %f3762;
-
-$L__BB0_1086:
- selp.f32 %f1209, %f5215, %f5216, %p18;
- selp.f32 %f1210, %f5213, %f5214, %p18;
- @%p903 bra $L__BB0_1088;
-
- add.f32 %f5712, %f1210, %f1209;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1088:
- @%p824 bra $L__BB0_1117;
-
- shl.b32 %r5616, %r12, 5;
- mov.u32 %r5617, -32;
- sub.s32 %r1423, %r5617, %r5616;
- setp.ge.s32 %p928, %r14, %r1423;
- @%p928 bra $L__BB0_1102;
-
- mul.f32 %f3768, %f5345, 0f3F22F983;
- cvt.rni.s32.f32 %r8509, %f3768;
- cvt.rn.f32.s32 %f3769, %r8509;
- mov.f32 %f3770, 0fBFC90FDA;
- fma.rn.f32 %f3771, %f3769, %f3770, %f5345;
- mov.f32 %f3772, 0fB3A22168;
- fma.rn.f32 %f3773, %f3769, %f3772, %f3771;
- mov.f32 %f3774, 0fA7C234C5;
- fma.rn.f32 %f5599, %f3769, %f3774, %f3773;
- abs.f32 %f1218, %f5345;
- setp.ltu.f32 %p929, %f1218, 0f47CE4780;
- @%p929 bra $L__BB0_1098;
-
- setp.eq.f32 %p930, %f1218, 0f7F800000;
- @%p930 bra $L__BB0_1097;
- bra.uni $L__BB0_1092;
-
-$L__BB0_1097:
- mov.f32 %f3777, 0f00000000;
- mul.rn.f32 %f5599, %f5345, %f3777;
- mov.u32 %r8509, 0;
- bra.uni $L__BB0_1098;
-
-$L__BB0_1092:
- mov.b32 %r1425, %f5345;
- shr.u32 %r5619, %r1425, 23;
- and.b32 %r5620, %r5619, 255;
- add.s32 %r1426, %r5620, -128;
- shl.b32 %r5621, %r1425, 8;
- or.b32 %r1427, %r5621, -2147483648;
- shr.u32 %r1428, %r1426, 5;
- mov.u64 %rd2638, 0;
- mov.u32 %r8506, 0;
- mov.u64 %rd1745, __cudart_i2opi_f;
- mov.u64 %rd2639, %rd2638;
-
-$L__BB0_1093:
- .pragma "nounroll";
- shl.b64 %rd1744, %rd2638, 2;
- add.s64 %rd1746, %rd1745, %rd1744;
- ld.global.nc.u32 %r5622, [%rd1746];
- mad.wide.u32 %rd1747, %r5622, %r1427, %rd2639;
- shr.u64 %rd2639, %rd1747, 32;
- add.s64 %rd1748, %rd1, %rd1744;
- st.local.u32 [%rd1748], %rd1747;
- add.s32 %r8506, %r8506, 1;
- cvt.s64.s32 %rd2638, %r8506;
- setp.ne.s32 %p931, %r8506, 6;
- @%p931 bra $L__BB0_1093;
-
- st.local.u32 [%rd5], %rd2639;
- mov.u32 %r5623, 4;
- sub.s32 %r1431, %r5623, %r1428;
- mov.u32 %r5624, 6;
- sub.s32 %r5625, %r5624, %r1428;
- mul.wide.s32 %rd1749, %r5625, 4;
- add.s64 %rd1750, %rd1, %rd1749;
- ld.local.u32 %r8507, [%rd1750];
- ld.local.u32 %r8508, [%rd1750+-4];
- and.b32 %r1434, %r1426, 31;
- setp.eq.s32 %p932, %r1434, 0;
- @%p932 bra $L__BB0_1096;
-
- mov.u32 %r5626, 32;
- sub.s32 %r5627, %r5626, %r1434;
- shr.u32 %r5628, %r8508, %r5627;
- shl.b32 %r5629, %r8507, %r1434;
- add.s32 %r8507, %r5628, %r5629;
- mul.wide.s32 %rd1751, %r1431, 4;
- add.s64 %rd1752, %rd1, %rd1751;
- ld.local.u32 %r5630, [%rd1752];
- shr.u32 %r5631, %r5630, %r5627;
- shl.b32 %r5632, %r8508, %r1434;
- add.s32 %r8508, %r5631, %r5632;
-
-$L__BB0_1096:
- and.b32 %r5633, %r1425, -2147483648;
- shr.u32 %r5634, %r8508, 30;
- shl.b32 %r5635, %r8507, 2;
- or.b32 %r5636, %r5634, %r5635;
- shr.u32 %r5637, %r5636, 31;
- shr.u32 %r5638, %r8507, 30;
- add.s32 %r5639, %r5637, %r5638;
- neg.s32 %r5640, %r5639;
- setp.eq.s32 %p933, %r5633, 0;
- selp.b32 %r8509, %r5639, %r5640, %p933;
- setp.ne.s32 %p934, %r5637, 0;
- xor.b32 %r5641, %r5633, -2147483648;
- selp.b32 %r5642, %r5641, %r5633, %p934;
- selp.b32 %r5643, -1, 0, %p934;
- xor.b32 %r5644, %r5636, %r5643;
- shl.b32 %r5645, %r8508, 2;
- xor.b32 %r5646, %r5645, %r5643;
- cvt.u64.u32 %rd1753, %r5644;
- cvt.u64.u32 %rd1754, %r5646;
- bfi.b64 %rd1755, %rd1753, %rd1754, 32, 32;
- cvt.rn.f64.s64 %fd141, %rd1755;
- mul.f64 %fd142, %fd141, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3775, %fd142;
- setp.eq.s32 %p935, %r5642, 0;
- neg.f32 %f3776, %f3775;
- selp.f32 %f5599, %f3775, %f3776, %p935;
-
-$L__BB0_1098:
- and.b32 %r1441, %r8509, 1;
- setp.eq.s32 %p936, %r1441, 0;
- selp.f32 %f1222, %f5599, 0f3F800000, %p936;
- mul.rn.f32 %f1223, %f5599, %f5599;
- mov.f32 %f5600, 0fB94D4153;
- @%p936 bra $L__BB0_1100;
-
- mov.f32 %f3779, 0fBAB607ED;
- mov.f32 %f3780, 0f37CBAC00;
- fma.rn.f32 %f5600, %f3780, %f1223, %f3779;
-
-$L__BB0_1100:
- selp.f32 %f3781, 0f3C0885E4, 0f3D2AAABB, %p936;
- fma.rn.f32 %f3782, %f5600, %f1223, %f3781;
- selp.f32 %f3783, 0fBE2AAAA8, 0fBEFFFFFF, %p936;
- fma.rn.f32 %f3784, %f3782, %f1223, %f3783;
- mov.f32 %f3785, 0f00000000;
- fma.rn.f32 %f3786, %f1223, %f1222, %f3785;
- fma.rn.f32 %f5213, %f3784, %f3786, %f1222;
- and.b32 %r5648, %r8509, 2;
- setp.eq.s32 %p938, %r5648, 0;
- @%p938 bra $L__BB0_1102;
-
- mov.f32 %f3788, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3788, %f3785;
-
-$L__BB0_1102:
- setp.lt.s32 %p19, %r14, %r1423;
- @%p928 bra $L__BB0_1115;
-
- mul.f32 %f3789, %f5337, 0f3F22F983;
- cvt.rni.s32.f32 %r8513, %f3789;
- cvt.rn.f32.s32 %f3790, %r8513;
- mov.f32 %f3791, 0fBFC90FDA;
- fma.rn.f32 %f3792, %f3790, %f3791, %f5337;
- mov.f32 %f3793, 0fB3A22168;
- fma.rn.f32 %f3794, %f3790, %f3793, %f3792;
- mov.f32 %f3795, 0fA7C234C5;
- fma.rn.f32 %f5603, %f3790, %f3795, %f3794;
- abs.f32 %f1231, %f5337;
- setp.ltu.f32 %p940, %f1231, 0f47CE4780;
- @%p940 bra $L__BB0_1111;
-
- setp.eq.f32 %p941, %f1231, 0f7F800000;
- @%p941 bra $L__BB0_1110;
- bra.uni $L__BB0_1105;
-
-$L__BB0_1110:
- mov.f32 %f3798, 0f00000000;
- mul.rn.f32 %f5603, %f5337, %f3798;
- mov.u32 %r8513, 0;
- bra.uni $L__BB0_1111;
-
-$L__BB0_1105:
- mov.b32 %r1443, %f5337;
- shr.u32 %r5650, %r1443, 23;
- and.b32 %r5651, %r5650, 255;
- add.s32 %r1444, %r5651, -128;
- shl.b32 %r5652, %r1443, 8;
- or.b32 %r1445, %r5652, -2147483648;
- shr.u32 %r1446, %r1444, 5;
- mov.u64 %rd2640, 0;
- mov.u32 %r8510, 0;
- mov.u64 %rd1759, __cudart_i2opi_f;
- mov.u64 %rd2641, %rd2640;
-
-$L__BB0_1106:
- .pragma "nounroll";
- shl.b64 %rd1758, %rd2640, 2;
- add.s64 %rd1760, %rd1759, %rd1758;
- ld.global.nc.u32 %r5653, [%rd1760];
- mad.wide.u32 %rd1761, %r5653, %r1445, %rd2641;
- shr.u64 %rd2641, %rd1761, 32;
- add.s64 %rd1762, %rd1, %rd1758;
- st.local.u32 [%rd1762], %rd1761;
- add.s32 %r8510, %r8510, 1;
- cvt.s64.s32 %rd2640, %r8510;
- setp.ne.s32 %p942, %r8510, 6;
- @%p942 bra $L__BB0_1106;
-
- st.local.u32 [%rd5], %rd2641;
- mov.u32 %r5654, 4;
- sub.s32 %r1449, %r5654, %r1446;
- mov.u32 %r5655, 6;
- sub.s32 %r5656, %r5655, %r1446;
- mul.wide.s32 %rd1763, %r5656, 4;
- add.s64 %rd1764, %rd1, %rd1763;
- ld.local.u32 %r8511, [%rd1764];
- ld.local.u32 %r8512, [%rd1764+-4];
- and.b32 %r1452, %r1444, 31;
- setp.eq.s32 %p943, %r1452, 0;
- @%p943 bra $L__BB0_1109;
-
- mov.u32 %r5657, 32;
- sub.s32 %r5658, %r5657, %r1452;
- shr.u32 %r5659, %r8512, %r5658;
- shl.b32 %r5660, %r8511, %r1452;
- add.s32 %r8511, %r5659, %r5660;
- mul.wide.s32 %rd1765, %r1449, 4;
- add.s64 %rd1766, %rd1, %rd1765;
- ld.local.u32 %r5661, [%rd1766];
- shr.u32 %r5662, %r5661, %r5658;
- shl.b32 %r5663, %r8512, %r1452;
- add.s32 %r8512, %r5662, %r5663;
-
-$L__BB0_1109:
- and.b32 %r5664, %r1443, -2147483648;
- shr.u32 %r5665, %r8512, 30;
- shl.b32 %r5666, %r8511, 2;
- or.b32 %r5667, %r5665, %r5666;
- shr.u32 %r5668, %r5667, 31;
- shr.u32 %r5669, %r8511, 30;
- add.s32 %r5670, %r5668, %r5669;
- neg.s32 %r5671, %r5670;
- setp.eq.s32 %p944, %r5664, 0;
- selp.b32 %r8513, %r5670, %r5671, %p944;
- setp.ne.s32 %p945, %r5668, 0;
- xor.b32 %r5672, %r5664, -2147483648;
- selp.b32 %r5673, %r5672, %r5664, %p945;
- selp.b32 %r5674, -1, 0, %p945;
- xor.b32 %r5675, %r5667, %r5674;
- shl.b32 %r5676, %r8512, 2;
- xor.b32 %r5677, %r5676, %r5674;
- cvt.u64.u32 %rd1767, %r5675;
- cvt.u64.u32 %rd1768, %r5677;
- bfi.b64 %rd1769, %rd1767, %rd1768, 32, 32;
- cvt.rn.f64.s64 %fd143, %rd1769;
- mul.f64 %fd144, %fd143, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3796, %fd144;
- setp.eq.s32 %p946, %r5673, 0;
- neg.f32 %f3797, %f3796;
- selp.f32 %f5603, %f3796, %f3797, %p946;
-
-$L__BB0_1111:
- add.s32 %r1459, %r8513, 1;
- and.b32 %r1460, %r1459, 1;
- setp.eq.s32 %p947, %r1460, 0;
- selp.f32 %f1235, %f5603, 0f3F800000, %p947;
- mul.rn.f32 %f1236, %f5603, %f5603;
- mov.f32 %f5604, 0fB94D4153;
- @%p947 bra $L__BB0_1113;
-
- mov.f32 %f3800, 0fBAB607ED;
- mov.f32 %f3801, 0f37CBAC00;
- fma.rn.f32 %f5604, %f3801, %f1236, %f3800;
-
-$L__BB0_1113:
- selp.f32 %f3802, 0f3C0885E4, 0f3D2AAABB, %p947;
- fma.rn.f32 %f3803, %f5604, %f1236, %f3802;
- selp.f32 %f3804, 0fBE2AAAA8, 0fBEFFFFFF, %p947;
- fma.rn.f32 %f3805, %f3803, %f1236, %f3804;
- mov.f32 %f3806, 0f00000000;
- fma.rn.f32 %f3807, %f1236, %f1235, %f3806;
- fma.rn.f32 %f5215, %f3805, %f3807, %f1235;
- and.b32 %r5679, %r1459, 2;
- setp.eq.s32 %p949, %r5679, 0;
- @%p949 bra $L__BB0_1115;
-
- mov.f32 %f3809, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3809, %f3806;
-
-$L__BB0_1115:
- selp.f32 %f1243, %f5215, %f5216, %p19;
- selp.f32 %f1244, %f5213, %f5214, %p19;
- @%p928 bra $L__BB0_1117;
-
- add.f32 %f5711, %f1244, %f1243;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1117:
- @%p828 bra $L__BB0_1146;
-
- shl.b32 %r5681, %r12, 5;
- neg.s32 %r1461, %r5681;
- setp.ge.s32 %p953, %r14, %r1461;
- @%p953 bra $L__BB0_1131;
-
- mul.f32 %f3812, %f5344, 0f3F22F983;
- cvt.rni.s32.f32 %r8517, %f3812;
- cvt.rn.f32.s32 %f3813, %r8517;
- mov.f32 %f3814, 0fBFC90FDA;
- fma.rn.f32 %f3815, %f3813, %f3814, %f5344;
- mov.f32 %f3816, 0fB3A22168;
- fma.rn.f32 %f3817, %f3813, %f3816, %f3815;
- mov.f32 %f3818, 0fA7C234C5;
- fma.rn.f32 %f5612, %f3813, %f3818, %f3817;
- abs.f32 %f1252, %f5344;
- setp.ltu.f32 %p954, %f1252, 0f47CE4780;
- @%p954 bra $L__BB0_1127;
-
- setp.eq.f32 %p955, %f1252, 0f7F800000;
- @%p955 bra $L__BB0_1126;
- bra.uni $L__BB0_1121;
-
-$L__BB0_1126:
- mov.f32 %f3821, 0f00000000;
- mul.rn.f32 %f5612, %f5344, %f3821;
- mov.u32 %r8517, 0;
- bra.uni $L__BB0_1127;
-
-$L__BB0_1121:
- mov.b32 %r1463, %f5344;
- shr.u32 %r5683, %r1463, 23;
- and.b32 %r5684, %r5683, 255;
- add.s32 %r1464, %r5684, -128;
- shl.b32 %r5685, %r1463, 8;
- or.b32 %r1465, %r5685, -2147483648;
- shr.u32 %r1466, %r1464, 5;
- mov.u64 %rd2642, 0;
- mov.u32 %r8514, 0;
- mov.u64 %rd1773, __cudart_i2opi_f;
- mov.u64 %rd2643, %rd2642;
-
-$L__BB0_1122:
- .pragma "nounroll";
- shl.b64 %rd1772, %rd2642, 2;
- add.s64 %rd1774, %rd1773, %rd1772;
- ld.global.nc.u32 %r5686, [%rd1774];
- mad.wide.u32 %rd1775, %r5686, %r1465, %rd2643;
- shr.u64 %rd2643, %rd1775, 32;
- add.s64 %rd1776, %rd1, %rd1772;
- st.local.u32 [%rd1776], %rd1775;
- add.s32 %r8514, %r8514, 1;
- cvt.s64.s32 %rd2642, %r8514;
- setp.ne.s32 %p956, %r8514, 6;
- @%p956 bra $L__BB0_1122;
-
- st.local.u32 [%rd5], %rd2643;
- mov.u32 %r5687, 4;
- sub.s32 %r1469, %r5687, %r1466;
- mov.u32 %r5688, 6;
- sub.s32 %r5689, %r5688, %r1466;
- mul.wide.s32 %rd1777, %r5689, 4;
- add.s64 %rd1778, %rd1, %rd1777;
- ld.local.u32 %r8515, [%rd1778];
- ld.local.u32 %r8516, [%rd1778+-4];
- and.b32 %r1472, %r1464, 31;
- setp.eq.s32 %p957, %r1472, 0;
- @%p957 bra $L__BB0_1125;
-
- mov.u32 %r5690, 32;
- sub.s32 %r5691, %r5690, %r1472;
- shr.u32 %r5692, %r8516, %r5691;
- shl.b32 %r5693, %r8515, %r1472;
- add.s32 %r8515, %r5692, %r5693;
- mul.wide.s32 %rd1779, %r1469, 4;
- add.s64 %rd1780, %rd1, %rd1779;
- ld.local.u32 %r5694, [%rd1780];
- shr.u32 %r5695, %r5694, %r5691;
- shl.b32 %r5696, %r8516, %r1472;
- add.s32 %r8516, %r5695, %r5696;
-
-$L__BB0_1125:
- and.b32 %r5697, %r1463, -2147483648;
- shr.u32 %r5698, %r8516, 30;
- shl.b32 %r5699, %r8515, 2;
- or.b32 %r5700, %r5698, %r5699;
- shr.u32 %r5701, %r5700, 31;
- shr.u32 %r5702, %r8515, 30;
- add.s32 %r5703, %r5701, %r5702;
- neg.s32 %r5704, %r5703;
- setp.eq.s32 %p958, %r5697, 0;
- selp.b32 %r8517, %r5703, %r5704, %p958;
- setp.ne.s32 %p959, %r5701, 0;
- xor.b32 %r5705, %r5697, -2147483648;
- selp.b32 %r5706, %r5705, %r5697, %p959;
- selp.b32 %r5707, -1, 0, %p959;
- xor.b32 %r5708, %r5700, %r5707;
- shl.b32 %r5709, %r8516, 2;
- xor.b32 %r5710, %r5709, %r5707;
- cvt.u64.u32 %rd1781, %r5708;
- cvt.u64.u32 %rd1782, %r5710;
- bfi.b64 %rd1783, %rd1781, %rd1782, 32, 32;
- cvt.rn.f64.s64 %fd145, %rd1783;
- mul.f64 %fd146, %fd145, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3819, %fd146;
- setp.eq.s32 %p960, %r5706, 0;
- neg.f32 %f3820, %f3819;
- selp.f32 %f5612, %f3819, %f3820, %p960;
-
-$L__BB0_1127:
- and.b32 %r1479, %r8517, 1;
- setp.eq.s32 %p961, %r1479, 0;
- selp.f32 %f1256, %f5612, 0f3F800000, %p961;
- mul.rn.f32 %f1257, %f5612, %f5612;
- mov.f32 %f5613, 0fB94D4153;
- @%p961 bra $L__BB0_1129;
-
- mov.f32 %f3823, 0fBAB607ED;
- mov.f32 %f3824, 0f37CBAC00;
- fma.rn.f32 %f5613, %f3824, %f1257, %f3823;
-
-$L__BB0_1129:
- selp.f32 %f3825, 0f3C0885E4, 0f3D2AAABB, %p961;
- fma.rn.f32 %f3826, %f5613, %f1257, %f3825;
- selp.f32 %f3827, 0fBE2AAAA8, 0fBEFFFFFF, %p961;
- fma.rn.f32 %f3828, %f3826, %f1257, %f3827;
- mov.f32 %f3829, 0f00000000;
- fma.rn.f32 %f3830, %f1257, %f1256, %f3829;
- fma.rn.f32 %f5213, %f3828, %f3830, %f1256;
- and.b32 %r5712, %r8517, 2;
- setp.eq.s32 %p963, %r5712, 0;
- @%p963 bra $L__BB0_1131;
-
- mov.f32 %f3832, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3832, %f3829;
-
-$L__BB0_1131:
- setp.lt.s32 %p20, %r14, %r1461;
- @%p953 bra $L__BB0_1144;
-
- mul.f32 %f3833, %f5336, 0f3F22F983;
- cvt.rni.s32.f32 %r8521, %f3833;
- cvt.rn.f32.s32 %f3834, %r8521;
- mov.f32 %f3835, 0fBFC90FDA;
- fma.rn.f32 %f3836, %f3834, %f3835, %f5336;
- mov.f32 %f3837, 0fB3A22168;
- fma.rn.f32 %f3838, %f3834, %f3837, %f3836;
- mov.f32 %f3839, 0fA7C234C5;
- fma.rn.f32 %f5616, %f3834, %f3839, %f3838;
- abs.f32 %f1265, %f5336;
- setp.ltu.f32 %p965, %f1265, 0f47CE4780;
- @%p965 bra $L__BB0_1140;
-
- setp.eq.f32 %p966, %f1265, 0f7F800000;
- @%p966 bra $L__BB0_1139;
- bra.uni $L__BB0_1134;
-
-$L__BB0_1139:
- mov.f32 %f3842, 0f00000000;
- mul.rn.f32 %f5616, %f5336, %f3842;
- mov.u32 %r8521, 0;
- bra.uni $L__BB0_1140;
-
-$L__BB0_1134:
- mov.b32 %r1481, %f5336;
- shr.u32 %r5714, %r1481, 23;
- and.b32 %r5715, %r5714, 255;
- add.s32 %r1482, %r5715, -128;
- shl.b32 %r5716, %r1481, 8;
- or.b32 %r1483, %r5716, -2147483648;
- shr.u32 %r1484, %r1482, 5;
- mov.u64 %rd2644, 0;
- mov.u32 %r8518, 0;
- mov.u64 %rd1787, __cudart_i2opi_f;
- mov.u64 %rd2645, %rd2644;
-
-$L__BB0_1135:
- .pragma "nounroll";
- shl.b64 %rd1786, %rd2644, 2;
- add.s64 %rd1788, %rd1787, %rd1786;
- ld.global.nc.u32 %r5717, [%rd1788];
- mad.wide.u32 %rd1789, %r5717, %r1483, %rd2645;
- shr.u64 %rd2645, %rd1789, 32;
- add.s64 %rd1790, %rd1, %rd1786;
- st.local.u32 [%rd1790], %rd1789;
- add.s32 %r8518, %r8518, 1;
- cvt.s64.s32 %rd2644, %r8518;
- setp.ne.s32 %p967, %r8518, 6;
- @%p967 bra $L__BB0_1135;
-
- st.local.u32 [%rd5], %rd2645;
- mov.u32 %r5718, 4;
- sub.s32 %r1487, %r5718, %r1484;
- mov.u32 %r5719, 6;
- sub.s32 %r5720, %r5719, %r1484;
- mul.wide.s32 %rd1791, %r5720, 4;
- add.s64 %rd1792, %rd1, %rd1791;
- ld.local.u32 %r8519, [%rd1792];
- ld.local.u32 %r8520, [%rd1792+-4];
- and.b32 %r1490, %r1482, 31;
- setp.eq.s32 %p968, %r1490, 0;
- @%p968 bra $L__BB0_1138;
-
- mov.u32 %r5721, 32;
- sub.s32 %r5722, %r5721, %r1490;
- shr.u32 %r5723, %r8520, %r5722;
- shl.b32 %r5724, %r8519, %r1490;
- add.s32 %r8519, %r5723, %r5724;
- mul.wide.s32 %rd1793, %r1487, 4;
- add.s64 %rd1794, %rd1, %rd1793;
- ld.local.u32 %r5725, [%rd1794];
- shr.u32 %r5726, %r5725, %r5722;
- shl.b32 %r5727, %r8520, %r1490;
- add.s32 %r8520, %r5726, %r5727;
-
-$L__BB0_1138:
- and.b32 %r5728, %r1481, -2147483648;
- shr.u32 %r5729, %r8520, 30;
- shl.b32 %r5730, %r8519, 2;
- or.b32 %r5731, %r5729, %r5730;
- shr.u32 %r5732, %r5731, 31;
- shr.u32 %r5733, %r8519, 30;
- add.s32 %r5734, %r5732, %r5733;
- neg.s32 %r5735, %r5734;
- setp.eq.s32 %p969, %r5728, 0;
- selp.b32 %r8521, %r5734, %r5735, %p969;
- setp.ne.s32 %p970, %r5732, 0;
- xor.b32 %r5736, %r5728, -2147483648;
- selp.b32 %r5737, %r5736, %r5728, %p970;
- selp.b32 %r5738, -1, 0, %p970;
- xor.b32 %r5739, %r5731, %r5738;
- shl.b32 %r5740, %r8520, 2;
- xor.b32 %r5741, %r5740, %r5738;
- cvt.u64.u32 %rd1795, %r5739;
- cvt.u64.u32 %rd1796, %r5741;
- bfi.b64 %rd1797, %rd1795, %rd1796, 32, 32;
- cvt.rn.f64.s64 %fd147, %rd1797;
- mul.f64 %fd148, %fd147, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3840, %fd148;
- setp.eq.s32 %p971, %r5737, 0;
- neg.f32 %f3841, %f3840;
- selp.f32 %f5616, %f3840, %f3841, %p971;
-
-$L__BB0_1140:
- add.s32 %r1497, %r8521, 1;
- and.b32 %r1498, %r1497, 1;
- setp.eq.s32 %p972, %r1498, 0;
- selp.f32 %f1269, %f5616, 0f3F800000, %p972;
- mul.rn.f32 %f1270, %f5616, %f5616;
- mov.f32 %f5617, 0fB94D4153;
- @%p972 bra $L__BB0_1142;
-
- mov.f32 %f3844, 0fBAB607ED;
- mov.f32 %f3845, 0f37CBAC00;
- fma.rn.f32 %f5617, %f3845, %f1270, %f3844;
-
-$L__BB0_1142:
- selp.f32 %f3846, 0f3C0885E4, 0f3D2AAABB, %p972;
- fma.rn.f32 %f3847, %f5617, %f1270, %f3846;
- selp.f32 %f3848, 0fBE2AAAA8, 0fBEFFFFFF, %p972;
- fma.rn.f32 %f3849, %f3847, %f1270, %f3848;
- mov.f32 %f3850, 0f00000000;
- fma.rn.f32 %f3851, %f1270, %f1269, %f3850;
- fma.rn.f32 %f5215, %f3849, %f3851, %f1269;
- and.b32 %r5743, %r1497, 2;
- setp.eq.s32 %p974, %r5743, 0;
- @%p974 bra $L__BB0_1144;
-
- mov.f32 %f3853, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3853, %f3850;
-
-$L__BB0_1144:
- selp.f32 %f1277, %f5215, %f5216, %p20;
- selp.f32 %f1278, %f5213, %f5214, %p20;
- @%p953 bra $L__BB0_1146;
-
- add.f32 %f5710, %f1278, %f1277;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1146:
- @%p828 bra $L__BB0_1175;
-
- shl.b32 %r5745, %r12, 5;
- mov.u32 %r5746, -32;
- sub.s32 %r1499, %r5746, %r5745;
- setp.ge.s32 %p978, %r14, %r1499;
- @%p978 bra $L__BB0_1160;
-
- mul.f32 %f3856, %f5343, 0f3F22F983;
- cvt.rni.s32.f32 %r8525, %f3856;
- cvt.rn.f32.s32 %f3857, %r8525;
- mov.f32 %f3858, 0fBFC90FDA;
- fma.rn.f32 %f3859, %f3857, %f3858, %f5343;
- mov.f32 %f3860, 0fB3A22168;
- fma.rn.f32 %f3861, %f3857, %f3860, %f3859;
- mov.f32 %f3862, 0fA7C234C5;
- fma.rn.f32 %f5625, %f3857, %f3862, %f3861;
- abs.f32 %f1286, %f5343;
- setp.ltu.f32 %p979, %f1286, 0f47CE4780;
- @%p979 bra $L__BB0_1156;
-
- setp.eq.f32 %p980, %f1286, 0f7F800000;
- @%p980 bra $L__BB0_1155;
- bra.uni $L__BB0_1150;
-
-$L__BB0_1155:
- mov.f32 %f3865, 0f00000000;
- mul.rn.f32 %f5625, %f5343, %f3865;
- mov.u32 %r8525, 0;
- bra.uni $L__BB0_1156;
-
-$L__BB0_1150:
- mov.b32 %r1501, %f5343;
- shr.u32 %r5748, %r1501, 23;
- and.b32 %r5749, %r5748, 255;
- add.s32 %r1502, %r5749, -128;
- shl.b32 %r5750, %r1501, 8;
- or.b32 %r1503, %r5750, -2147483648;
- shr.u32 %r1504, %r1502, 5;
- mov.u64 %rd2646, 0;
- mov.u32 %r8522, 0;
- mov.u64 %rd1801, __cudart_i2opi_f;
- mov.u64 %rd2647, %rd2646;
-
-$L__BB0_1151:
- .pragma "nounroll";
- shl.b64 %rd1800, %rd2646, 2;
- add.s64 %rd1802, %rd1801, %rd1800;
- ld.global.nc.u32 %r5751, [%rd1802];
- mad.wide.u32 %rd1803, %r5751, %r1503, %rd2647;
- shr.u64 %rd2647, %rd1803, 32;
- add.s64 %rd1804, %rd1, %rd1800;
- st.local.u32 [%rd1804], %rd1803;
- add.s32 %r8522, %r8522, 1;
- cvt.s64.s32 %rd2646, %r8522;
- setp.ne.s32 %p981, %r8522, 6;
- @%p981 bra $L__BB0_1151;
-
- st.local.u32 [%rd5], %rd2647;
- mov.u32 %r5752, 4;
- sub.s32 %r1507, %r5752, %r1504;
- mov.u32 %r5753, 6;
- sub.s32 %r5754, %r5753, %r1504;
- mul.wide.s32 %rd1805, %r5754, 4;
- add.s64 %rd1806, %rd1, %rd1805;
- ld.local.u32 %r8523, [%rd1806];
- ld.local.u32 %r8524, [%rd1806+-4];
- and.b32 %r1510, %r1502, 31;
- setp.eq.s32 %p982, %r1510, 0;
- @%p982 bra $L__BB0_1154;
-
- mov.u32 %r5755, 32;
- sub.s32 %r5756, %r5755, %r1510;
- shr.u32 %r5757, %r8524, %r5756;
- shl.b32 %r5758, %r8523, %r1510;
- add.s32 %r8523, %r5757, %r5758;
- mul.wide.s32 %rd1807, %r1507, 4;
- add.s64 %rd1808, %rd1, %rd1807;
- ld.local.u32 %r5759, [%rd1808];
- shr.u32 %r5760, %r5759, %r5756;
- shl.b32 %r5761, %r8524, %r1510;
- add.s32 %r8524, %r5760, %r5761;
-
-$L__BB0_1154:
- and.b32 %r5762, %r1501, -2147483648;
- shr.u32 %r5763, %r8524, 30;
- shl.b32 %r5764, %r8523, 2;
- or.b32 %r5765, %r5763, %r5764;
- shr.u32 %r5766, %r5765, 31;
- shr.u32 %r5767, %r8523, 30;
- add.s32 %r5768, %r5766, %r5767;
- neg.s32 %r5769, %r5768;
- setp.eq.s32 %p983, %r5762, 0;
- selp.b32 %r8525, %r5768, %r5769, %p983;
- setp.ne.s32 %p984, %r5766, 0;
- xor.b32 %r5770, %r5762, -2147483648;
- selp.b32 %r5771, %r5770, %r5762, %p984;
- selp.b32 %r5772, -1, 0, %p984;
- xor.b32 %r5773, %r5765, %r5772;
- shl.b32 %r5774, %r8524, 2;
- xor.b32 %r5775, %r5774, %r5772;
- cvt.u64.u32 %rd1809, %r5773;
- cvt.u64.u32 %rd1810, %r5775;
- bfi.b64 %rd1811, %rd1809, %rd1810, 32, 32;
- cvt.rn.f64.s64 %fd149, %rd1811;
- mul.f64 %fd150, %fd149, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3863, %fd150;
- setp.eq.s32 %p985, %r5771, 0;
- neg.f32 %f3864, %f3863;
- selp.f32 %f5625, %f3863, %f3864, %p985;
-
-$L__BB0_1156:
- and.b32 %r1517, %r8525, 1;
- setp.eq.s32 %p986, %r1517, 0;
- selp.f32 %f1290, %f5625, 0f3F800000, %p986;
- mul.rn.f32 %f1291, %f5625, %f5625;
- mov.f32 %f5626, 0fB94D4153;
- @%p986 bra $L__BB0_1158;
-
- mov.f32 %f3867, 0fBAB607ED;
- mov.f32 %f3868, 0f37CBAC00;
- fma.rn.f32 %f5626, %f3868, %f1291, %f3867;
-
-$L__BB0_1158:
- selp.f32 %f3869, 0f3C0885E4, 0f3D2AAABB, %p986;
- fma.rn.f32 %f3870, %f5626, %f1291, %f3869;
- selp.f32 %f3871, 0fBE2AAAA8, 0fBEFFFFFF, %p986;
- fma.rn.f32 %f3872, %f3870, %f1291, %f3871;
- mov.f32 %f3873, 0f00000000;
- fma.rn.f32 %f3874, %f1291, %f1290, %f3873;
- fma.rn.f32 %f5213, %f3872, %f3874, %f1290;
- and.b32 %r5777, %r8525, 2;
- setp.eq.s32 %p988, %r5777, 0;
- @%p988 bra $L__BB0_1160;
-
- mov.f32 %f3876, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3876, %f3873;
-
-$L__BB0_1160:
- setp.lt.s32 %p21, %r14, %r1499;
- @%p978 bra $L__BB0_1173;
-
- mul.f32 %f3877, %f5335, 0f3F22F983;
- cvt.rni.s32.f32 %r8529, %f3877;
- cvt.rn.f32.s32 %f3878, %r8529;
- mov.f32 %f3879, 0fBFC90FDA;
- fma.rn.f32 %f3880, %f3878, %f3879, %f5335;
- mov.f32 %f3881, 0fB3A22168;
- fma.rn.f32 %f3882, %f3878, %f3881, %f3880;
- mov.f32 %f3883, 0fA7C234C5;
- fma.rn.f32 %f5629, %f3878, %f3883, %f3882;
- abs.f32 %f1299, %f5335;
- setp.ltu.f32 %p990, %f1299, 0f47CE4780;
- @%p990 bra $L__BB0_1169;
-
- setp.eq.f32 %p991, %f1299, 0f7F800000;
- @%p991 bra $L__BB0_1168;
- bra.uni $L__BB0_1163;
-
-$L__BB0_1168:
- mov.f32 %f3886, 0f00000000;
- mul.rn.f32 %f5629, %f5335, %f3886;
- mov.u32 %r8529, 0;
- bra.uni $L__BB0_1169;
-
-$L__BB0_1163:
- mov.b32 %r1519, %f5335;
- shr.u32 %r5779, %r1519, 23;
- and.b32 %r5780, %r5779, 255;
- add.s32 %r1520, %r5780, -128;
- shl.b32 %r5781, %r1519, 8;
- or.b32 %r1521, %r5781, -2147483648;
- shr.u32 %r1522, %r1520, 5;
- mov.u64 %rd2648, 0;
- mov.u32 %r8526, 0;
- mov.u64 %rd1815, __cudart_i2opi_f;
- mov.u64 %rd2649, %rd2648;
-
-$L__BB0_1164:
- .pragma "nounroll";
- shl.b64 %rd1814, %rd2648, 2;
- add.s64 %rd1816, %rd1815, %rd1814;
- ld.global.nc.u32 %r5782, [%rd1816];
- mad.wide.u32 %rd1817, %r5782, %r1521, %rd2649;
- shr.u64 %rd2649, %rd1817, 32;
- add.s64 %rd1818, %rd1, %rd1814;
- st.local.u32 [%rd1818], %rd1817;
- add.s32 %r8526, %r8526, 1;
- cvt.s64.s32 %rd2648, %r8526;
- setp.ne.s32 %p992, %r8526, 6;
- @%p992 bra $L__BB0_1164;
-
- st.local.u32 [%rd5], %rd2649;
- mov.u32 %r5783, 4;
- sub.s32 %r1525, %r5783, %r1522;
- mov.u32 %r5784, 6;
- sub.s32 %r5785, %r5784, %r1522;
- mul.wide.s32 %rd1819, %r5785, 4;
- add.s64 %rd1820, %rd1, %rd1819;
- ld.local.u32 %r8527, [%rd1820];
- ld.local.u32 %r8528, [%rd1820+-4];
- and.b32 %r1528, %r1520, 31;
- setp.eq.s32 %p993, %r1528, 0;
- @%p993 bra $L__BB0_1167;
-
- mov.u32 %r5786, 32;
- sub.s32 %r5787, %r5786, %r1528;
- shr.u32 %r5788, %r8528, %r5787;
- shl.b32 %r5789, %r8527, %r1528;
- add.s32 %r8527, %r5788, %r5789;
- mul.wide.s32 %rd1821, %r1525, 4;
- add.s64 %rd1822, %rd1, %rd1821;
- ld.local.u32 %r5790, [%rd1822];
- shr.u32 %r5791, %r5790, %r5787;
- shl.b32 %r5792, %r8528, %r1528;
- add.s32 %r8528, %r5791, %r5792;
-
-$L__BB0_1167:
- and.b32 %r5793, %r1519, -2147483648;
- shr.u32 %r5794, %r8528, 30;
- shl.b32 %r5795, %r8527, 2;
- or.b32 %r5796, %r5794, %r5795;
- shr.u32 %r5797, %r5796, 31;
- shr.u32 %r5798, %r8527, 30;
- add.s32 %r5799, %r5797, %r5798;
- neg.s32 %r5800, %r5799;
- setp.eq.s32 %p994, %r5793, 0;
- selp.b32 %r8529, %r5799, %r5800, %p994;
- setp.ne.s32 %p995, %r5797, 0;
- xor.b32 %r5801, %r5793, -2147483648;
- selp.b32 %r5802, %r5801, %r5793, %p995;
- selp.b32 %r5803, -1, 0, %p995;
- xor.b32 %r5804, %r5796, %r5803;
- shl.b32 %r5805, %r8528, 2;
- xor.b32 %r5806, %r5805, %r5803;
- cvt.u64.u32 %rd1823, %r5804;
- cvt.u64.u32 %rd1824, %r5806;
- bfi.b64 %rd1825, %rd1823, %rd1824, 32, 32;
- cvt.rn.f64.s64 %fd151, %rd1825;
- mul.f64 %fd152, %fd151, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3884, %fd152;
- setp.eq.s32 %p996, %r5802, 0;
- neg.f32 %f3885, %f3884;
- selp.f32 %f5629, %f3884, %f3885, %p996;
-
-$L__BB0_1169:
- add.s32 %r1535, %r8529, 1;
- and.b32 %r1536, %r1535, 1;
- setp.eq.s32 %p997, %r1536, 0;
- selp.f32 %f1303, %f5629, 0f3F800000, %p997;
- mul.rn.f32 %f1304, %f5629, %f5629;
- mov.f32 %f5630, 0fB94D4153;
- @%p997 bra $L__BB0_1171;
-
- mov.f32 %f3888, 0fBAB607ED;
- mov.f32 %f3889, 0f37CBAC00;
- fma.rn.f32 %f5630, %f3889, %f1304, %f3888;
-
-$L__BB0_1171:
- selp.f32 %f3890, 0f3C0885E4, 0f3D2AAABB, %p997;
- fma.rn.f32 %f3891, %f5630, %f1304, %f3890;
- selp.f32 %f3892, 0fBE2AAAA8, 0fBEFFFFFF, %p997;
- fma.rn.f32 %f3893, %f3891, %f1304, %f3892;
- mov.f32 %f3894, 0f00000000;
- fma.rn.f32 %f3895, %f1304, %f1303, %f3894;
- fma.rn.f32 %f5215, %f3893, %f3895, %f1303;
- and.b32 %r5808, %r1535, 2;
- setp.eq.s32 %p999, %r5808, 0;
- @%p999 bra $L__BB0_1173;
-
- mov.f32 %f3897, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3897, %f3894;
-
-$L__BB0_1173:
- selp.f32 %f1311, %f5215, %f5216, %p21;
- selp.f32 %f1312, %f5213, %f5214, %p21;
- @%p978 bra $L__BB0_1175;
-
- add.f32 %f5709, %f1312, %f1311;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1175:
- @%p832 bra $L__BB0_1204;
-
- shl.b32 %r5810, %r12, 5;
- neg.s32 %r1537, %r5810;
- setp.ge.s32 %p1003, %r14, %r1537;
- @%p1003 bra $L__BB0_1189;
-
- mul.f32 %f3900, %f5342, 0f3F22F983;
- cvt.rni.s32.f32 %r8533, %f3900;
- cvt.rn.f32.s32 %f3901, %r8533;
- mov.f32 %f3902, 0fBFC90FDA;
- fma.rn.f32 %f3903, %f3901, %f3902, %f5342;
- mov.f32 %f3904, 0fB3A22168;
- fma.rn.f32 %f3905, %f3901, %f3904, %f3903;
- mov.f32 %f3906, 0fA7C234C5;
- fma.rn.f32 %f5638, %f3901, %f3906, %f3905;
- abs.f32 %f1320, %f5342;
- setp.ltu.f32 %p1004, %f1320, 0f47CE4780;
- @%p1004 bra $L__BB0_1185;
-
- setp.eq.f32 %p1005, %f1320, 0f7F800000;
- @%p1005 bra $L__BB0_1184;
- bra.uni $L__BB0_1179;
-
-$L__BB0_1184:
- mov.f32 %f3909, 0f00000000;
- mul.rn.f32 %f5638, %f5342, %f3909;
- mov.u32 %r8533, 0;
- bra.uni $L__BB0_1185;
-
-$L__BB0_1179:
- mov.b32 %r1539, %f5342;
- shr.u32 %r5812, %r1539, 23;
- and.b32 %r5813, %r5812, 255;
- add.s32 %r1540, %r5813, -128;
- shl.b32 %r5814, %r1539, 8;
- or.b32 %r1541, %r5814, -2147483648;
- shr.u32 %r1542, %r1540, 5;
- mov.u64 %rd2650, 0;
- mov.u32 %r8530, 0;
- mov.u64 %rd1829, __cudart_i2opi_f;
- mov.u64 %rd2651, %rd2650;
-
-$L__BB0_1180:
- .pragma "nounroll";
- shl.b64 %rd1828, %rd2650, 2;
- add.s64 %rd1830, %rd1829, %rd1828;
- ld.global.nc.u32 %r5815, [%rd1830];
- mad.wide.u32 %rd1831, %r5815, %r1541, %rd2651;
- shr.u64 %rd2651, %rd1831, 32;
- add.s64 %rd1832, %rd1, %rd1828;
- st.local.u32 [%rd1832], %rd1831;
- add.s32 %r8530, %r8530, 1;
- cvt.s64.s32 %rd2650, %r8530;
- setp.ne.s32 %p1006, %r8530, 6;
- @%p1006 bra $L__BB0_1180;
-
- st.local.u32 [%rd5], %rd2651;
- mov.u32 %r5816, 4;
- sub.s32 %r1545, %r5816, %r1542;
- mov.u32 %r5817, 6;
- sub.s32 %r5818, %r5817, %r1542;
- mul.wide.s32 %rd1833, %r5818, 4;
- add.s64 %rd1834, %rd1, %rd1833;
- ld.local.u32 %r8531, [%rd1834];
- ld.local.u32 %r8532, [%rd1834+-4];
- and.b32 %r1548, %r1540, 31;
- setp.eq.s32 %p1007, %r1548, 0;
- @%p1007 bra $L__BB0_1183;
-
- mov.u32 %r5819, 32;
- sub.s32 %r5820, %r5819, %r1548;
- shr.u32 %r5821, %r8532, %r5820;
- shl.b32 %r5822, %r8531, %r1548;
- add.s32 %r8531, %r5821, %r5822;
- mul.wide.s32 %rd1835, %r1545, 4;
- add.s64 %rd1836, %rd1, %rd1835;
- ld.local.u32 %r5823, [%rd1836];
- shr.u32 %r5824, %r5823, %r5820;
- shl.b32 %r5825, %r8532, %r1548;
- add.s32 %r8532, %r5824, %r5825;
-
-$L__BB0_1183:
- and.b32 %r5826, %r1539, -2147483648;
- shr.u32 %r5827, %r8532, 30;
- shl.b32 %r5828, %r8531, 2;
- or.b32 %r5829, %r5827, %r5828;
- shr.u32 %r5830, %r5829, 31;
- shr.u32 %r5831, %r8531, 30;
- add.s32 %r5832, %r5830, %r5831;
- neg.s32 %r5833, %r5832;
- setp.eq.s32 %p1008, %r5826, 0;
- selp.b32 %r8533, %r5832, %r5833, %p1008;
- setp.ne.s32 %p1009, %r5830, 0;
- xor.b32 %r5834, %r5826, -2147483648;
- selp.b32 %r5835, %r5834, %r5826, %p1009;
- selp.b32 %r5836, -1, 0, %p1009;
- xor.b32 %r5837, %r5829, %r5836;
- shl.b32 %r5838, %r8532, 2;
- xor.b32 %r5839, %r5838, %r5836;
- cvt.u64.u32 %rd1837, %r5837;
- cvt.u64.u32 %rd1838, %r5839;
- bfi.b64 %rd1839, %rd1837, %rd1838, 32, 32;
- cvt.rn.f64.s64 %fd153, %rd1839;
- mul.f64 %fd154, %fd153, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3907, %fd154;
- setp.eq.s32 %p1010, %r5835, 0;
- neg.f32 %f3908, %f3907;
- selp.f32 %f5638, %f3907, %f3908, %p1010;
-
-$L__BB0_1185:
- and.b32 %r1555, %r8533, 1;
- setp.eq.s32 %p1011, %r1555, 0;
- selp.f32 %f1324, %f5638, 0f3F800000, %p1011;
- mul.rn.f32 %f1325, %f5638, %f5638;
- mov.f32 %f5639, 0fB94D4153;
- @%p1011 bra $L__BB0_1187;
-
- mov.f32 %f3911, 0fBAB607ED;
- mov.f32 %f3912, 0f37CBAC00;
- fma.rn.f32 %f5639, %f3912, %f1325, %f3911;
-
-$L__BB0_1187:
- selp.f32 %f3913, 0f3C0885E4, 0f3D2AAABB, %p1011;
- fma.rn.f32 %f3914, %f5639, %f1325, %f3913;
- selp.f32 %f3915, 0fBE2AAAA8, 0fBEFFFFFF, %p1011;
- fma.rn.f32 %f3916, %f3914, %f1325, %f3915;
- mov.f32 %f3917, 0f00000000;
- fma.rn.f32 %f3918, %f1325, %f1324, %f3917;
- fma.rn.f32 %f5213, %f3916, %f3918, %f1324;
- and.b32 %r5841, %r8533, 2;
- setp.eq.s32 %p1013, %r5841, 0;
- @%p1013 bra $L__BB0_1189;
-
- mov.f32 %f3920, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3920, %f3917;
-
-$L__BB0_1189:
- setp.lt.s32 %p22, %r14, %r1537;
- @%p1003 bra $L__BB0_1202;
-
- mul.f32 %f3921, %f5334, 0f3F22F983;
- cvt.rni.s32.f32 %r8537, %f3921;
- cvt.rn.f32.s32 %f3922, %r8537;
- mov.f32 %f3923, 0fBFC90FDA;
- fma.rn.f32 %f3924, %f3922, %f3923, %f5334;
- mov.f32 %f3925, 0fB3A22168;
- fma.rn.f32 %f3926, %f3922, %f3925, %f3924;
- mov.f32 %f3927, 0fA7C234C5;
- fma.rn.f32 %f5642, %f3922, %f3927, %f3926;
- abs.f32 %f1333, %f5334;
- setp.ltu.f32 %p1015, %f1333, 0f47CE4780;
- @%p1015 bra $L__BB0_1198;
-
- setp.eq.f32 %p1016, %f1333, 0f7F800000;
- @%p1016 bra $L__BB0_1197;
- bra.uni $L__BB0_1192;
-
-$L__BB0_1197:
- mov.f32 %f3930, 0f00000000;
- mul.rn.f32 %f5642, %f5334, %f3930;
- mov.u32 %r8537, 0;
- bra.uni $L__BB0_1198;
-
-$L__BB0_1192:
- mov.b32 %r1557, %f5334;
- shr.u32 %r5843, %r1557, 23;
- and.b32 %r5844, %r5843, 255;
- add.s32 %r1558, %r5844, -128;
- shl.b32 %r5845, %r1557, 8;
- or.b32 %r1559, %r5845, -2147483648;
- shr.u32 %r1560, %r1558, 5;
- mov.u64 %rd2652, 0;
- mov.u32 %r8534, 0;
- mov.u64 %rd1843, __cudart_i2opi_f;
- mov.u64 %rd2653, %rd2652;
-
-$L__BB0_1193:
- .pragma "nounroll";
- shl.b64 %rd1842, %rd2652, 2;
- add.s64 %rd1844, %rd1843, %rd1842;
- ld.global.nc.u32 %r5846, [%rd1844];
- mad.wide.u32 %rd1845, %r5846, %r1559, %rd2653;
- shr.u64 %rd2653, %rd1845, 32;
- add.s64 %rd1846, %rd1, %rd1842;
- st.local.u32 [%rd1846], %rd1845;
- add.s32 %r8534, %r8534, 1;
- cvt.s64.s32 %rd2652, %r8534;
- setp.ne.s32 %p1017, %r8534, 6;
- @%p1017 bra $L__BB0_1193;
-
- st.local.u32 [%rd5], %rd2653;
- mov.u32 %r5847, 4;
- sub.s32 %r1563, %r5847, %r1560;
- mov.u32 %r5848, 6;
- sub.s32 %r5849, %r5848, %r1560;
- mul.wide.s32 %rd1847, %r5849, 4;
- add.s64 %rd1848, %rd1, %rd1847;
- ld.local.u32 %r8535, [%rd1848];
- ld.local.u32 %r8536, [%rd1848+-4];
- and.b32 %r1566, %r1558, 31;
- setp.eq.s32 %p1018, %r1566, 0;
- @%p1018 bra $L__BB0_1196;
-
- mov.u32 %r5850, 32;
- sub.s32 %r5851, %r5850, %r1566;
- shr.u32 %r5852, %r8536, %r5851;
- shl.b32 %r5853, %r8535, %r1566;
- add.s32 %r8535, %r5852, %r5853;
- mul.wide.s32 %rd1849, %r1563, 4;
- add.s64 %rd1850, %rd1, %rd1849;
- ld.local.u32 %r5854, [%rd1850];
- shr.u32 %r5855, %r5854, %r5851;
- shl.b32 %r5856, %r8536, %r1566;
- add.s32 %r8536, %r5855, %r5856;
-
-$L__BB0_1196:
- and.b32 %r5857, %r1557, -2147483648;
- shr.u32 %r5858, %r8536, 30;
- shl.b32 %r5859, %r8535, 2;
- or.b32 %r5860, %r5858, %r5859;
- shr.u32 %r5861, %r5860, 31;
- shr.u32 %r5862, %r8535, 30;
- add.s32 %r5863, %r5861, %r5862;
- neg.s32 %r5864, %r5863;
- setp.eq.s32 %p1019, %r5857, 0;
- selp.b32 %r8537, %r5863, %r5864, %p1019;
- setp.ne.s32 %p1020, %r5861, 0;
- xor.b32 %r5865, %r5857, -2147483648;
- selp.b32 %r5866, %r5865, %r5857, %p1020;
- selp.b32 %r5867, -1, 0, %p1020;
- xor.b32 %r5868, %r5860, %r5867;
- shl.b32 %r5869, %r8536, 2;
- xor.b32 %r5870, %r5869, %r5867;
- cvt.u64.u32 %rd1851, %r5868;
- cvt.u64.u32 %rd1852, %r5870;
- bfi.b64 %rd1853, %rd1851, %rd1852, 32, 32;
- cvt.rn.f64.s64 %fd155, %rd1853;
- mul.f64 %fd156, %fd155, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3928, %fd156;
- setp.eq.s32 %p1021, %r5866, 0;
- neg.f32 %f3929, %f3928;
- selp.f32 %f5642, %f3928, %f3929, %p1021;
-
-$L__BB0_1198:
- add.s32 %r1573, %r8537, 1;
- and.b32 %r1574, %r1573, 1;
- setp.eq.s32 %p1022, %r1574, 0;
- selp.f32 %f1337, %f5642, 0f3F800000, %p1022;
- mul.rn.f32 %f1338, %f5642, %f5642;
- mov.f32 %f5643, 0fB94D4153;
- @%p1022 bra $L__BB0_1200;
-
- mov.f32 %f3932, 0fBAB607ED;
- mov.f32 %f3933, 0f37CBAC00;
- fma.rn.f32 %f5643, %f3933, %f1338, %f3932;
-
-$L__BB0_1200:
- selp.f32 %f3934, 0f3C0885E4, 0f3D2AAABB, %p1022;
- fma.rn.f32 %f3935, %f5643, %f1338, %f3934;
- selp.f32 %f3936, 0fBE2AAAA8, 0fBEFFFFFF, %p1022;
- fma.rn.f32 %f3937, %f3935, %f1338, %f3936;
- mov.f32 %f3938, 0f00000000;
- fma.rn.f32 %f3939, %f1338, %f1337, %f3938;
- fma.rn.f32 %f5215, %f3937, %f3939, %f1337;
- and.b32 %r5872, %r1573, 2;
- setp.eq.s32 %p1024, %r5872, 0;
- @%p1024 bra $L__BB0_1202;
-
- mov.f32 %f3941, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3941, %f3938;
-
-$L__BB0_1202:
- selp.f32 %f1345, %f5215, %f5216, %p22;
- selp.f32 %f1346, %f5213, %f5214, %p22;
- @%p1003 bra $L__BB0_1204;
-
- add.f32 %f5708, %f1346, %f1345;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1204:
- @%p832 bra $L__BB0_1426;
-
- shl.b32 %r5874, %r12, 5;
- mov.u32 %r5875, -32;
- sub.s32 %r1575, %r5875, %r5874;
- setp.ge.s32 %p1028, %r14, %r1575;
- @%p1028 bra $L__BB0_1218;
-
- mul.f32 %f3944, %f5341, 0f3F22F983;
- cvt.rni.s32.f32 %r8541, %f3944;
- cvt.rn.f32.s32 %f3945, %r8541;
- mov.f32 %f3946, 0fBFC90FDA;
- fma.rn.f32 %f3947, %f3945, %f3946, %f5341;
- mov.f32 %f3948, 0fB3A22168;
- fma.rn.f32 %f3949, %f3945, %f3948, %f3947;
- mov.f32 %f3950, 0fA7C234C5;
- fma.rn.f32 %f5651, %f3945, %f3950, %f3949;
- abs.f32 %f1354, %f5341;
- setp.ltu.f32 %p1029, %f1354, 0f47CE4780;
- @%p1029 bra $L__BB0_1214;
-
- setp.eq.f32 %p1030, %f1354, 0f7F800000;
- @%p1030 bra $L__BB0_1213;
- bra.uni $L__BB0_1208;
-
-$L__BB0_1213:
- mov.f32 %f3953, 0f00000000;
- mul.rn.f32 %f5651, %f5341, %f3953;
- mov.u32 %r8541, 0;
- bra.uni $L__BB0_1214;
-
-$L__BB0_1208:
- mov.b32 %r1577, %f5341;
- shr.u32 %r5877, %r1577, 23;
- and.b32 %r5878, %r5877, 255;
- add.s32 %r1578, %r5878, -128;
- shl.b32 %r5879, %r1577, 8;
- or.b32 %r1579, %r5879, -2147483648;
- shr.u32 %r1580, %r1578, 5;
- mov.u64 %rd2656, 0;
- mov.u32 %r8538, 0;
- mov.u64 %rd2654, __cudart_i2opi_f;
- mov.u64 %rd2655, %rd1;
-
-$L__BB0_1209:
- .pragma "nounroll";
- ld.global.nc.u32 %r5880, [%rd2654];
- mad.wide.u32 %rd1856, %r5880, %r1579, %rd2656;
- shr.u64 %rd2656, %rd1856, 32;
- st.local.u32 [%rd2655], %rd1856;
- add.s64 %rd2655, %rd2655, 4;
- add.s64 %rd2654, %rd2654, 4;
- add.s32 %r8538, %r8538, 1;
- setp.ne.s32 %p1031, %r8538, 6;
- @%p1031 bra $L__BB0_1209;
-
- st.local.u32 [%rd5], %rd2656;
- mov.u32 %r5881, 4;
- sub.s32 %r1583, %r5881, %r1580;
- mov.u32 %r5882, 6;
- sub.s32 %r5883, %r5882, %r1580;
- mul.wide.s32 %rd1857, %r5883, 4;
- add.s64 %rd1858, %rd1, %rd1857;
- ld.local.u32 %r8539, [%rd1858];
- ld.local.u32 %r8540, [%rd1858+-4];
- and.b32 %r1586, %r1578, 31;
- setp.eq.s32 %p1032, %r1586, 0;
- @%p1032 bra $L__BB0_1212;
-
- mov.u32 %r5884, 32;
- sub.s32 %r5885, %r5884, %r1586;
- shr.u32 %r5886, %r8540, %r5885;
- shl.b32 %r5887, %r8539, %r1586;
- add.s32 %r8539, %r5886, %r5887;
- mul.wide.s32 %rd1859, %r1583, 4;
- add.s64 %rd1860, %rd1, %rd1859;
- ld.local.u32 %r5888, [%rd1860];
- shr.u32 %r5889, %r5888, %r5885;
- shl.b32 %r5890, %r8540, %r1586;
- add.s32 %r8540, %r5889, %r5890;
-
-$L__BB0_1212:
- and.b32 %r5891, %r1577, -2147483648;
- shr.u32 %r5892, %r8540, 30;
- shl.b32 %r5893, %r8539, 2;
- or.b32 %r5894, %r5892, %r5893;
- shr.u32 %r5895, %r5894, 31;
- shr.u32 %r5896, %r8539, 30;
- add.s32 %r5897, %r5895, %r5896;
- neg.s32 %r5898, %r5897;
- setp.eq.s32 %p1033, %r5891, 0;
- selp.b32 %r8541, %r5897, %r5898, %p1033;
- setp.ne.s32 %p1034, %r5895, 0;
- xor.b32 %r5899, %r5891, -2147483648;
- selp.b32 %r5900, %r5899, %r5891, %p1034;
- selp.b32 %r5901, -1, 0, %p1034;
- xor.b32 %r5902, %r5894, %r5901;
- shl.b32 %r5903, %r8540, 2;
- xor.b32 %r5904, %r5903, %r5901;
- cvt.u64.u32 %rd1861, %r5902;
- cvt.u64.u32 %rd1862, %r5904;
- bfi.b64 %rd1863, %rd1861, %rd1862, 32, 32;
- cvt.rn.f64.s64 %fd157, %rd1863;
- mul.f64 %fd158, %fd157, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3951, %fd158;
- setp.eq.s32 %p1035, %r5900, 0;
- neg.f32 %f3952, %f3951;
- selp.f32 %f5651, %f3951, %f3952, %p1035;
-
-$L__BB0_1214:
- and.b32 %r1593, %r8541, 1;
- setp.eq.s32 %p1036, %r1593, 0;
- selp.f32 %f1358, %f5651, 0f3F800000, %p1036;
- mul.rn.f32 %f1359, %f5651, %f5651;
- mov.f32 %f5652, 0fB94D4153;
- @%p1036 bra $L__BB0_1216;
-
- mov.f32 %f3955, 0fBAB607ED;
- mov.f32 %f3956, 0f37CBAC00;
- fma.rn.f32 %f5652, %f3956, %f1359, %f3955;
-
-$L__BB0_1216:
- selp.f32 %f3957, 0f3C0885E4, 0f3D2AAABB, %p1036;
- fma.rn.f32 %f3958, %f5652, %f1359, %f3957;
- selp.f32 %f3959, 0fBE2AAAA8, 0fBEFFFFFF, %p1036;
- fma.rn.f32 %f3960, %f3958, %f1359, %f3959;
- mov.f32 %f3961, 0f00000000;
- fma.rn.f32 %f3962, %f1359, %f1358, %f3961;
- fma.rn.f32 %f5213, %f3960, %f3962, %f1358;
- and.b32 %r5906, %r8541, 2;
- setp.eq.s32 %p1038, %r5906, 0;
- @%p1038 bra $L__BB0_1218;
-
- mov.f32 %f3964, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f3964, %f3961;
-
-$L__BB0_1218:
- setp.lt.s32 %p23, %r14, %r1575;
- @%p1028 bra $L__BB0_1231;
-
- mul.f32 %f3965, %f5333, 0f3F22F983;
- cvt.rni.s32.f32 %r8545, %f3965;
- cvt.rn.f32.s32 %f3966, %r8545;
- mov.f32 %f3967, 0fBFC90FDA;
- fma.rn.f32 %f3968, %f3966, %f3967, %f5333;
- mov.f32 %f3969, 0fB3A22168;
- fma.rn.f32 %f3970, %f3966, %f3969, %f3968;
- mov.f32 %f3971, 0fA7C234C5;
- fma.rn.f32 %f5655, %f3966, %f3971, %f3970;
- abs.f32 %f1367, %f5333;
- setp.ltu.f32 %p1040, %f1367, 0f47CE4780;
- @%p1040 bra $L__BB0_1227;
-
- setp.eq.f32 %p1041, %f1367, 0f7F800000;
- @%p1041 bra $L__BB0_1226;
- bra.uni $L__BB0_1221;
-
-$L__BB0_1226:
- mov.f32 %f3974, 0f00000000;
- mul.rn.f32 %f5655, %f5333, %f3974;
- mov.u32 %r8545, 0;
- bra.uni $L__BB0_1227;
-
-$L__BB0_1221:
- mov.b32 %r1595, %f5333;
- shr.u32 %r5908, %r1595, 23;
- and.b32 %r5909, %r5908, 255;
- add.s32 %r1596, %r5909, -128;
- shl.b32 %r5910, %r1595, 8;
- or.b32 %r1597, %r5910, -2147483648;
- shr.u32 %r1598, %r1596, 5;
- mov.u64 %rd2659, 0;
- mov.u32 %r8542, 0;
- mov.u64 %rd2657, __cudart_i2opi_f;
- mov.u64 %rd2658, %rd1;
-
-$L__BB0_1222:
- .pragma "nounroll";
- ld.global.nc.u32 %r5911, [%rd2657];
- mad.wide.u32 %rd1866, %r5911, %r1597, %rd2659;
- shr.u64 %rd2659, %rd1866, 32;
- st.local.u32 [%rd2658], %rd1866;
- add.s64 %rd2658, %rd2658, 4;
- add.s64 %rd2657, %rd2657, 4;
- add.s32 %r8542, %r8542, 1;
- setp.ne.s32 %p1042, %r8542, 6;
- @%p1042 bra $L__BB0_1222;
-
- st.local.u32 [%rd5], %rd2659;
- mov.u32 %r5912, 4;
- sub.s32 %r1601, %r5912, %r1598;
- mov.u32 %r5913, 6;
- sub.s32 %r5914, %r5913, %r1598;
- mul.wide.s32 %rd1867, %r5914, 4;
- add.s64 %rd1868, %rd1, %rd1867;
- ld.local.u32 %r8543, [%rd1868];
- ld.local.u32 %r8544, [%rd1868+-4];
- and.b32 %r1604, %r1596, 31;
- setp.eq.s32 %p1043, %r1604, 0;
- @%p1043 bra $L__BB0_1225;
-
- mov.u32 %r5915, 32;
- sub.s32 %r5916, %r5915, %r1604;
- shr.u32 %r5917, %r8544, %r5916;
- shl.b32 %r5918, %r8543, %r1604;
- add.s32 %r8543, %r5917, %r5918;
- mul.wide.s32 %rd1869, %r1601, 4;
- add.s64 %rd1870, %rd1, %rd1869;
- ld.local.u32 %r5919, [%rd1870];
- shr.u32 %r5920, %r5919, %r5916;
- shl.b32 %r5921, %r8544, %r1604;
- add.s32 %r8544, %r5920, %r5921;
-
-$L__BB0_1225:
- and.b32 %r5922, %r1595, -2147483648;
- shr.u32 %r5923, %r8544, 30;
- shl.b32 %r5924, %r8543, 2;
- or.b32 %r5925, %r5923, %r5924;
- shr.u32 %r5926, %r5925, 31;
- shr.u32 %r5927, %r8543, 30;
- add.s32 %r5928, %r5926, %r5927;
- neg.s32 %r5929, %r5928;
- setp.eq.s32 %p1044, %r5922, 0;
- selp.b32 %r8545, %r5928, %r5929, %p1044;
- setp.ne.s32 %p1045, %r5926, 0;
- xor.b32 %r5930, %r5922, -2147483648;
- selp.b32 %r5931, %r5930, %r5922, %p1045;
- selp.b32 %r5932, -1, 0, %p1045;
- xor.b32 %r5933, %r5925, %r5932;
- shl.b32 %r5934, %r8544, 2;
- xor.b32 %r5935, %r5934, %r5932;
- cvt.u64.u32 %rd1871, %r5933;
- cvt.u64.u32 %rd1872, %r5935;
- bfi.b64 %rd1873, %rd1871, %rd1872, 32, 32;
- cvt.rn.f64.s64 %fd159, %rd1873;
- mul.f64 %fd160, %fd159, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f3972, %fd160;
- setp.eq.s32 %p1046, %r5931, 0;
- neg.f32 %f3973, %f3972;
- selp.f32 %f5655, %f3972, %f3973, %p1046;
-
-$L__BB0_1227:
- add.s32 %r1611, %r8545, 1;
- and.b32 %r1612, %r1611, 1;
- setp.eq.s32 %p1047, %r1612, 0;
- selp.f32 %f1371, %f5655, 0f3F800000, %p1047;
- mul.rn.f32 %f1372, %f5655, %f5655;
- mov.f32 %f5656, 0fB94D4153;
- @%p1047 bra $L__BB0_1229;
-
- mov.f32 %f3976, 0fBAB607ED;
- mov.f32 %f3977, 0f37CBAC00;
- fma.rn.f32 %f5656, %f3977, %f1372, %f3976;
-
-$L__BB0_1229:
- selp.f32 %f3978, 0f3C0885E4, 0f3D2AAABB, %p1047;
- fma.rn.f32 %f3979, %f5656, %f1372, %f3978;
- selp.f32 %f3980, 0fBE2AAAA8, 0fBEFFFFFF, %p1047;
- fma.rn.f32 %f3981, %f3979, %f1372, %f3980;
- mov.f32 %f3982, 0f00000000;
- fma.rn.f32 %f3983, %f1372, %f1371, %f3982;
- fma.rn.f32 %f5215, %f3981, %f3983, %f1371;
- and.b32 %r5937, %r1611, 2;
- setp.eq.s32 %p1049, %r5937, 0;
- @%p1049 bra $L__BB0_1231;
-
- mov.f32 %f3985, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f3985, %f3982;
-
-$L__BB0_1231:
- selp.f32 %f1379, %f5215, %f5216, %p23;
- selp.f32 %f1380, %f5213, %f5214, %p23;
- @%p1028 bra $L__BB0_1426;
-
- add.f32 %f5707, %f1380, %f1379;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1426:
- @%p32 bra $L__BB0_1428;
-
- shl.b32 %r6509, %r12, 2;
- mov.u32 %r6510, -12;
- sub.s32 %r6511, %r6510, %r6509;
- add.s32 %r6512, %r13, -12;
- setp.lt.s32 %p1213, %r6512, %r6511;
- @%p1213 bra $L__BB0_1708;
- bra.uni $L__BB0_1428;
-
-$L__BB0_1708:
- mov.u32 %r7240, %ctaid.x;
- mul.lo.s32 %r7241, %r2614, %r7240;
- shl.b32 %r7242, %r12, 5;
- add.s32 %r7243, %r7242, %r1;
- mul.hi.s32 %r7244, %r7243, -1840700269;
- add.s32 %r7245, %r7244, %r7243;
- shr.u32 %r7246, %r7245, 31;
- shr.s32 %r7247, %r7245, 2;
- add.s32 %r7248, %r7247, %r7246;
- mul.lo.s32 %r7249, %r7248, %r2615;
- mul.lo.s32 %r7250, %r7248, 7;
- sub.s32 %r7251, %r7243, %r7250;
- mul.lo.s32 %r7252, %r7251, %r2616;
- add.s32 %r7253, %r13, 2;
- shl.b32 %r7254, %r2612, 1;
- add.s32 %r7255, %r7254, %r7241;
- mad.lo.s32 %r7256, %r7253, %r2613, %r7255;
- add.s32 %r7257, %r7256, %r7249;
- add.s32 %r7258, %r7257, %r7252;
- mul.wide.s32 %rd2287, %r7258, 4;
- add.s64 %rd2288, %rd3, %rd2287;
- ld.global.f32 %f1925, [%rd2288];
- add.s32 %r7259, %r7243, 32;
- mul.hi.s32 %r7260, %r7259, -1840700269;
- add.s32 %r7261, %r7260, %r7259;
- shr.u32 %r7262, %r7261, 31;
- shr.s32 %r7263, %r7261, 2;
- add.s32 %r7264, %r7263, %r7262;
- mul.lo.s32 %r7265, %r7264, %r2615;
- mul.lo.s32 %r7266, %r7264, 7;
- sub.s32 %r7267, %r7259, %r7266;
- mul.lo.s32 %r7268, %r7267, %r2616;
- add.s32 %r7269, %r7256, %r7265;
- add.s32 %r7270, %r7269, %r7268;
- mul.wide.s32 %rd2289, %r7270, 4;
- add.s64 %rd2290, %rd3, %rd2289;
- ld.global.f32 %f1926, [%rd2290];
- mul.wide.s32 %rd2291, %r2613, 4;
- add.s64 %rd2292, %rd2288, %rd2291;
- ld.global.f32 %f1927, [%rd2292];
- add.s64 %rd2293, %rd2290, %rd2291;
- ld.global.f32 %f1928, [%rd2293];
- add.s64 %rd2294, %rd2292, %rd2291;
- ld.global.f32 %f1929, [%rd2294];
- add.s64 %rd2295, %rd2293, %rd2291;
- ld.global.f32 %f1930, [%rd2295];
- mad.lo.s32 %r7271, %r2612, 3, %r7241;
- mad.lo.s32 %r7272, %r13, %r2613, %r7271;
- add.s32 %r7273, %r7272, %r7249;
- add.s32 %r7274, %r7273, %r7252;
- mul.wide.s32 %rd2296, %r7274, 4;
- add.s64 %rd2297, %rd3, %rd2296;
- ld.global.f32 %f1931, [%rd2297];
- add.s32 %r7275, %r7272, %r7265;
- add.s32 %r7276, %r7275, %r7268;
- mul.wide.s32 %rd2298, %r7276, 4;
- add.s64 %rd2299, %rd3, %rd2298;
- ld.global.f32 %f1932, [%rd2299];
- mul.lo.s32 %r7277, %r2604, %r7240;
- mul.hi.s32 %r7278, %r7243, 954437177;
- shr.u32 %r7279, %r7278, 31;
- shr.s32 %r7280, %r7278, 1;
- add.s32 %r7281, %r7280, %r7279;
- mul.lo.s32 %r7282, %r7281, %r2605;
- mul.lo.s32 %r7283, %r7281, 9;
- sub.s32 %r7284, %r7243, %r7283;
- mul.lo.s32 %r7285, %r7284, %r2606;
- shl.b32 %r7286, %r2602, 2;
- add.s32 %r7287, %r7286, %r7277;
- mul.lo.s32 %r7288, %r13, %r2603;
- add.s32 %r7289, %r7287, %r7288;
- add.s32 %r7290, %r7289, %r7282;
- add.s32 %r7291, %r7290, %r7285;
- mul.wide.s32 %rd2300, %r7291, 4;
- add.s64 %rd2301, %rd2, %rd2300;
- ld.global.f32 %f1933, [%rd2301];
- mul.hi.s32 %r7292, %r7259, 954437177;
- shr.u32 %r7293, %r7292, 31;
- shr.s32 %r7294, %r7292, 1;
- add.s32 %r7295, %r7294, %r7293;
- mul.lo.s32 %r7296, %r7295, %r2605;
- mul.lo.s32 %r7297, %r7295, 9;
- sub.s32 %r7298, %r7259, %r7297;
- mul.lo.s32 %r7299, %r7298, %r2606;
- add.s32 %r7300, %r7289, %r7296;
- add.s32 %r7301, %r7300, %r7299;
- mul.wide.s32 %rd2302, %r7301, 4;
- add.s64 %rd2303, %rd2, %rd2302;
- ld.global.f32 %f1934, [%rd2303];
- mul.wide.s32 %rd2304, %r2603, 4;
- add.s64 %rd2305, %rd2301, %rd2304;
- ld.global.f32 %f1935, [%rd2305];
- add.s64 %rd2306, %rd2303, %rd2304;
- ld.global.f32 %f1936, [%rd2306];
- add.s64 %rd2307, %rd2305, %rd2304;
- ld.global.f32 %f1937, [%rd2307];
- add.s64 %rd2308, %rd2306, %rd2304;
- ld.global.f32 %f1938, [%rd2308];
- mad.lo.s32 %r7302, %r2602, 5, %r7277;
- add.s32 %r7303, %r7302, %r7288;
- add.s32 %r7304, %r7303, %r7282;
- add.s32 %r7305, %r7304, %r7285;
- mul.wide.s32 %rd2309, %r7305, 4;
- add.s64 %rd2310, %rd2, %rd2309;
- ld.global.f32 %f1939, [%rd2310];
- add.s32 %r7306, %r7303, %r7296;
- add.s32 %r7307, %r7306, %r7299;
- mul.wide.s32 %rd2311, %r7307, 4;
- add.s64 %rd2312, %rd2, %rd2311;
- ld.global.f32 %f1940, [%rd2312];
- mul.f32 %f4675, %f1933, 0f3F22F983;
- cvt.rni.s32.f32 %r8677, %f4675;
- cvt.rn.f32.s32 %f4676, %r8677;
- mov.f32 %f4677, 0fBFC90FDA;
- fma.rn.f32 %f4678, %f4676, %f4677, %f1933;
- mov.f32 %f4679, 0fB3A22168;
- fma.rn.f32 %f4680, %f4676, %f4679, %f4678;
- mov.f32 %f4681, 0fA7C234C5;
- fma.rn.f32 %f5850, %f4676, %f4681, %f4680;
- abs.f32 %f1942, %f1933;
- setp.ltu.f32 %p1447, %f1942, 0f47CE4780;
- @%p1447 bra $L__BB0_1716;
-
- setp.eq.f32 %p1448, %f1942, 0f7F800000;
- @%p1448 bra $L__BB0_1715;
- bra.uni $L__BB0_1710;
-
-$L__BB0_1715:
- mov.f32 %f4684, 0f00000000;
- mul.rn.f32 %f5850, %f1933, %f4684;
- mov.u32 %r8677, 0;
- bra.uni $L__BB0_1716;
-
-$L__BB0_1428:
- mov.u32 %r1909, %ctaid.x;
- mul.lo.s32 %r1910, %r2614, %r1909;
- add.s32 %r6513, %r13, -15;
- mov.u32 %r6514, -12;
- sub.s32 %r1911, %r6514, %r12;
- setp.ge.s32 %p1214, %r6513, %r1911;
- add.s32 %r6515, %r13, 2;
- shl.b32 %r6516, %r2612, 1;
- add.s32 %r6517, %r6516, %r1910;
- mad.lo.s32 %r1912, %r6515, %r2613, %r6517;
- @%p1214 bra $L__BB0_1431;
-
- shl.b32 %r1913, %r12, 5;
- neg.s32 %r6518, %r1913;
- setp.ge.s32 %p1215, %r14, %r6518;
- @%p1215 bra $L__BB0_1431;
-
- add.s32 %r6519, %r1913, %r1;
- mul.hi.s32 %r6520, %r6519, -1840700269;
- add.s32 %r6521, %r6520, %r6519;
- shr.u32 %r6522, %r6521, 31;
- shr.s32 %r6523, %r6521, 2;
- add.s32 %r6524, %r6523, %r6522;
- mul.lo.s32 %r6525, %r6524, 7;
- sub.s32 %r6526, %r6519, %r6525;
- mad.lo.s32 %r6527, %r6524, %r2615, %r1912;
- mad.lo.s32 %r6528, %r6526, %r2616, %r6527;
- mul.wide.s32 %rd2095, %r6528, 4;
- add.s64 %rd2096, %rd3, %rd2095;
- ld.global.f32 %f5531, [%rd2096];
-
-$L__BB0_1431:
- @%p1214 bra $L__BB0_1434;
-
- shl.b32 %r1914, %r12, 5;
- mov.u32 %r6530, -32;
- sub.s32 %r6531, %r6530, %r1914;
- setp.ge.s32 %p1217, %r14, %r6531;
- @%p1217 bra $L__BB0_1434;
-
- add.s32 %r6532, %r1914, %r1;
- add.s32 %r6533, %r6532, 32;
- mul.hi.s32 %r6534, %r6533, -1840700269;
- add.s32 %r6535, %r6534, %r6533;
- shr.u32 %r6536, %r6535, 31;
- shr.s32 %r6537, %r6535, 2;
- add.s32 %r6538, %r6537, %r6536;
- mul.lo.s32 %r6539, %r6538, 7;
- sub.s32 %r6540, %r6533, %r6539;
- mad.lo.s32 %r6541, %r6538, %r2615, %r1912;
- mad.lo.s32 %r6542, %r6540, %r2616, %r6541;
- mul.wide.s32 %rd2097, %r6542, 4;
- add.s64 %rd2098, %rd3, %rd2097;
- ld.global.f32 %f5339, [%rd2098];
-
-$L__BB0_1434:
- mov.u32 %r6544, -13;
- sub.s32 %r1915, %r6544, %r12;
- setp.ge.s32 %p1218, %r6513, %r1915;
- add.s32 %r1916, %r1912, %r2613;
- @%p1218 bra $L__BB0_1437;
-
- shl.b32 %r1917, %r12, 5;
- neg.s32 %r6545, %r1917;
- setp.ge.s32 %p1219, %r14, %r6545;
- @%p1219 bra $L__BB0_1437;
-
- add.s32 %r6546, %r1917, %r1;
- mul.hi.s32 %r6547, %r6546, -1840700269;
- add.s32 %r6548, %r6547, %r6546;
- shr.u32 %r6549, %r6548, 31;
- shr.s32 %r6550, %r6548, 2;
- add.s32 %r6551, %r6550, %r6549;
- mul.lo.s32 %r6552, %r6551, 7;
- sub.s32 %r6553, %r6546, %r6552;
- mad.lo.s32 %r6554, %r6551, %r2615, %r1916;
- mad.lo.s32 %r6555, %r6553, %r2616, %r6554;
- mul.wide.s32 %rd2099, %r6555, 4;
- add.s64 %rd2100, %rd3, %rd2099;
- ld.global.f32 %f5338, [%rd2100];
-
-$L__BB0_1437:
- @%p1218 bra $L__BB0_1440;
-
- shl.b32 %r1918, %r12, 5;
- mov.u32 %r6557, -32;
- sub.s32 %r6558, %r6557, %r1918;
- setp.ge.s32 %p1221, %r14, %r6558;
- @%p1221 bra $L__BB0_1440;
-
- add.s32 %r6559, %r1918, %r1;
- add.s32 %r6560, %r6559, 32;
- mul.hi.s32 %r6561, %r6560, -1840700269;
- add.s32 %r6562, %r6561, %r6560;
- shr.u32 %r6563, %r6562, 31;
- shr.s32 %r6564, %r6562, 2;
- add.s32 %r6565, %r6564, %r6563;
- mul.lo.s32 %r6566, %r6565, 7;
- sub.s32 %r6567, %r6560, %r6566;
- mad.lo.s32 %r6568, %r6565, %r2615, %r1916;
- mad.lo.s32 %r6569, %r6567, %r2616, %r6568;
- mul.wide.s32 %rd2101, %r6569, 4;
- add.s64 %rd2102, %rd3, %rd2101;
- ld.global.f32 %f5337, [%rd2102];
-
-$L__BB0_1440:
- mov.u32 %r6571, -14;
- sub.s32 %r1919, %r6571, %r12;
- setp.ge.s32 %p1222, %r6513, %r1919;
- add.s32 %r1920, %r1916, %r2613;
- @%p1222 bra $L__BB0_1443;
-
- shl.b32 %r1921, %r12, 5;
- neg.s32 %r6572, %r1921;
- setp.ge.s32 %p1223, %r14, %r6572;
- @%p1223 bra $L__BB0_1443;
-
- add.s32 %r6573, %r1921, %r1;
- mul.hi.s32 %r6574, %r6573, -1840700269;
- add.s32 %r6575, %r6574, %r6573;
- shr.u32 %r6576, %r6575, 31;
- shr.s32 %r6577, %r6575, 2;
- add.s32 %r6578, %r6577, %r6576;
- mul.lo.s32 %r6579, %r6578, 7;
- sub.s32 %r6580, %r6573, %r6579;
- mad.lo.s32 %r6581, %r6578, %r2615, %r1920;
- mad.lo.s32 %r6582, %r6580, %r2616, %r6581;
- mul.wide.s32 %rd2103, %r6582, 4;
- add.s64 %rd2104, %rd3, %rd2103;
- ld.global.f32 %f5336, [%rd2104];
-
-$L__BB0_1443:
- @%p1222 bra $L__BB0_1446;
-
- shl.b32 %r1922, %r12, 5;
- mov.u32 %r6584, -32;
- sub.s32 %r6585, %r6584, %r1922;
- setp.ge.s32 %p1225, %r14, %r6585;
- @%p1225 bra $L__BB0_1446;
-
- add.s32 %r6586, %r1922, %r1;
- add.s32 %r6587, %r6586, 32;
- mul.hi.s32 %r6588, %r6587, -1840700269;
- add.s32 %r6589, %r6588, %r6587;
- shr.u32 %r6590, %r6589, 31;
- shr.s32 %r6591, %r6589, 2;
- add.s32 %r6592, %r6591, %r6590;
- mul.lo.s32 %r6593, %r6592, 7;
- sub.s32 %r6594, %r6587, %r6593;
- mad.lo.s32 %r6595, %r6592, %r2615, %r1920;
- mad.lo.s32 %r6596, %r6594, %r2616, %r6595;
- mul.wide.s32 %rd2105, %r6596, 4;
- add.s64 %rd2106, %rd3, %rd2105;
- ld.global.f32 %f5335, [%rd2106];
-
-$L__BB0_1446:
- mov.u32 %r6598, -15;
- sub.s32 %r1923, %r6598, %r12;
- setp.ge.s32 %p1226, %r6513, %r1923;
- mad.lo.s32 %r6599, %r2612, 3, %r1910;
- mad.lo.s32 %r1924, %r13, %r2613, %r6599;
- @%p1226 bra $L__BB0_1449;
-
- shl.b32 %r1925, %r12, 5;
- neg.s32 %r6600, %r1925;
- setp.ge.s32 %p1227, %r14, %r6600;
- @%p1227 bra $L__BB0_1449;
-
- add.s32 %r6601, %r1925, %r1;
- mul.hi.s32 %r6602, %r6601, -1840700269;
- add.s32 %r6603, %r6602, %r6601;
- shr.u32 %r6604, %r6603, 31;
- shr.s32 %r6605, %r6603, 2;
- add.s32 %r6606, %r6605, %r6604;
- mul.lo.s32 %r6607, %r6606, 7;
- sub.s32 %r6608, %r6601, %r6607;
- mad.lo.s32 %r6609, %r6606, %r2615, %r1924;
- mad.lo.s32 %r6610, %r6608, %r2616, %r6609;
- mul.wide.s32 %rd2107, %r6610, 4;
- add.s64 %rd2108, %rd3, %rd2107;
- ld.global.f32 %f5334, [%rd2108];
-
-$L__BB0_1449:
- @%p1226 bra $L__BB0_1452;
-
- shl.b32 %r1926, %r12, 5;
- mov.u32 %r6612, -32;
- sub.s32 %r6613, %r6612, %r1926;
- setp.ge.s32 %p1229, %r14, %r6613;
- @%p1229 bra $L__BB0_1452;
-
- add.s32 %r6614, %r1926, %r1;
- add.s32 %r6615, %r6614, 32;
- mul.hi.s32 %r6616, %r6615, -1840700269;
- add.s32 %r6617, %r6616, %r6615;
- shr.u32 %r6618, %r6617, 31;
- shr.s32 %r6619, %r6617, 2;
- add.s32 %r6620, %r6619, %r6618;
- mul.lo.s32 %r6621, %r6620, 7;
- sub.s32 %r6622, %r6615, %r6621;
- mad.lo.s32 %r6623, %r6620, %r2615, %r1924;
- mad.lo.s32 %r6624, %r6622, %r2616, %r6623;
- mul.wide.s32 %rd2109, %r6624, 4;
- add.s64 %rd2110, %rd3, %rd2109;
- ld.global.f32 %f5333, [%rd2110];
+ add.s32 %r8351, %r8351, 1;
+ setp.ne.s32 %p1222, %r8351, 6;
+ @%p1222 bra $L__BB0_1445;
+
+ st.local.u32 [%rd4], %rd2699;
+ mov.u32 %r6568, 4;
+ sub.s32 %r1901, %r6568, %r1898;
+ mov.u32 %r6569, 6;
+ sub.s32 %r6570, %r6569, %r1898;
+ mul.wide.s32 %rd2146, %r6570, 4;
+ add.s64 %rd2147, %rd1, %rd2146;
+ ld.local.u32 %r8352, [%rd2147];
+ ld.local.u32 %r8353, [%rd2147+-4];
+ and.b32 %r1904, %r1896, 31;
+ setp.eq.s32 %p1223, %r1904, 0;
+ @%p1223 bra $L__BB0_1448;
+
+ mov.u32 %r6571, 32;
+ sub.s32 %r6572, %r6571, %r1904;
+ shr.u32 %r6573, %r8353, %r6572;
+ shl.b32 %r6574, %r8352, %r1904;
+ add.s32 %r8352, %r6573, %r6574;
+ mul.wide.s32 %rd2148, %r1901, 4;
+ add.s64 %rd2149, %rd1, %rd2148;
+ ld.local.u32 %r6575, [%rd2149];
+ shr.u32 %r6576, %r6575, %r6572;
+ shl.b32 %r6577, %r8353, %r1904;
+ add.s32 %r8353, %r6576, %r6577;
+
+$L__BB0_1448:
+ and.b32 %r6578, %r1895, -2147483648;
+ shr.u32 %r6579, %r8353, 30;
+ shl.b32 %r6580, %r8352, 2;
+ or.b32 %r6581, %r6579, %r6580;
+ shr.u32 %r6582, %r6581, 31;
+ shr.u32 %r6583, %r8352, 30;
+ add.s32 %r6584, %r6582, %r6583;
+ neg.s32 %r6585, %r6584;
+ setp.eq.s32 %p1224, %r6578, 0;
+ selp.b32 %r8354, %r6584, %r6585, %p1224;
+ setp.ne.s32 %p1225, %r6582, 0;
+ xor.b32 %r6586, %r6578, -2147483648;
+ selp.b32 %r6587, %r6586, %r6578, %p1225;
+ selp.b32 %r6588, -1, 0, %p1225;
+ xor.b32 %r6589, %r6581, %r6588;
+ shl.b32 %r6590, %r8353, 2;
+ xor.b32 %r6591, %r6590, %r6588;
+ cvt.u64.u32 %rd2150, %r6589;
+ cvt.u64.u32 %rd2151, %r6591;
+ bfi.b64 %rd2152, %rd2150, %rd2151, 32, 32;
+ cvt.rn.f64.s64 %fd193, %rd2152;
+ mul.f64 %fd194, %fd193, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4387, %fd194;
+ setp.eq.s32 %p1226, %r6587, 0;
+ neg.f32 %f4388, %f4387;
+ selp.f32 %f5843, %f4387, %f4388, %p1226;
+
+$L__BB0_1450:
+ and.b32 %r1911, %r8354, 1;
+ setp.eq.s32 %p1227, %r1911, 0;
+ selp.f32 %f1695, %f5843, 0f3F800000, %p1227;
+ mul.rn.f32 %f1696, %f5843, %f5843;
+ mov.f32 %f5844, 0fB94D4153;
+ @%p1227 bra $L__BB0_1452;
+
+ mov.f32 %f4391, 0fBAB607ED;
+ mov.f32 %f4392, 0f37CBAC00;
+ fma.rn.f32 %f5844, %f4392, %f1696, %f4391;
$L__BB0_1452:
- shl.b32 %r6626, %r2602, 2;
- mul.lo.s32 %r1927, %r2604, %r1909;
- add.s32 %r6627, %r6626, %r1927;
- mul.lo.s32 %r1928, %r13, %r2603;
- add.s32 %r1929, %r6627, %r1928;
- @%p1214 bra $L__BB0_1455;
-
- shl.b32 %r1930, %r12, 5;
- neg.s32 %r6628, %r1930;
- setp.ge.s32 %p1231, %r14, %r6628;
- @%p1231 bra $L__BB0_1455;
-
- add.s32 %r6629, %r1930, %r1;
- mul.hi.s32 %r6630, %r6629, 954437177;
- shr.u32 %r6631, %r6630, 31;
- shr.s32 %r6632, %r6630, 1;
- add.s32 %r6633, %r6632, %r6631;
- mul.lo.s32 %r6634, %r6633, 9;
- sub.s32 %r6635, %r6629, %r6634;
- mad.lo.s32 %r6636, %r6633, %r2605, %r1929;
- mad.lo.s32 %r6637, %r6635, %r2606, %r6636;
- mul.wide.s32 %rd2111, %r6637, 4;
- add.s64 %rd2112, %rd2, %rd2111;
- ld.global.f32 %f5348, [%rd2112];
-
-$L__BB0_1455:
- @%p1214 bra $L__BB0_1458;
-
- shl.b32 %r1931, %r12, 5;
- mov.u32 %r6639, -32;
- sub.s32 %r6640, %r6639, %r1931;
- setp.ge.s32 %p1233, %r14, %r6640;
- @%p1233 bra $L__BB0_1458;
-
- add.s32 %r6641, %r1931, %r1;
- add.s32 %r6642, %r6641, 32;
- mul.hi.s32 %r6643, %r6642, 954437177;
- shr.u32 %r6644, %r6643, 31;
- shr.s32 %r6645, %r6643, 1;
- add.s32 %r6646, %r6645, %r6644;
- mul.lo.s32 %r6647, %r6646, 9;
- sub.s32 %r6648, %r6642, %r6647;
- mad.lo.s32 %r6649, %r6646, %r2605, %r1929;
- mad.lo.s32 %r6650, %r6648, %r2606, %r6649;
- mul.wide.s32 %rd2113, %r6650, 4;
- add.s64 %rd2114, %rd2, %rd2113;
- ld.global.f32 %f5347, [%rd2114];
-
-$L__BB0_1458:
- add.s32 %r1932, %r1929, %r2603;
- @%p1218 bra $L__BB0_1461;
-
- shl.b32 %r1933, %r12, 5;
- neg.s32 %r6652, %r1933;
- setp.ge.s32 %p1235, %r14, %r6652;
- @%p1235 bra $L__BB0_1461;
-
- add.s32 %r6653, %r1933, %r1;
- mul.hi.s32 %r6654, %r6653, 954437177;
- shr.u32 %r6655, %r6654, 31;
- shr.s32 %r6656, %r6654, 1;
- add.s32 %r6657, %r6656, %r6655;
- mul.lo.s32 %r6658, %r6657, 9;
- sub.s32 %r6659, %r6653, %r6658;
- mad.lo.s32 %r6660, %r6657, %r2605, %r1932;
- mad.lo.s32 %r6661, %r6659, %r2606, %r6660;
- mul.wide.s32 %rd2115, %r6661, 4;
- add.s64 %rd2116, %rd2, %rd2115;
- ld.global.f32 %f5346, [%rd2116];
-
-$L__BB0_1461:
- @%p1218 bra $L__BB0_1464;
-
- shl.b32 %r1934, %r12, 5;
- mov.u32 %r6663, -32;
- sub.s32 %r6664, %r6663, %r1934;
- setp.ge.s32 %p1237, %r14, %r6664;
- @%p1237 bra $L__BB0_1464;
-
- add.s32 %r6665, %r1934, %r1;
- add.s32 %r6666, %r6665, 32;
- mul.hi.s32 %r6667, %r6666, 954437177;
- shr.u32 %r6668, %r6667, 31;
- shr.s32 %r6669, %r6667, 1;
- add.s32 %r6670, %r6669, %r6668;
- mul.lo.s32 %r6671, %r6670, 9;
- sub.s32 %r6672, %r6666, %r6671;
- mad.lo.s32 %r6673, %r6670, %r2605, %r1932;
- mad.lo.s32 %r6674, %r6672, %r2606, %r6673;
- mul.wide.s32 %rd2117, %r6674, 4;
- add.s64 %rd2118, %rd2, %rd2117;
- ld.global.f32 %f5345, [%rd2118];
-
-$L__BB0_1464:
- add.s32 %r1935, %r1932, %r2603;
- @%p1222 bra $L__BB0_1467;
-
- shl.b32 %r1936, %r12, 5;
- neg.s32 %r6676, %r1936;
- setp.ge.s32 %p1239, %r14, %r6676;
- @%p1239 bra $L__BB0_1467;
-
- add.s32 %r6677, %r1936, %r1;
- mul.hi.s32 %r6678, %r6677, 954437177;
- shr.u32 %r6679, %r6678, 31;
- shr.s32 %r6680, %r6678, 1;
- add.s32 %r6681, %r6680, %r6679;
- mul.lo.s32 %r6682, %r6681, 9;
- sub.s32 %r6683, %r6677, %r6682;
- mad.lo.s32 %r6684, %r6681, %r2605, %r1935;
- mad.lo.s32 %r6685, %r6683, %r2606, %r6684;
- mul.wide.s32 %rd2119, %r6685, 4;
- add.s64 %rd2120, %rd2, %rd2119;
- ld.global.f32 %f5344, [%rd2120];
-
-$L__BB0_1467:
- @%p1222 bra $L__BB0_1470;
-
- shl.b32 %r1937, %r12, 5;
- mov.u32 %r6687, -32;
- sub.s32 %r6688, %r6687, %r1937;
- setp.ge.s32 %p1241, %r14, %r6688;
- @%p1241 bra $L__BB0_1470;
-
- add.s32 %r6689, %r1937, %r1;
- add.s32 %r6690, %r6689, 32;
- mul.hi.s32 %r6691, %r6690, 954437177;
- shr.u32 %r6692, %r6691, 31;
- shr.s32 %r6693, %r6691, 1;
- add.s32 %r6694, %r6693, %r6692;
- mul.lo.s32 %r6695, %r6694, 9;
- sub.s32 %r6696, %r6690, %r6695;
- mad.lo.s32 %r6697, %r6694, %r2605, %r1935;
- mad.lo.s32 %r6698, %r6696, %r2606, %r6697;
- mul.wide.s32 %rd2121, %r6698, 4;
- add.s64 %rd2122, %rd2, %rd2121;
- ld.global.f32 %f5343, [%rd2122];
-
-$L__BB0_1470:
- mad.lo.s32 %r6700, %r2602, 5, %r1927;
- add.s32 %r1938, %r6700, %r1928;
- @%p1226 bra $L__BB0_1473;
-
- shl.b32 %r1939, %r12, 5;
- neg.s32 %r6701, %r1939;
- setp.ge.s32 %p1243, %r14, %r6701;
- @%p1243 bra $L__BB0_1473;
-
- add.s32 %r6702, %r1939, %r1;
- mul.hi.s32 %r6703, %r6702, 954437177;
- shr.u32 %r6704, %r6703, 31;
- shr.s32 %r6705, %r6703, 1;
- add.s32 %r6706, %r6705, %r6704;
- mul.lo.s32 %r6707, %r6706, 9;
- sub.s32 %r6708, %r6702, %r6707;
- mad.lo.s32 %r6709, %r6706, %r2605, %r1938;
- mad.lo.s32 %r6710, %r6708, %r2606, %r6709;
- mul.wide.s32 %rd2123, %r6710, 4;
- add.s64 %rd2124, %rd2, %rd2123;
- ld.global.f32 %f5342, [%rd2124];
-
-$L__BB0_1473:
- @%p1226 bra $L__BB0_1476;
-
- shl.b32 %r1940, %r12, 5;
- mov.u32 %r6712, -32;
- sub.s32 %r6713, %r6712, %r1940;
- setp.ge.s32 %p1245, %r14, %r6713;
- @%p1245 bra $L__BB0_1476;
-
- add.s32 %r6714, %r1940, %r1;
- add.s32 %r6715, %r6714, 32;
- mul.hi.s32 %r6716, %r6715, 954437177;
- shr.u32 %r6717, %r6716, 31;
- shr.s32 %r6718, %r6716, 1;
- add.s32 %r6719, %r6718, %r6717;
- mul.lo.s32 %r6720, %r6719, 9;
- sub.s32 %r6721, %r6715, %r6720;
- mad.lo.s32 %r6722, %r6719, %r2605, %r1938;
- mad.lo.s32 %r6723, %r6721, %r2606, %r6722;
- mul.wide.s32 %rd2125, %r6723, 4;
- add.s64 %rd2126, %rd2, %rd2125;
- ld.global.f32 %f5341, [%rd2126];
-
-$L__BB0_1476:
- @%p1214 bra $L__BB0_1505;
-
- shl.b32 %r6725, %r12, 5;
- neg.s32 %r1941, %r6725;
- setp.ge.s32 %p1247, %r14, %r1941;
- @%p1247 bra $L__BB0_1490;
-
- mul.f32 %f4324, %f5348, 0f3F22F983;
- cvt.rni.s32.f32 %r8613, %f4324;
- cvt.rn.f32.s32 %f4325, %r8613;
- mov.f32 %f4326, 0fBFC90FDA;
- fma.rn.f32 %f4327, %f4325, %f4326, %f5348;
- mov.f32 %f4328, 0fB3A22168;
- fma.rn.f32 %f4329, %f4325, %f4328, %f4327;
- mov.f32 %f4330, 0fA7C234C5;
- fma.rn.f32 %f5751, %f4325, %f4330, %f4329;
- abs.f32 %f1659, %f5348;
- setp.ltu.f32 %p1248, %f1659, 0f47CE4780;
- @%p1248 bra $L__BB0_1486;
-
- setp.eq.f32 %p1249, %f1659, 0f7F800000;
- @%p1249 bra $L__BB0_1485;
- bra.uni $L__BB0_1480;
-
-$L__BB0_1485:
- mov.f32 %f4333, 0f00000000;
- mul.rn.f32 %f5751, %f5348, %f4333;
- mov.u32 %r8613, 0;
- bra.uni $L__BB0_1486;
-
-$L__BB0_1710:
- mov.b32 %r2246, %f1933;
- shr.u32 %r7309, %r2246, 23;
- and.b32 %r7310, %r7309, 255;
- add.s32 %r2247, %r7310, -128;
- shl.b32 %r7311, %r2246, 8;
- or.b32 %r2248, %r7311, -2147483648;
- shr.u32 %r2249, %r2247, 5;
- mov.u64 %rd2750, 0;
- mov.u32 %r8674, 0;
- mov.u64 %rd2748, __cudart_i2opi_f;
- mov.u64 %rd2749, %rd1;
-
-$L__BB0_1711:
- .pragma "nounroll";
- ld.global.nc.u32 %r7312, [%rd2748];
- mad.wide.u32 %rd2315, %r7312, %r2248, %rd2750;
- shr.u64 %rd2750, %rd2315, 32;
- st.local.u32 [%rd2749], %rd2315;
- add.s64 %rd2749, %rd2749, 4;
- add.s64 %rd2748, %rd2748, 4;
- add.s32 %r8674, %r8674, 1;
- setp.ne.s32 %p1449, %r8674, 6;
- @%p1449 bra $L__BB0_1711;
-
- st.local.u32 [%rd5], %rd2750;
- mov.u32 %r7313, 4;
- sub.s32 %r2252, %r7313, %r2249;
- mov.u32 %r7314, 6;
- sub.s32 %r7315, %r7314, %r2249;
- mul.wide.s32 %rd2316, %r7315, 4;
- add.s64 %rd2317, %rd1, %rd2316;
- ld.local.u32 %r8675, [%rd2317];
- ld.local.u32 %r8676, [%rd2317+-4];
- and.b32 %r2255, %r2247, 31;
- setp.eq.s32 %p1450, %r2255, 0;
- @%p1450 bra $L__BB0_1714;
-
- mov.u32 %r7316, 32;
- sub.s32 %r7317, %r7316, %r2255;
- shr.u32 %r7318, %r8676, %r7317;
- shl.b32 %r7319, %r8675, %r2255;
- add.s32 %r8675, %r7318, %r7319;
- mul.wide.s32 %rd2318, %r2252, 4;
- add.s64 %rd2319, %rd1, %rd2318;
- ld.local.u32 %r7320, [%rd2319];
- shr.u32 %r7321, %r7320, %r7317;
- shl.b32 %r7322, %r8676, %r2255;
- add.s32 %r8676, %r7321, %r7322;
-
-$L__BB0_1714:
- and.b32 %r7323, %r2246, -2147483648;
- shr.u32 %r7324, %r8676, 30;
- shl.b32 %r7325, %r8675, 2;
- or.b32 %r7326, %r7324, %r7325;
- shr.u32 %r7327, %r7326, 31;
- shr.u32 %r7328, %r8675, 30;
- add.s32 %r7329, %r7327, %r7328;
- neg.s32 %r7330, %r7329;
- setp.eq.s32 %p1451, %r7323, 0;
- selp.b32 %r8677, %r7329, %r7330, %p1451;
- setp.ne.s32 %p1452, %r7327, 0;
- xor.b32 %r7331, %r7323, -2147483648;
- selp.b32 %r7332, %r7331, %r7323, %p1452;
- selp.b32 %r7333, -1, 0, %p1452;
- xor.b32 %r7334, %r7326, %r7333;
- shl.b32 %r7335, %r8676, 2;
- xor.b32 %r7336, %r7335, %r7333;
- cvt.u64.u32 %rd2320, %r7334;
- cvt.u64.u32 %rd2321, %r7336;
- bfi.b64 %rd2322, %rd2320, %rd2321, 32, 32;
- cvt.rn.f64.s64 %fd225, %rd2322;
- mul.f64 %fd226, %fd225, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4682, %fd226;
- setp.eq.s32 %p1453, %r7332, 0;
- neg.f32 %f4683, %f4682;
- selp.f32 %f5850, %f4682, %f4683, %p1453;
-
-$L__BB0_1716:
- and.b32 %r2262, %r8677, 1;
- setp.eq.s32 %p1454, %r2262, 0;
- selp.f32 %f1946, %f5850, 0f3F800000, %p1454;
- mul.rn.f32 %f1947, %f5850, %f5850;
- mov.f32 %f5851, 0fB94D4153;
- @%p1454 bra $L__BB0_1718;
-
- mov.f32 %f4686, 0fBAB607ED;
- mov.f32 %f4687, 0f37CBAC00;
- fma.rn.f32 %f5851, %f4687, %f1947, %f4686;
-
-$L__BB0_1718:
- selp.f32 %f4688, 0f3C0885E4, 0f3D2AAABB, %p1454;
- fma.rn.f32 %f4689, %f5851, %f1947, %f4688;
- selp.f32 %f4690, 0fBE2AAAA8, 0fBEFFFFFF, %p1454;
- fma.rn.f32 %f4691, %f4689, %f1947, %f4690;
- mov.f32 %f4692, 0f00000000;
- fma.rn.f32 %f4693, %f1947, %f1946, %f4692;
- fma.rn.f32 %f5852, %f4691, %f4693, %f1946;
- and.b32 %r7338, %r8677, 2;
- setp.eq.s32 %p1456, %r7338, 0;
- @%p1456 bra $L__BB0_1720;
-
- mov.f32 %f4695, 0fBF800000;
- fma.rn.f32 %f5852, %f5852, %f4695, %f4692;
-
-$L__BB0_1720:
- mul.f32 %f4696, %f1925, 0f3F22F983;
- cvt.rni.s32.f32 %r8681, %f4696;
- cvt.rn.f32.s32 %f4697, %r8681;
- mov.f32 %f4698, 0fBFC90FDA;
- fma.rn.f32 %f4699, %f4697, %f4698, %f1925;
- mov.f32 %f4700, 0fB3A22168;
- fma.rn.f32 %f4701, %f4697, %f4700, %f4699;
- mov.f32 %f4702, 0fA7C234C5;
- fma.rn.f32 %f5853, %f4697, %f4702, %f4701;
- abs.f32 %f1954, %f1925;
- setp.ltu.f32 %p1457, %f1954, 0f47CE4780;
- @%p1457 bra $L__BB0_1728;
-
- setp.eq.f32 %p1458, %f1954, 0f7F800000;
- @%p1458 bra $L__BB0_1727;
- bra.uni $L__BB0_1722;
-
-$L__BB0_1727:
- mov.f32 %f4705, 0f00000000;
- mul.rn.f32 %f5853, %f1925, %f4705;
- mov.u32 %r8681, 0;
- bra.uni $L__BB0_1728;
-
-$L__BB0_1722:
- mov.b32 %r2264, %f1925;
- shr.u32 %r7340, %r2264, 23;
- and.b32 %r7341, %r7340, 255;
- add.s32 %r2265, %r7341, -128;
- shl.b32 %r7342, %r2264, 8;
- or.b32 %r2266, %r7342, -2147483648;
- shr.u32 %r2267, %r2265, 5;
- mov.u64 %rd2753, 0;
- mov.u32 %r8678, 0;
- mov.u64 %rd2751, __cudart_i2opi_f;
- mov.u64 %rd2752, %rd1;
-
-$L__BB0_1723:
- .pragma "nounroll";
- ld.global.nc.u32 %r7343, [%rd2751];
- mad.wide.u32 %rd2325, %r7343, %r2266, %rd2753;
- shr.u64 %rd2753, %rd2325, 32;
- st.local.u32 [%rd2752], %rd2325;
- add.s64 %rd2752, %rd2752, 4;
- add.s64 %rd2751, %rd2751, 4;
- add.s32 %r8678, %r8678, 1;
- setp.ne.s32 %p1459, %r8678, 6;
- @%p1459 bra $L__BB0_1723;
-
- st.local.u32 [%rd5], %rd2753;
- mov.u32 %r7344, 4;
- sub.s32 %r2270, %r7344, %r2267;
- mov.u32 %r7345, 6;
- sub.s32 %r7346, %r7345, %r2267;
- mul.wide.s32 %rd2326, %r7346, 4;
- add.s64 %rd2327, %rd1, %rd2326;
- ld.local.u32 %r8679, [%rd2327];
- ld.local.u32 %r8680, [%rd2327+-4];
- and.b32 %r2273, %r2265, 31;
- setp.eq.s32 %p1460, %r2273, 0;
- @%p1460 bra $L__BB0_1726;
-
- mov.u32 %r7347, 32;
- sub.s32 %r7348, %r7347, %r2273;
- shr.u32 %r7349, %r8680, %r7348;
- shl.b32 %r7350, %r8679, %r2273;
- add.s32 %r8679, %r7349, %r7350;
- mul.wide.s32 %rd2328, %r2270, 4;
- add.s64 %rd2329, %rd1, %rd2328;
- ld.local.u32 %r7351, [%rd2329];
- shr.u32 %r7352, %r7351, %r7348;
- shl.b32 %r7353, %r8680, %r2273;
- add.s32 %r8680, %r7352, %r7353;
-
-$L__BB0_1726:
- and.b32 %r7354, %r2264, -2147483648;
- shr.u32 %r7355, %r8680, 30;
- shl.b32 %r7356, %r8679, 2;
- or.b32 %r7357, %r7355, %r7356;
- shr.u32 %r7358, %r7357, 31;
- shr.u32 %r7359, %r8679, 30;
- add.s32 %r7360, %r7358, %r7359;
- neg.s32 %r7361, %r7360;
- setp.eq.s32 %p1461, %r7354, 0;
- selp.b32 %r8681, %r7360, %r7361, %p1461;
- setp.ne.s32 %p1462, %r7358, 0;
- xor.b32 %r7362, %r7354, -2147483648;
- selp.b32 %r7363, %r7362, %r7354, %p1462;
- selp.b32 %r7364, -1, 0, %p1462;
- xor.b32 %r7365, %r7357, %r7364;
- shl.b32 %r7366, %r8680, 2;
- xor.b32 %r7367, %r7366, %r7364;
- cvt.u64.u32 %rd2330, %r7365;
- cvt.u64.u32 %rd2331, %r7367;
- bfi.b64 %rd2332, %rd2330, %rd2331, 32, 32;
- cvt.rn.f64.s64 %fd227, %rd2332;
- mul.f64 %fd228, %fd227, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4703, %fd228;
- setp.eq.s32 %p1463, %r7363, 0;
- neg.f32 %f4704, %f4703;
- selp.f32 %f5853, %f4703, %f4704, %p1463;
-
-$L__BB0_1728:
- add.s32 %r2280, %r8681, 1;
- and.b32 %r2281, %r2280, 1;
- setp.eq.s32 %p1464, %r2281, 0;
- selp.f32 %f1958, %f5853, 0f3F800000, %p1464;
- mul.rn.f32 %f1959, %f5853, %f5853;
- mov.f32 %f5854, 0fB94D4153;
- @%p1464 bra $L__BB0_1730;
-
- mov.f32 %f4707, 0fBAB607ED;
- mov.f32 %f4708, 0f37CBAC00;
- fma.rn.f32 %f5854, %f4708, %f1959, %f4707;
-
-$L__BB0_1730:
- selp.f32 %f4709, 0f3C0885E4, 0f3D2AAABB, %p1464;
- fma.rn.f32 %f4710, %f5854, %f1959, %f4709;
- selp.f32 %f4711, 0fBE2AAAA8, 0fBEFFFFFF, %p1464;
- fma.rn.f32 %f4712, %f4710, %f1959, %f4711;
- mov.f32 %f4713, 0f00000000;
- fma.rn.f32 %f4714, %f1959, %f1958, %f4713;
- fma.rn.f32 %f5855, %f4712, %f4714, %f1958;
- and.b32 %r7369, %r2280, 2;
- setp.eq.s32 %p1466, %r7369, 0;
- @%p1466 bra $L__BB0_1732;
-
- mov.f32 %f4716, 0fBF800000;
- fma.rn.f32 %f5855, %f5855, %f4716, %f4713;
-
-$L__BB0_1732:
- add.f32 %f5905, %f5852, %f5855;
- mul.f32 %f4717, %f1934, 0f3F22F983;
- cvt.rni.s32.f32 %r8685, %f4717;
- cvt.rn.f32.s32 %f4718, %r8685;
- mov.f32 %f4719, 0fBFC90FDA;
- fma.rn.f32 %f4720, %f4718, %f4719, %f1934;
- mov.f32 %f4721, 0fB3A22168;
- fma.rn.f32 %f4722, %f4718, %f4721, %f4720;
- mov.f32 %f4723, 0fA7C234C5;
- fma.rn.f32 %f5856, %f4718, %f4723, %f4722;
- abs.f32 %f1967, %f1934;
- setp.ltu.f32 %p1467, %f1967, 0f47CE4780;
- @%p1467 bra $L__BB0_1740;
-
- setp.eq.f32 %p1468, %f1967, 0f7F800000;
- @%p1468 bra $L__BB0_1739;
- bra.uni $L__BB0_1734;
-
-$L__BB0_1739:
- mov.f32 %f4726, 0f00000000;
- mul.rn.f32 %f5856, %f1934, %f4726;
- mov.u32 %r8685, 0;
- bra.uni $L__BB0_1740;
-
-$L__BB0_1734:
- mov.b32 %r2283, %f1934;
- shr.u32 %r7371, %r2283, 23;
- and.b32 %r7372, %r7371, 255;
- add.s32 %r2284, %r7372, -128;
- shl.b32 %r7373, %r2283, 8;
- or.b32 %r2285, %r7373, -2147483648;
- shr.u32 %r2286, %r2284, 5;
- mov.u64 %rd2756, 0;
- mov.u32 %r8682, 0;
- mov.u64 %rd2754, __cudart_i2opi_f;
- mov.u64 %rd2755, %rd1;
-
-$L__BB0_1735:
- .pragma "nounroll";
- ld.global.nc.u32 %r7374, [%rd2754];
- mad.wide.u32 %rd2335, %r7374, %r2285, %rd2756;
- shr.u64 %rd2756, %rd2335, 32;
- st.local.u32 [%rd2755], %rd2335;
- add.s64 %rd2755, %rd2755, 4;
- add.s64 %rd2754, %rd2754, 4;
- add.s32 %r8682, %r8682, 1;
- setp.ne.s32 %p1469, %r8682, 6;
- @%p1469 bra $L__BB0_1735;
-
- st.local.u32 [%rd5], %rd2756;
- mov.u32 %r7375, 4;
- sub.s32 %r2289, %r7375, %r2286;
- mov.u32 %r7376, 6;
- sub.s32 %r7377, %r7376, %r2286;
- mul.wide.s32 %rd2336, %r7377, 4;
- add.s64 %rd2337, %rd1, %rd2336;
- ld.local.u32 %r8683, [%rd2337];
- ld.local.u32 %r8684, [%rd2337+-4];
- and.b32 %r2292, %r2284, 31;
- setp.eq.s32 %p1470, %r2292, 0;
- @%p1470 bra $L__BB0_1738;
-
- mov.u32 %r7378, 32;
- sub.s32 %r7379, %r7378, %r2292;
- shr.u32 %r7380, %r8684, %r7379;
- shl.b32 %r7381, %r8683, %r2292;
- add.s32 %r8683, %r7380, %r7381;
- mul.wide.s32 %rd2338, %r2289, 4;
- add.s64 %rd2339, %rd1, %rd2338;
- ld.local.u32 %r7382, [%rd2339];
- shr.u32 %r7383, %r7382, %r7379;
- shl.b32 %r7384, %r8684, %r2292;
- add.s32 %r8684, %r7383, %r7384;
-
-$L__BB0_1738:
- and.b32 %r7385, %r2283, -2147483648;
- shr.u32 %r7386, %r8684, 30;
- shl.b32 %r7387, %r8683, 2;
- or.b32 %r7388, %r7386, %r7387;
- shr.u32 %r7389, %r7388, 31;
- shr.u32 %r7390, %r8683, 30;
- add.s32 %r7391, %r7389, %r7390;
- neg.s32 %r7392, %r7391;
- setp.eq.s32 %p1471, %r7385, 0;
- selp.b32 %r8685, %r7391, %r7392, %p1471;
- setp.ne.s32 %p1472, %r7389, 0;
- xor.b32 %r7393, %r7385, -2147483648;
- selp.b32 %r7394, %r7393, %r7385, %p1472;
- selp.b32 %r7395, -1, 0, %p1472;
- xor.b32 %r7396, %r7388, %r7395;
- shl.b32 %r7397, %r8684, 2;
- xor.b32 %r7398, %r7397, %r7395;
- cvt.u64.u32 %rd2340, %r7396;
- cvt.u64.u32 %rd2341, %r7398;
- bfi.b64 %rd2342, %rd2340, %rd2341, 32, 32;
- cvt.rn.f64.s64 %fd229, %rd2342;
- mul.f64 %fd230, %fd229, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4724, %fd230;
- setp.eq.s32 %p1473, %r7394, 0;
- neg.f32 %f4725, %f4724;
- selp.f32 %f5856, %f4724, %f4725, %p1473;
-
-$L__BB0_1740:
- and.b32 %r2299, %r8685, 1;
- setp.eq.s32 %p1474, %r2299, 0;
- selp.f32 %f1971, %f5856, 0f3F800000, %p1474;
- mul.rn.f32 %f1972, %f5856, %f5856;
- mov.f32 %f5857, 0fB94D4153;
- @%p1474 bra $L__BB0_1742;
-
- mov.f32 %f4728, 0fBAB607ED;
- mov.f32 %f4729, 0f37CBAC00;
- fma.rn.f32 %f5857, %f4729, %f1972, %f4728;
-
-$L__BB0_1742:
- selp.f32 %f4730, 0f3C0885E4, 0f3D2AAABB, %p1474;
- fma.rn.f32 %f4731, %f5857, %f1972, %f4730;
- selp.f32 %f4732, 0fBE2AAAA8, 0fBEFFFFFF, %p1474;
- fma.rn.f32 %f4733, %f4731, %f1972, %f4732;
- mov.f32 %f4734, 0f00000000;
- fma.rn.f32 %f4735, %f1972, %f1971, %f4734;
- fma.rn.f32 %f5858, %f4733, %f4735, %f1971;
- and.b32 %r7400, %r8685, 2;
- setp.eq.s32 %p1476, %r7400, 0;
- @%p1476 bra $L__BB0_1744;
-
- mov.f32 %f4737, 0fBF800000;
- fma.rn.f32 %f5858, %f5858, %f4737, %f4734;
-
-$L__BB0_1744:
- mul.f32 %f4738, %f1926, 0f3F22F983;
- cvt.rni.s32.f32 %r8689, %f4738;
- cvt.rn.f32.s32 %f4739, %r8689;
- mov.f32 %f4740, 0fBFC90FDA;
- fma.rn.f32 %f4741, %f4739, %f4740, %f1926;
- mov.f32 %f4742, 0fB3A22168;
- fma.rn.f32 %f4743, %f4739, %f4742, %f4741;
- mov.f32 %f4744, 0fA7C234C5;
- fma.rn.f32 %f5859, %f4739, %f4744, %f4743;
- abs.f32 %f1979, %f1926;
- setp.ltu.f32 %p1477, %f1979, 0f47CE4780;
- @%p1477 bra $L__BB0_1752;
-
- setp.eq.f32 %p1478, %f1979, 0f7F800000;
- @%p1478 bra $L__BB0_1751;
- bra.uni $L__BB0_1746;
-
-$L__BB0_1751:
- mov.f32 %f4747, 0f00000000;
- mul.rn.f32 %f5859, %f1926, %f4747;
- mov.u32 %r8689, 0;
- bra.uni $L__BB0_1752;
-
-$L__BB0_1746:
- mov.b32 %r2301, %f1926;
- shr.u32 %r7402, %r2301, 23;
- and.b32 %r7403, %r7402, 255;
- add.s32 %r2302, %r7403, -128;
- shl.b32 %r7404, %r2301, 8;
- or.b32 %r2303, %r7404, -2147483648;
- shr.u32 %r2304, %r2302, 5;
- mov.u64 %rd2759, 0;
- mov.u32 %r8686, 0;
- mov.u64 %rd2757, __cudart_i2opi_f;
- mov.u64 %rd2758, %rd1;
-
-$L__BB0_1747:
- .pragma "nounroll";
- ld.global.nc.u32 %r7405, [%rd2757];
- mad.wide.u32 %rd2345, %r7405, %r2303, %rd2759;
- shr.u64 %rd2759, %rd2345, 32;
- st.local.u32 [%rd2758], %rd2345;
- add.s64 %rd2758, %rd2758, 4;
- add.s64 %rd2757, %rd2757, 4;
- add.s32 %r8686, %r8686, 1;
- setp.ne.s32 %p1479, %r8686, 6;
- @%p1479 bra $L__BB0_1747;
-
- st.local.u32 [%rd5], %rd2759;
- mov.u32 %r7406, 4;
- sub.s32 %r2307, %r7406, %r2304;
- mov.u32 %r7407, 6;
- sub.s32 %r7408, %r7407, %r2304;
- mul.wide.s32 %rd2346, %r7408, 4;
- add.s64 %rd2347, %rd1, %rd2346;
- ld.local.u32 %r8687, [%rd2347];
- ld.local.u32 %r8688, [%rd2347+-4];
- and.b32 %r2310, %r2302, 31;
- setp.eq.s32 %p1480, %r2310, 0;
- @%p1480 bra $L__BB0_1750;
-
- mov.u32 %r7409, 32;
- sub.s32 %r7410, %r7409, %r2310;
- shr.u32 %r7411, %r8688, %r7410;
- shl.b32 %r7412, %r8687, %r2310;
- add.s32 %r8687, %r7411, %r7412;
- mul.wide.s32 %rd2348, %r2307, 4;
- add.s64 %rd2349, %rd1, %rd2348;
- ld.local.u32 %r7413, [%rd2349];
- shr.u32 %r7414, %r7413, %r7410;
- shl.b32 %r7415, %r8688, %r2310;
- add.s32 %r8688, %r7414, %r7415;
-
-$L__BB0_1750:
- and.b32 %r7416, %r2301, -2147483648;
- shr.u32 %r7417, %r8688, 30;
- shl.b32 %r7418, %r8687, 2;
- or.b32 %r7419, %r7417, %r7418;
- shr.u32 %r7420, %r7419, 31;
- shr.u32 %r7421, %r8687, 30;
- add.s32 %r7422, %r7420, %r7421;
- neg.s32 %r7423, %r7422;
- setp.eq.s32 %p1481, %r7416, 0;
- selp.b32 %r8689, %r7422, %r7423, %p1481;
- setp.ne.s32 %p1482, %r7420, 0;
- xor.b32 %r7424, %r7416, -2147483648;
- selp.b32 %r7425, %r7424, %r7416, %p1482;
- selp.b32 %r7426, -1, 0, %p1482;
- xor.b32 %r7427, %r7419, %r7426;
- shl.b32 %r7428, %r8688, 2;
- xor.b32 %r7429, %r7428, %r7426;
- cvt.u64.u32 %rd2350, %r7427;
- cvt.u64.u32 %rd2351, %r7429;
- bfi.b64 %rd2352, %rd2350, %rd2351, 32, 32;
- cvt.rn.f64.s64 %fd231, %rd2352;
- mul.f64 %fd232, %fd231, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4745, %fd232;
- setp.eq.s32 %p1483, %r7425, 0;
- neg.f32 %f4746, %f4745;
- selp.f32 %f5859, %f4745, %f4746, %p1483;
-
-$L__BB0_1752:
- add.s32 %r2317, %r8689, 1;
- and.b32 %r2318, %r2317, 1;
- setp.eq.s32 %p1484, %r2318, 0;
- selp.f32 %f1983, %f5859, 0f3F800000, %p1484;
- mul.rn.f32 %f1984, %f5859, %f5859;
- mov.f32 %f5860, 0fB94D4153;
- @%p1484 bra $L__BB0_1754;
-
- mov.f32 %f4749, 0fBAB607ED;
- mov.f32 %f4750, 0f37CBAC00;
- fma.rn.f32 %f5860, %f4750, %f1984, %f4749;
-
-$L__BB0_1754:
- selp.f32 %f4751, 0f3C0885E4, 0f3D2AAABB, %p1484;
- fma.rn.f32 %f4752, %f5860, %f1984, %f4751;
- selp.f32 %f4753, 0fBE2AAAA8, 0fBEFFFFFF, %p1484;
- fma.rn.f32 %f4754, %f4752, %f1984, %f4753;
- mov.f32 %f4755, 0f00000000;
- fma.rn.f32 %f4756, %f1984, %f1983, %f4755;
- fma.rn.f32 %f5861, %f4754, %f4756, %f1983;
- and.b32 %r7431, %r2317, 2;
- setp.eq.s32 %p1486, %r7431, 0;
- @%p1486 bra $L__BB0_1756;
-
- mov.f32 %f4758, 0fBF800000;
- fma.rn.f32 %f5861, %f5861, %f4758, %f4755;
-
-$L__BB0_1756:
- add.f32 %f5904, %f5858, %f5861;
- mul.f32 %f4759, %f1935, 0f3F22F983;
- cvt.rni.s32.f32 %r8693, %f4759;
- cvt.rn.f32.s32 %f4760, %r8693;
- mov.f32 %f4761, 0fBFC90FDA;
- fma.rn.f32 %f4762, %f4760, %f4761, %f1935;
- mov.f32 %f4763, 0fB3A22168;
- fma.rn.f32 %f4764, %f4760, %f4763, %f4762;
- mov.f32 %f4765, 0fA7C234C5;
- fma.rn.f32 %f5862, %f4760, %f4765, %f4764;
- abs.f32 %f1992, %f1935;
- setp.ltu.f32 %p1487, %f1992, 0f47CE4780;
- @%p1487 bra $L__BB0_1764;
-
- setp.eq.f32 %p1488, %f1992, 0f7F800000;
- @%p1488 bra $L__BB0_1763;
- bra.uni $L__BB0_1758;
-
-$L__BB0_1763:
- mov.f32 %f4768, 0f00000000;
- mul.rn.f32 %f5862, %f1935, %f4768;
- mov.u32 %r8693, 0;
- bra.uni $L__BB0_1764;
-
-$L__BB0_1758:
- mov.b32 %r2320, %f1935;
- shr.u32 %r7433, %r2320, 23;
- and.b32 %r7434, %r7433, 255;
- add.s32 %r2321, %r7434, -128;
- shl.b32 %r7435, %r2320, 8;
- or.b32 %r2322, %r7435, -2147483648;
- shr.u32 %r2323, %r2321, 5;
- mov.u64 %rd2762, 0;
- mov.u32 %r8690, 0;
- mov.u64 %rd2760, __cudart_i2opi_f;
- mov.u64 %rd2761, %rd1;
-
-$L__BB0_1759:
- .pragma "nounroll";
- ld.global.nc.u32 %r7436, [%rd2760];
- mad.wide.u32 %rd2355, %r7436, %r2322, %rd2762;
- shr.u64 %rd2762, %rd2355, 32;
- st.local.u32 [%rd2761], %rd2355;
- add.s64 %rd2761, %rd2761, 4;
- add.s64 %rd2760, %rd2760, 4;
- add.s32 %r8690, %r8690, 1;
- setp.ne.s32 %p1489, %r8690, 6;
- @%p1489 bra $L__BB0_1759;
-
- st.local.u32 [%rd5], %rd2762;
- mov.u32 %r7437, 4;
- sub.s32 %r2326, %r7437, %r2323;
- mov.u32 %r7438, 6;
- sub.s32 %r7439, %r7438, %r2323;
- mul.wide.s32 %rd2356, %r7439, 4;
- add.s64 %rd2357, %rd1, %rd2356;
- ld.local.u32 %r8691, [%rd2357];
- ld.local.u32 %r8692, [%rd2357+-4];
- and.b32 %r2329, %r2321, 31;
- setp.eq.s32 %p1490, %r2329, 0;
- @%p1490 bra $L__BB0_1762;
-
- mov.u32 %r7440, 32;
- sub.s32 %r7441, %r7440, %r2329;
- shr.u32 %r7442, %r8692, %r7441;
- shl.b32 %r7443, %r8691, %r2329;
- add.s32 %r8691, %r7442, %r7443;
- mul.wide.s32 %rd2358, %r2326, 4;
- add.s64 %rd2359, %rd1, %rd2358;
- ld.local.u32 %r7444, [%rd2359];
- shr.u32 %r7445, %r7444, %r7441;
- shl.b32 %r7446, %r8692, %r2329;
- add.s32 %r8692, %r7445, %r7446;
-
-$L__BB0_1762:
- and.b32 %r7447, %r2320, -2147483648;
- shr.u32 %r7448, %r8692, 30;
- shl.b32 %r7449, %r8691, 2;
- or.b32 %r7450, %r7448, %r7449;
- shr.u32 %r7451, %r7450, 31;
- shr.u32 %r7452, %r8691, 30;
- add.s32 %r7453, %r7451, %r7452;
- neg.s32 %r7454, %r7453;
- setp.eq.s32 %p1491, %r7447, 0;
- selp.b32 %r8693, %r7453, %r7454, %p1491;
- setp.ne.s32 %p1492, %r7451, 0;
- xor.b32 %r7455, %r7447, -2147483648;
- selp.b32 %r7456, %r7455, %r7447, %p1492;
- selp.b32 %r7457, -1, 0, %p1492;
- xor.b32 %r7458, %r7450, %r7457;
- shl.b32 %r7459, %r8692, 2;
- xor.b32 %r7460, %r7459, %r7457;
- cvt.u64.u32 %rd2360, %r7458;
- cvt.u64.u32 %rd2361, %r7460;
- bfi.b64 %rd2362, %rd2360, %rd2361, 32, 32;
- cvt.rn.f64.s64 %fd233, %rd2362;
- mul.f64 %fd234, %fd233, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4766, %fd234;
- setp.eq.s32 %p1493, %r7456, 0;
- neg.f32 %f4767, %f4766;
- selp.f32 %f5862, %f4766, %f4767, %p1493;
-
-$L__BB0_1764:
- and.b32 %r2336, %r8693, 1;
- setp.eq.s32 %p1494, %r2336, 0;
- selp.f32 %f1996, %f5862, 0f3F800000, %p1494;
- mul.rn.f32 %f1997, %f5862, %f5862;
- mov.f32 %f5863, 0fB94D4153;
- @%p1494 bra $L__BB0_1766;
-
- mov.f32 %f4770, 0fBAB607ED;
- mov.f32 %f4771, 0f37CBAC00;
- fma.rn.f32 %f5863, %f4771, %f1997, %f4770;
-
-$L__BB0_1766:
- selp.f32 %f4772, 0f3C0885E4, 0f3D2AAABB, %p1494;
- fma.rn.f32 %f4773, %f5863, %f1997, %f4772;
- selp.f32 %f4774, 0fBE2AAAA8, 0fBEFFFFFF, %p1494;
- fma.rn.f32 %f4775, %f4773, %f1997, %f4774;
- mov.f32 %f4776, 0f00000000;
- fma.rn.f32 %f4777, %f1997, %f1996, %f4776;
- fma.rn.f32 %f5864, %f4775, %f4777, %f1996;
- and.b32 %r7462, %r8693, 2;
- setp.eq.s32 %p1496, %r7462, 0;
- @%p1496 bra $L__BB0_1768;
-
- mov.f32 %f4779, 0fBF800000;
- fma.rn.f32 %f5864, %f5864, %f4779, %f4776;
-
-$L__BB0_1768:
- mul.f32 %f4780, %f1927, 0f3F22F983;
- cvt.rni.s32.f32 %r8697, %f4780;
- cvt.rn.f32.s32 %f4781, %r8697;
- mov.f32 %f4782, 0fBFC90FDA;
- fma.rn.f32 %f4783, %f4781, %f4782, %f1927;
- mov.f32 %f4784, 0fB3A22168;
- fma.rn.f32 %f4785, %f4781, %f4784, %f4783;
- mov.f32 %f4786, 0fA7C234C5;
- fma.rn.f32 %f5865, %f4781, %f4786, %f4785;
- abs.f32 %f2004, %f1927;
- setp.ltu.f32 %p1497, %f2004, 0f47CE4780;
- @%p1497 bra $L__BB0_1776;
-
- setp.eq.f32 %p1498, %f2004, 0f7F800000;
- @%p1498 bra $L__BB0_1775;
- bra.uni $L__BB0_1770;
-
-$L__BB0_1775:
- mov.f32 %f4789, 0f00000000;
- mul.rn.f32 %f5865, %f1927, %f4789;
- mov.u32 %r8697, 0;
- bra.uni $L__BB0_1776;
-
-$L__BB0_1770:
- mov.b32 %r2338, %f1927;
- shr.u32 %r7464, %r2338, 23;
- and.b32 %r7465, %r7464, 255;
- add.s32 %r2339, %r7465, -128;
- shl.b32 %r7466, %r2338, 8;
- or.b32 %r2340, %r7466, -2147483648;
- shr.u32 %r2341, %r2339, 5;
- mov.u64 %rd2765, 0;
- mov.u32 %r8694, 0;
- mov.u64 %rd2763, __cudart_i2opi_f;
- mov.u64 %rd2764, %rd1;
-
-$L__BB0_1771:
- .pragma "nounroll";
- ld.global.nc.u32 %r7467, [%rd2763];
- mad.wide.u32 %rd2365, %r7467, %r2340, %rd2765;
- shr.u64 %rd2765, %rd2365, 32;
- st.local.u32 [%rd2764], %rd2365;
- add.s64 %rd2764, %rd2764, 4;
- add.s64 %rd2763, %rd2763, 4;
- add.s32 %r8694, %r8694, 1;
- setp.ne.s32 %p1499, %r8694, 6;
- @%p1499 bra $L__BB0_1771;
-
- st.local.u32 [%rd5], %rd2765;
- mov.u32 %r7468, 4;
- sub.s32 %r2344, %r7468, %r2341;
- mov.u32 %r7469, 6;
- sub.s32 %r7470, %r7469, %r2341;
- mul.wide.s32 %rd2366, %r7470, 4;
- add.s64 %rd2367, %rd1, %rd2366;
- ld.local.u32 %r8695, [%rd2367];
- ld.local.u32 %r8696, [%rd2367+-4];
- and.b32 %r2347, %r2339, 31;
- setp.eq.s32 %p1500, %r2347, 0;
- @%p1500 bra $L__BB0_1774;
-
- mov.u32 %r7471, 32;
- sub.s32 %r7472, %r7471, %r2347;
- shr.u32 %r7473, %r8696, %r7472;
- shl.b32 %r7474, %r8695, %r2347;
- add.s32 %r8695, %r7473, %r7474;
- mul.wide.s32 %rd2368, %r2344, 4;
- add.s64 %rd2369, %rd1, %rd2368;
- ld.local.u32 %r7475, [%rd2369];
- shr.u32 %r7476, %r7475, %r7472;
- shl.b32 %r7477, %r8696, %r2347;
- add.s32 %r8696, %r7476, %r7477;
-
-$L__BB0_1774:
- and.b32 %r7478, %r2338, -2147483648;
- shr.u32 %r7479, %r8696, 30;
- shl.b32 %r7480, %r8695, 2;
- or.b32 %r7481, %r7479, %r7480;
- shr.u32 %r7482, %r7481, 31;
- shr.u32 %r7483, %r8695, 30;
- add.s32 %r7484, %r7482, %r7483;
- neg.s32 %r7485, %r7484;
- setp.eq.s32 %p1501, %r7478, 0;
- selp.b32 %r8697, %r7484, %r7485, %p1501;
- setp.ne.s32 %p1502, %r7482, 0;
- xor.b32 %r7486, %r7478, -2147483648;
- selp.b32 %r7487, %r7486, %r7478, %p1502;
- selp.b32 %r7488, -1, 0, %p1502;
- xor.b32 %r7489, %r7481, %r7488;
- shl.b32 %r7490, %r8696, 2;
- xor.b32 %r7491, %r7490, %r7488;
- cvt.u64.u32 %rd2370, %r7489;
- cvt.u64.u32 %rd2371, %r7491;
- bfi.b64 %rd2372, %rd2370, %rd2371, 32, 32;
- cvt.rn.f64.s64 %fd235, %rd2372;
- mul.f64 %fd236, %fd235, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4787, %fd236;
- setp.eq.s32 %p1503, %r7487, 0;
- neg.f32 %f4788, %f4787;
- selp.f32 %f5865, %f4787, %f4788, %p1503;
-
-$L__BB0_1776:
- add.s32 %r2354, %r8697, 1;
- and.b32 %r2355, %r2354, 1;
- setp.eq.s32 %p1504, %r2355, 0;
- selp.f32 %f2008, %f5865, 0f3F800000, %p1504;
- mul.rn.f32 %f2009, %f5865, %f5865;
- mov.f32 %f5866, 0fB94D4153;
- @%p1504 bra $L__BB0_1778;
-
- mov.f32 %f4791, 0fBAB607ED;
- mov.f32 %f4792, 0f37CBAC00;
- fma.rn.f32 %f5866, %f4792, %f2009, %f4791;
-
-$L__BB0_1778:
- selp.f32 %f4793, 0f3C0885E4, 0f3D2AAABB, %p1504;
- fma.rn.f32 %f4794, %f5866, %f2009, %f4793;
- selp.f32 %f4795, 0fBE2AAAA8, 0fBEFFFFFF, %p1504;
- fma.rn.f32 %f4796, %f4794, %f2009, %f4795;
- mov.f32 %f4797, 0f00000000;
- fma.rn.f32 %f4798, %f2009, %f2008, %f4797;
- fma.rn.f32 %f5867, %f4796, %f4798, %f2008;
- and.b32 %r7493, %r2354, 2;
- setp.eq.s32 %p1506, %r7493, 0;
- @%p1506 bra $L__BB0_1780;
-
- mov.f32 %f4800, 0fBF800000;
- fma.rn.f32 %f5867, %f5867, %f4800, %f4797;
-
-$L__BB0_1780:
- add.f32 %f5903, %f5864, %f5867;
- mul.f32 %f4801, %f1936, 0f3F22F983;
- cvt.rni.s32.f32 %r8701, %f4801;
- cvt.rn.f32.s32 %f4802, %r8701;
- mov.f32 %f4803, 0fBFC90FDA;
- fma.rn.f32 %f4804, %f4802, %f4803, %f1936;
- mov.f32 %f4805, 0fB3A22168;
- fma.rn.f32 %f4806, %f4802, %f4805, %f4804;
- mov.f32 %f4807, 0fA7C234C5;
- fma.rn.f32 %f5868, %f4802, %f4807, %f4806;
- abs.f32 %f2017, %f1936;
- setp.ltu.f32 %p1507, %f2017, 0f47CE4780;
- @%p1507 bra $L__BB0_1788;
-
- setp.eq.f32 %p1508, %f2017, 0f7F800000;
- @%p1508 bra $L__BB0_1787;
- bra.uni $L__BB0_1782;
-
-$L__BB0_1787:
- mov.f32 %f4810, 0f00000000;
- mul.rn.f32 %f5868, %f1936, %f4810;
- mov.u32 %r8701, 0;
- bra.uni $L__BB0_1788;
-
-$L__BB0_1782:
- mov.b32 %r2357, %f1936;
- shr.u32 %r7495, %r2357, 23;
- and.b32 %r7496, %r7495, 255;
- add.s32 %r2358, %r7496, -128;
- shl.b32 %r7497, %r2357, 8;
- or.b32 %r2359, %r7497, -2147483648;
- shr.u32 %r2360, %r2358, 5;
- mov.u64 %rd2768, 0;
- mov.u32 %r8698, 0;
- mov.u64 %rd2766, __cudart_i2opi_f;
- mov.u64 %rd2767, %rd1;
-
-$L__BB0_1783:
- .pragma "nounroll";
- ld.global.nc.u32 %r7498, [%rd2766];
- mad.wide.u32 %rd2375, %r7498, %r2359, %rd2768;
- shr.u64 %rd2768, %rd2375, 32;
- st.local.u32 [%rd2767], %rd2375;
- add.s64 %rd2767, %rd2767, 4;
- add.s64 %rd2766, %rd2766, 4;
- add.s32 %r8698, %r8698, 1;
- setp.ne.s32 %p1509, %r8698, 6;
- @%p1509 bra $L__BB0_1783;
-
- st.local.u32 [%rd5], %rd2768;
- mov.u32 %r7499, 4;
- sub.s32 %r2363, %r7499, %r2360;
- mov.u32 %r7500, 6;
- sub.s32 %r7501, %r7500, %r2360;
- mul.wide.s32 %rd2376, %r7501, 4;
- add.s64 %rd2377, %rd1, %rd2376;
- ld.local.u32 %r8699, [%rd2377];
- ld.local.u32 %r8700, [%rd2377+-4];
- and.b32 %r2366, %r2358, 31;
- setp.eq.s32 %p1510, %r2366, 0;
- @%p1510 bra $L__BB0_1786;
-
- mov.u32 %r7502, 32;
- sub.s32 %r7503, %r7502, %r2366;
- shr.u32 %r7504, %r8700, %r7503;
- shl.b32 %r7505, %r8699, %r2366;
- add.s32 %r8699, %r7504, %r7505;
- mul.wide.s32 %rd2378, %r2363, 4;
- add.s64 %rd2379, %rd1, %rd2378;
- ld.local.u32 %r7506, [%rd2379];
- shr.u32 %r7507, %r7506, %r7503;
- shl.b32 %r7508, %r8700, %r2366;
- add.s32 %r8700, %r7507, %r7508;
-
-$L__BB0_1786:
- and.b32 %r7509, %r2357, -2147483648;
- shr.u32 %r7510, %r8700, 30;
- shl.b32 %r7511, %r8699, 2;
- or.b32 %r7512, %r7510, %r7511;
- shr.u32 %r7513, %r7512, 31;
- shr.u32 %r7514, %r8699, 30;
- add.s32 %r7515, %r7513, %r7514;
- neg.s32 %r7516, %r7515;
- setp.eq.s32 %p1511, %r7509, 0;
- selp.b32 %r8701, %r7515, %r7516, %p1511;
- setp.ne.s32 %p1512, %r7513, 0;
- xor.b32 %r7517, %r7509, -2147483648;
- selp.b32 %r7518, %r7517, %r7509, %p1512;
- selp.b32 %r7519, -1, 0, %p1512;
- xor.b32 %r7520, %r7512, %r7519;
- shl.b32 %r7521, %r8700, 2;
- xor.b32 %r7522, %r7521, %r7519;
- cvt.u64.u32 %rd2380, %r7520;
- cvt.u64.u32 %rd2381, %r7522;
- bfi.b64 %rd2382, %rd2380, %rd2381, 32, 32;
- cvt.rn.f64.s64 %fd237, %rd2382;
- mul.f64 %fd238, %fd237, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4808, %fd238;
- setp.eq.s32 %p1513, %r7518, 0;
- neg.f32 %f4809, %f4808;
- selp.f32 %f5868, %f4808, %f4809, %p1513;
-
-$L__BB0_1788:
- and.b32 %r2373, %r8701, 1;
- setp.eq.s32 %p1514, %r2373, 0;
- selp.f32 %f2021, %f5868, 0f3F800000, %p1514;
- mul.rn.f32 %f2022, %f5868, %f5868;
- mov.f32 %f5869, 0fB94D4153;
- @%p1514 bra $L__BB0_1790;
-
- mov.f32 %f4812, 0fBAB607ED;
- mov.f32 %f4813, 0f37CBAC00;
- fma.rn.f32 %f5869, %f4813, %f2022, %f4812;
-
-$L__BB0_1790:
- selp.f32 %f4814, 0f3C0885E4, 0f3D2AAABB, %p1514;
- fma.rn.f32 %f4815, %f5869, %f2022, %f4814;
- selp.f32 %f4816, 0fBE2AAAA8, 0fBEFFFFFF, %p1514;
- fma.rn.f32 %f4817, %f4815, %f2022, %f4816;
- mov.f32 %f4818, 0f00000000;
- fma.rn.f32 %f4819, %f2022, %f2021, %f4818;
- fma.rn.f32 %f5870, %f4817, %f4819, %f2021;
- and.b32 %r7524, %r8701, 2;
- setp.eq.s32 %p1516, %r7524, 0;
- @%p1516 bra $L__BB0_1792;
-
- mov.f32 %f4821, 0fBF800000;
- fma.rn.f32 %f5870, %f5870, %f4821, %f4818;
-
-$L__BB0_1792:
- mul.f32 %f4822, %f1928, 0f3F22F983;
- cvt.rni.s32.f32 %r8705, %f4822;
- cvt.rn.f32.s32 %f4823, %r8705;
- mov.f32 %f4824, 0fBFC90FDA;
- fma.rn.f32 %f4825, %f4823, %f4824, %f1928;
- mov.f32 %f4826, 0fB3A22168;
- fma.rn.f32 %f4827, %f4823, %f4826, %f4825;
- mov.f32 %f4828, 0fA7C234C5;
- fma.rn.f32 %f5871, %f4823, %f4828, %f4827;
- abs.f32 %f2029, %f1928;
- setp.ltu.f32 %p1517, %f2029, 0f47CE4780;
- @%p1517 bra $L__BB0_1800;
-
- setp.eq.f32 %p1518, %f2029, 0f7F800000;
- @%p1518 bra $L__BB0_1799;
- bra.uni $L__BB0_1794;
-
-$L__BB0_1799:
- mov.f32 %f4831, 0f00000000;
- mul.rn.f32 %f5871, %f1928, %f4831;
- mov.u32 %r8705, 0;
- bra.uni $L__BB0_1800;
-
-$L__BB0_1794:
- mov.b32 %r2375, %f1928;
- shr.u32 %r7526, %r2375, 23;
- and.b32 %r7527, %r7526, 255;
- add.s32 %r2376, %r7527, -128;
- shl.b32 %r7528, %r2375, 8;
- or.b32 %r2377, %r7528, -2147483648;
- shr.u32 %r2378, %r2376, 5;
- mov.u64 %rd2771, 0;
- mov.u32 %r8702, 0;
- mov.u64 %rd2769, __cudart_i2opi_f;
- mov.u64 %rd2770, %rd1;
-
-$L__BB0_1795:
- .pragma "nounroll";
- ld.global.nc.u32 %r7529, [%rd2769];
- mad.wide.u32 %rd2385, %r7529, %r2377, %rd2771;
- shr.u64 %rd2771, %rd2385, 32;
- st.local.u32 [%rd2770], %rd2385;
- add.s64 %rd2770, %rd2770, 4;
- add.s64 %rd2769, %rd2769, 4;
- add.s32 %r8702, %r8702, 1;
- setp.ne.s32 %p1519, %r8702, 6;
- @%p1519 bra $L__BB0_1795;
-
- st.local.u32 [%rd5], %rd2771;
- mov.u32 %r7530, 4;
- sub.s32 %r2381, %r7530, %r2378;
- mov.u32 %r7531, 6;
- sub.s32 %r7532, %r7531, %r2378;
- mul.wide.s32 %rd2386, %r7532, 4;
- add.s64 %rd2387, %rd1, %rd2386;
- ld.local.u32 %r8703, [%rd2387];
- ld.local.u32 %r8704, [%rd2387+-4];
- and.b32 %r2384, %r2376, 31;
- setp.eq.s32 %p1520, %r2384, 0;
- @%p1520 bra $L__BB0_1798;
-
- mov.u32 %r7533, 32;
- sub.s32 %r7534, %r7533, %r2384;
- shr.u32 %r7535, %r8704, %r7534;
- shl.b32 %r7536, %r8703, %r2384;
- add.s32 %r8703, %r7535, %r7536;
- mul.wide.s32 %rd2388, %r2381, 4;
- add.s64 %rd2389, %rd1, %rd2388;
- ld.local.u32 %r7537, [%rd2389];
- shr.u32 %r7538, %r7537, %r7534;
- shl.b32 %r7539, %r8704, %r2384;
- add.s32 %r8704, %r7538, %r7539;
-
-$L__BB0_1798:
- and.b32 %r7540, %r2375, -2147483648;
- shr.u32 %r7541, %r8704, 30;
- shl.b32 %r7542, %r8703, 2;
- or.b32 %r7543, %r7541, %r7542;
- shr.u32 %r7544, %r7543, 31;
- shr.u32 %r7545, %r8703, 30;
- add.s32 %r7546, %r7544, %r7545;
- neg.s32 %r7547, %r7546;
- setp.eq.s32 %p1521, %r7540, 0;
- selp.b32 %r8705, %r7546, %r7547, %p1521;
- setp.ne.s32 %p1522, %r7544, 0;
- xor.b32 %r7548, %r7540, -2147483648;
- selp.b32 %r7549, %r7548, %r7540, %p1522;
- selp.b32 %r7550, -1, 0, %p1522;
- xor.b32 %r7551, %r7543, %r7550;
- shl.b32 %r7552, %r8704, 2;
- xor.b32 %r7553, %r7552, %r7550;
- cvt.u64.u32 %rd2390, %r7551;
- cvt.u64.u32 %rd2391, %r7553;
- bfi.b64 %rd2392, %rd2390, %rd2391, 32, 32;
- cvt.rn.f64.s64 %fd239, %rd2392;
- mul.f64 %fd240, %fd239, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4829, %fd240;
- setp.eq.s32 %p1523, %r7549, 0;
- neg.f32 %f4830, %f4829;
- selp.f32 %f5871, %f4829, %f4830, %p1523;
-
-$L__BB0_1800:
- add.s32 %r2391, %r8705, 1;
- and.b32 %r2392, %r2391, 1;
- setp.eq.s32 %p1524, %r2392, 0;
- selp.f32 %f2033, %f5871, 0f3F800000, %p1524;
- mul.rn.f32 %f2034, %f5871, %f5871;
- mov.f32 %f5872, 0fB94D4153;
- @%p1524 bra $L__BB0_1802;
-
- mov.f32 %f4833, 0fBAB607ED;
- mov.f32 %f4834, 0f37CBAC00;
- fma.rn.f32 %f5872, %f4834, %f2034, %f4833;
-
-$L__BB0_1802:
- selp.f32 %f4835, 0f3C0885E4, 0f3D2AAABB, %p1524;
- fma.rn.f32 %f4836, %f5872, %f2034, %f4835;
- selp.f32 %f4837, 0fBE2AAAA8, 0fBEFFFFFF, %p1524;
- fma.rn.f32 %f4838, %f4836, %f2034, %f4837;
- mov.f32 %f4839, 0f00000000;
- fma.rn.f32 %f4840, %f2034, %f2033, %f4839;
- fma.rn.f32 %f5873, %f4838, %f4840, %f2033;
- and.b32 %r7555, %r2391, 2;
- setp.eq.s32 %p1526, %r7555, 0;
- @%p1526 bra $L__BB0_1804;
-
- mov.f32 %f4842, 0fBF800000;
- fma.rn.f32 %f5873, %f5873, %f4842, %f4839;
-
-$L__BB0_1804:
- add.f32 %f5902, %f5870, %f5873;
- mul.f32 %f4843, %f1937, 0f3F22F983;
- cvt.rni.s32.f32 %r8709, %f4843;
- cvt.rn.f32.s32 %f4844, %r8709;
- mov.f32 %f4845, 0fBFC90FDA;
- fma.rn.f32 %f4846, %f4844, %f4845, %f1937;
- mov.f32 %f4847, 0fB3A22168;
- fma.rn.f32 %f4848, %f4844, %f4847, %f4846;
- mov.f32 %f4849, 0fA7C234C5;
- fma.rn.f32 %f5874, %f4844, %f4849, %f4848;
- abs.f32 %f2042, %f1937;
- setp.ltu.f32 %p1527, %f2042, 0f47CE4780;
- @%p1527 bra $L__BB0_1812;
-
- setp.eq.f32 %p1528, %f2042, 0f7F800000;
- @%p1528 bra $L__BB0_1811;
- bra.uni $L__BB0_1806;
-
-$L__BB0_1811:
- mov.f32 %f4852, 0f00000000;
- mul.rn.f32 %f5874, %f1937, %f4852;
- mov.u32 %r8709, 0;
- bra.uni $L__BB0_1812;
-
-$L__BB0_1806:
- mov.b32 %r2394, %f1937;
- shr.u32 %r7557, %r2394, 23;
- and.b32 %r7558, %r7557, 255;
- add.s32 %r2395, %r7558, -128;
- shl.b32 %r7559, %r2394, 8;
- or.b32 %r2396, %r7559, -2147483648;
- shr.u32 %r2397, %r2395, 5;
- mov.u64 %rd2774, 0;
- mov.u32 %r8706, 0;
- mov.u64 %rd2772, __cudart_i2opi_f;
- mov.u64 %rd2773, %rd1;
-
-$L__BB0_1807:
- .pragma "nounroll";
- ld.global.nc.u32 %r7560, [%rd2772];
- mad.wide.u32 %rd2395, %r7560, %r2396, %rd2774;
- shr.u64 %rd2774, %rd2395, 32;
- st.local.u32 [%rd2773], %rd2395;
- add.s64 %rd2773, %rd2773, 4;
- add.s64 %rd2772, %rd2772, 4;
- add.s32 %r8706, %r8706, 1;
- setp.ne.s32 %p1529, %r8706, 6;
- @%p1529 bra $L__BB0_1807;
-
- st.local.u32 [%rd5], %rd2774;
- mov.u32 %r7561, 4;
- sub.s32 %r2400, %r7561, %r2397;
- mov.u32 %r7562, 6;
- sub.s32 %r7563, %r7562, %r2397;
- mul.wide.s32 %rd2396, %r7563, 4;
- add.s64 %rd2397, %rd1, %rd2396;
- ld.local.u32 %r8707, [%rd2397];
- ld.local.u32 %r8708, [%rd2397+-4];
- and.b32 %r2403, %r2395, 31;
- setp.eq.s32 %p1530, %r2403, 0;
- @%p1530 bra $L__BB0_1810;
-
- mov.u32 %r7564, 32;
- sub.s32 %r7565, %r7564, %r2403;
- shr.u32 %r7566, %r8708, %r7565;
- shl.b32 %r7567, %r8707, %r2403;
- add.s32 %r8707, %r7566, %r7567;
- mul.wide.s32 %rd2398, %r2400, 4;
- add.s64 %rd2399, %rd1, %rd2398;
- ld.local.u32 %r7568, [%rd2399];
- shr.u32 %r7569, %r7568, %r7565;
- shl.b32 %r7570, %r8708, %r2403;
- add.s32 %r8708, %r7569, %r7570;
-
-$L__BB0_1810:
- and.b32 %r7571, %r2394, -2147483648;
- shr.u32 %r7572, %r8708, 30;
- shl.b32 %r7573, %r8707, 2;
- or.b32 %r7574, %r7572, %r7573;
- shr.u32 %r7575, %r7574, 31;
- shr.u32 %r7576, %r8707, 30;
- add.s32 %r7577, %r7575, %r7576;
- neg.s32 %r7578, %r7577;
- setp.eq.s32 %p1531, %r7571, 0;
- selp.b32 %r8709, %r7577, %r7578, %p1531;
- setp.ne.s32 %p1532, %r7575, 0;
- xor.b32 %r7579, %r7571, -2147483648;
- selp.b32 %r7580, %r7579, %r7571, %p1532;
- selp.b32 %r7581, -1, 0, %p1532;
- xor.b32 %r7582, %r7574, %r7581;
- shl.b32 %r7583, %r8708, 2;
- xor.b32 %r7584, %r7583, %r7581;
- cvt.u64.u32 %rd2400, %r7582;
- cvt.u64.u32 %rd2401, %r7584;
- bfi.b64 %rd2402, %rd2400, %rd2401, 32, 32;
- cvt.rn.f64.s64 %fd241, %rd2402;
- mul.f64 %fd242, %fd241, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4850, %fd242;
- setp.eq.s32 %p1533, %r7580, 0;
- neg.f32 %f4851, %f4850;
- selp.f32 %f5874, %f4850, %f4851, %p1533;
-
-$L__BB0_1812:
- and.b32 %r2410, %r8709, 1;
- setp.eq.s32 %p1534, %r2410, 0;
- selp.f32 %f2046, %f5874, 0f3F800000, %p1534;
- mul.rn.f32 %f2047, %f5874, %f5874;
- mov.f32 %f5875, 0fB94D4153;
- @%p1534 bra $L__BB0_1814;
-
- mov.f32 %f4854, 0fBAB607ED;
- mov.f32 %f4855, 0f37CBAC00;
- fma.rn.f32 %f5875, %f4855, %f2047, %f4854;
-
-$L__BB0_1814:
- selp.f32 %f4856, 0f3C0885E4, 0f3D2AAABB, %p1534;
- fma.rn.f32 %f4857, %f5875, %f2047, %f4856;
- selp.f32 %f4858, 0fBE2AAAA8, 0fBEFFFFFF, %p1534;
- fma.rn.f32 %f4859, %f4857, %f2047, %f4858;
- mov.f32 %f4860, 0f00000000;
- fma.rn.f32 %f4861, %f2047, %f2046, %f4860;
- fma.rn.f32 %f5876, %f4859, %f4861, %f2046;
- and.b32 %r7586, %r8709, 2;
- setp.eq.s32 %p1536, %r7586, 0;
- @%p1536 bra $L__BB0_1816;
-
- mov.f32 %f4863, 0fBF800000;
- fma.rn.f32 %f5876, %f5876, %f4863, %f4860;
-
-$L__BB0_1816:
- mul.f32 %f4864, %f1929, 0f3F22F983;
- cvt.rni.s32.f32 %r8713, %f4864;
- cvt.rn.f32.s32 %f4865, %r8713;
- mov.f32 %f4866, 0fBFC90FDA;
- fma.rn.f32 %f4867, %f4865, %f4866, %f1929;
- mov.f32 %f4868, 0fB3A22168;
- fma.rn.f32 %f4869, %f4865, %f4868, %f4867;
- mov.f32 %f4870, 0fA7C234C5;
- fma.rn.f32 %f5877, %f4865, %f4870, %f4869;
- abs.f32 %f2054, %f1929;
- setp.ltu.f32 %p1537, %f2054, 0f47CE4780;
- @%p1537 bra $L__BB0_1824;
-
- setp.eq.f32 %p1538, %f2054, 0f7F800000;
- @%p1538 bra $L__BB0_1823;
- bra.uni $L__BB0_1818;
-
-$L__BB0_1823:
- mov.f32 %f4873, 0f00000000;
- mul.rn.f32 %f5877, %f1929, %f4873;
- mov.u32 %r8713, 0;
- bra.uni $L__BB0_1824;
-
-$L__BB0_1818:
- mov.b32 %r2412, %f1929;
- shr.u32 %r7588, %r2412, 23;
- and.b32 %r7589, %r7588, 255;
- add.s32 %r2413, %r7589, -128;
- shl.b32 %r7590, %r2412, 8;
- or.b32 %r2414, %r7590, -2147483648;
- shr.u32 %r2415, %r2413, 5;
- mov.u64 %rd2777, 0;
- mov.u32 %r8710, 0;
- mov.u64 %rd2775, __cudart_i2opi_f;
- mov.u64 %rd2776, %rd1;
-
-$L__BB0_1819:
- .pragma "nounroll";
- ld.global.nc.u32 %r7591, [%rd2775];
- mad.wide.u32 %rd2405, %r7591, %r2414, %rd2777;
- shr.u64 %rd2777, %rd2405, 32;
- st.local.u32 [%rd2776], %rd2405;
- add.s64 %rd2776, %rd2776, 4;
- add.s64 %rd2775, %rd2775, 4;
- add.s32 %r8710, %r8710, 1;
- setp.ne.s32 %p1539, %r8710, 6;
- @%p1539 bra $L__BB0_1819;
-
- st.local.u32 [%rd5], %rd2777;
- mov.u32 %r7592, 4;
- sub.s32 %r2418, %r7592, %r2415;
- mov.u32 %r7593, 6;
- sub.s32 %r7594, %r7593, %r2415;
- mul.wide.s32 %rd2406, %r7594, 4;
- add.s64 %rd2407, %rd1, %rd2406;
- ld.local.u32 %r8711, [%rd2407];
- ld.local.u32 %r8712, [%rd2407+-4];
- and.b32 %r2421, %r2413, 31;
- setp.eq.s32 %p1540, %r2421, 0;
- @%p1540 bra $L__BB0_1822;
-
- mov.u32 %r7595, 32;
- sub.s32 %r7596, %r7595, %r2421;
- shr.u32 %r7597, %r8712, %r7596;
- shl.b32 %r7598, %r8711, %r2421;
- add.s32 %r8711, %r7597, %r7598;
- mul.wide.s32 %rd2408, %r2418, 4;
- add.s64 %rd2409, %rd1, %rd2408;
- ld.local.u32 %r7599, [%rd2409];
- shr.u32 %r7600, %r7599, %r7596;
- shl.b32 %r7601, %r8712, %r2421;
- add.s32 %r8712, %r7600, %r7601;
-
-$L__BB0_1822:
- and.b32 %r7602, %r2412, -2147483648;
- shr.u32 %r7603, %r8712, 30;
- shl.b32 %r7604, %r8711, 2;
- or.b32 %r7605, %r7603, %r7604;
- shr.u32 %r7606, %r7605, 31;
- shr.u32 %r7607, %r8711, 30;
- add.s32 %r7608, %r7606, %r7607;
- neg.s32 %r7609, %r7608;
- setp.eq.s32 %p1541, %r7602, 0;
- selp.b32 %r8713, %r7608, %r7609, %p1541;
- setp.ne.s32 %p1542, %r7606, 0;
- xor.b32 %r7610, %r7602, -2147483648;
- selp.b32 %r7611, %r7610, %r7602, %p1542;
- selp.b32 %r7612, -1, 0, %p1542;
- xor.b32 %r7613, %r7605, %r7612;
- shl.b32 %r7614, %r8712, 2;
- xor.b32 %r7615, %r7614, %r7612;
- cvt.u64.u32 %rd2410, %r7613;
- cvt.u64.u32 %rd2411, %r7615;
- bfi.b64 %rd2412, %rd2410, %rd2411, 32, 32;
- cvt.rn.f64.s64 %fd243, %rd2412;
- mul.f64 %fd244, %fd243, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4871, %fd244;
- setp.eq.s32 %p1543, %r7611, 0;
- neg.f32 %f4872, %f4871;
- selp.f32 %f5877, %f4871, %f4872, %p1543;
-
-$L__BB0_1824:
- add.s32 %r2428, %r8713, 1;
- and.b32 %r2429, %r2428, 1;
- setp.eq.s32 %p1544, %r2429, 0;
- selp.f32 %f2058, %f5877, 0f3F800000, %p1544;
- mul.rn.f32 %f2059, %f5877, %f5877;
- mov.f32 %f5878, 0fB94D4153;
- @%p1544 bra $L__BB0_1826;
-
- mov.f32 %f4875, 0fBAB607ED;
- mov.f32 %f4876, 0f37CBAC00;
- fma.rn.f32 %f5878, %f4876, %f2059, %f4875;
-
-$L__BB0_1826:
- selp.f32 %f4877, 0f3C0885E4, 0f3D2AAABB, %p1544;
- fma.rn.f32 %f4878, %f5878, %f2059, %f4877;
- selp.f32 %f4879, 0fBE2AAAA8, 0fBEFFFFFF, %p1544;
- fma.rn.f32 %f4880, %f4878, %f2059, %f4879;
- mov.f32 %f4881, 0f00000000;
- fma.rn.f32 %f4882, %f2059, %f2058, %f4881;
- fma.rn.f32 %f5879, %f4880, %f4882, %f2058;
- and.b32 %r7617, %r2428, 2;
- setp.eq.s32 %p1546, %r7617, 0;
- @%p1546 bra $L__BB0_1828;
-
- mov.f32 %f4884, 0fBF800000;
- fma.rn.f32 %f5879, %f5879, %f4884, %f4881;
-
-$L__BB0_1828:
- add.f32 %f5901, %f5876, %f5879;
- mul.f32 %f4885, %f1938, 0f3F22F983;
- cvt.rni.s32.f32 %r8717, %f4885;
- cvt.rn.f32.s32 %f4886, %r8717;
- mov.f32 %f4887, 0fBFC90FDA;
- fma.rn.f32 %f4888, %f4886, %f4887, %f1938;
- mov.f32 %f4889, 0fB3A22168;
- fma.rn.f32 %f4890, %f4886, %f4889, %f4888;
- mov.f32 %f4891, 0fA7C234C5;
- fma.rn.f32 %f5880, %f4886, %f4891, %f4890;
- abs.f32 %f2067, %f1938;
- setp.ltu.f32 %p1547, %f2067, 0f47CE4780;
- @%p1547 bra $L__BB0_1836;
-
- setp.eq.f32 %p1548, %f2067, 0f7F800000;
- @%p1548 bra $L__BB0_1835;
- bra.uni $L__BB0_1830;
-
-$L__BB0_1835:
- mov.f32 %f4894, 0f00000000;
- mul.rn.f32 %f5880, %f1938, %f4894;
- mov.u32 %r8717, 0;
- bra.uni $L__BB0_1836;
-
-$L__BB0_1830:
- mov.b32 %r2431, %f1938;
- shr.u32 %r7619, %r2431, 23;
- and.b32 %r7620, %r7619, 255;
- add.s32 %r2432, %r7620, -128;
- shl.b32 %r7621, %r2431, 8;
- or.b32 %r2433, %r7621, -2147483648;
- shr.u32 %r2434, %r2432, 5;
- mov.u64 %rd2780, 0;
- mov.u32 %r8714, 0;
- mov.u64 %rd2778, __cudart_i2opi_f;
- mov.u64 %rd2779, %rd1;
-
-$L__BB0_1831:
- .pragma "nounroll";
- ld.global.nc.u32 %r7622, [%rd2778];
- mad.wide.u32 %rd2415, %r7622, %r2433, %rd2780;
- shr.u64 %rd2780, %rd2415, 32;
- st.local.u32 [%rd2779], %rd2415;
- add.s64 %rd2779, %rd2779, 4;
- add.s64 %rd2778, %rd2778, 4;
- add.s32 %r8714, %r8714, 1;
- setp.ne.s32 %p1549, %r8714, 6;
- @%p1549 bra $L__BB0_1831;
-
- st.local.u32 [%rd5], %rd2780;
- mov.u32 %r7623, 4;
- sub.s32 %r2437, %r7623, %r2434;
- mov.u32 %r7624, 6;
- sub.s32 %r7625, %r7624, %r2434;
- mul.wide.s32 %rd2416, %r7625, 4;
- add.s64 %rd2417, %rd1, %rd2416;
- ld.local.u32 %r8715, [%rd2417];
- ld.local.u32 %r8716, [%rd2417+-4];
- and.b32 %r2440, %r2432, 31;
- setp.eq.s32 %p1550, %r2440, 0;
- @%p1550 bra $L__BB0_1834;
-
- mov.u32 %r7626, 32;
- sub.s32 %r7627, %r7626, %r2440;
- shr.u32 %r7628, %r8716, %r7627;
- shl.b32 %r7629, %r8715, %r2440;
- add.s32 %r8715, %r7628, %r7629;
- mul.wide.s32 %rd2418, %r2437, 4;
- add.s64 %rd2419, %rd1, %rd2418;
- ld.local.u32 %r7630, [%rd2419];
- shr.u32 %r7631, %r7630, %r7627;
- shl.b32 %r7632, %r8716, %r2440;
- add.s32 %r8716, %r7631, %r7632;
-
-$L__BB0_1834:
- and.b32 %r7633, %r2431, -2147483648;
- shr.u32 %r7634, %r8716, 30;
- shl.b32 %r7635, %r8715, 2;
- or.b32 %r7636, %r7634, %r7635;
- shr.u32 %r7637, %r7636, 31;
- shr.u32 %r7638, %r8715, 30;
- add.s32 %r7639, %r7637, %r7638;
- neg.s32 %r7640, %r7639;
- setp.eq.s32 %p1551, %r7633, 0;
- selp.b32 %r8717, %r7639, %r7640, %p1551;
- setp.ne.s32 %p1552, %r7637, 0;
- xor.b32 %r7641, %r7633, -2147483648;
- selp.b32 %r7642, %r7641, %r7633, %p1552;
- selp.b32 %r7643, -1, 0, %p1552;
- xor.b32 %r7644, %r7636, %r7643;
- shl.b32 %r7645, %r8716, 2;
- xor.b32 %r7646, %r7645, %r7643;
- cvt.u64.u32 %rd2420, %r7644;
- cvt.u64.u32 %rd2421, %r7646;
- bfi.b64 %rd2422, %rd2420, %rd2421, 32, 32;
- cvt.rn.f64.s64 %fd245, %rd2422;
- mul.f64 %fd246, %fd245, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4892, %fd246;
- setp.eq.s32 %p1553, %r7642, 0;
- neg.f32 %f4893, %f4892;
- selp.f32 %f5880, %f4892, %f4893, %p1553;
-
-$L__BB0_1836:
- and.b32 %r2447, %r8717, 1;
- setp.eq.s32 %p1554, %r2447, 0;
- selp.f32 %f2071, %f5880, 0f3F800000, %p1554;
- mul.rn.f32 %f2072, %f5880, %f5880;
- mov.f32 %f5881, 0fB94D4153;
- @%p1554 bra $L__BB0_1838;
-
- mov.f32 %f4896, 0fBAB607ED;
- mov.f32 %f4897, 0f37CBAC00;
- fma.rn.f32 %f5881, %f4897, %f2072, %f4896;
-
-$L__BB0_1838:
- selp.f32 %f4898, 0f3C0885E4, 0f3D2AAABB, %p1554;
- fma.rn.f32 %f4899, %f5881, %f2072, %f4898;
- selp.f32 %f4900, 0fBE2AAAA8, 0fBEFFFFFF, %p1554;
- fma.rn.f32 %f4901, %f4899, %f2072, %f4900;
- mov.f32 %f4902, 0f00000000;
- fma.rn.f32 %f4903, %f2072, %f2071, %f4902;
- fma.rn.f32 %f5882, %f4901, %f4903, %f2071;
- and.b32 %r7648, %r8717, 2;
- setp.eq.s32 %p1556, %r7648, 0;
- @%p1556 bra $L__BB0_1840;
-
- mov.f32 %f4905, 0fBF800000;
- fma.rn.f32 %f5882, %f5882, %f4905, %f4902;
-
-$L__BB0_1840:
- mul.f32 %f4906, %f1930, 0f3F22F983;
- cvt.rni.s32.f32 %r8721, %f4906;
- cvt.rn.f32.s32 %f4907, %r8721;
- mov.f32 %f4908, 0fBFC90FDA;
- fma.rn.f32 %f4909, %f4907, %f4908, %f1930;
- mov.f32 %f4910, 0fB3A22168;
- fma.rn.f32 %f4911, %f4907, %f4910, %f4909;
- mov.f32 %f4912, 0fA7C234C5;
- fma.rn.f32 %f5883, %f4907, %f4912, %f4911;
- abs.f32 %f2079, %f1930;
- setp.ltu.f32 %p1557, %f2079, 0f47CE4780;
- @%p1557 bra $L__BB0_1848;
-
- setp.eq.f32 %p1558, %f2079, 0f7F800000;
- @%p1558 bra $L__BB0_1847;
- bra.uni $L__BB0_1842;
-
-$L__BB0_1847:
- mov.f32 %f4915, 0f00000000;
- mul.rn.f32 %f5883, %f1930, %f4915;
- mov.u32 %r8721, 0;
- bra.uni $L__BB0_1848;
-
-$L__BB0_1842:
- mov.b32 %r2449, %f1930;
- shr.u32 %r7650, %r2449, 23;
- and.b32 %r7651, %r7650, 255;
- add.s32 %r2450, %r7651, -128;
- shl.b32 %r7652, %r2449, 8;
- or.b32 %r2451, %r7652, -2147483648;
- shr.u32 %r2452, %r2450, 5;
- mov.u64 %rd2783, 0;
- mov.u32 %r8718, 0;
- mov.u64 %rd2781, __cudart_i2opi_f;
- mov.u64 %rd2782, %rd1;
-
-$L__BB0_1843:
- .pragma "nounroll";
- ld.global.nc.u32 %r7653, [%rd2781];
- mad.wide.u32 %rd2425, %r7653, %r2451, %rd2783;
- shr.u64 %rd2783, %rd2425, 32;
- st.local.u32 [%rd2782], %rd2425;
- add.s64 %rd2782, %rd2782, 4;
- add.s64 %rd2781, %rd2781, 4;
- add.s32 %r8718, %r8718, 1;
- setp.ne.s32 %p1559, %r8718, 6;
- @%p1559 bra $L__BB0_1843;
-
- st.local.u32 [%rd5], %rd2783;
- mov.u32 %r7654, 4;
- sub.s32 %r2455, %r7654, %r2452;
- mov.u32 %r7655, 6;
- sub.s32 %r7656, %r7655, %r2452;
- mul.wide.s32 %rd2426, %r7656, 4;
- add.s64 %rd2427, %rd1, %rd2426;
- ld.local.u32 %r8719, [%rd2427];
- ld.local.u32 %r8720, [%rd2427+-4];
- and.b32 %r2458, %r2450, 31;
- setp.eq.s32 %p1560, %r2458, 0;
- @%p1560 bra $L__BB0_1846;
-
- mov.u32 %r7657, 32;
- sub.s32 %r7658, %r7657, %r2458;
- shr.u32 %r7659, %r8720, %r7658;
- shl.b32 %r7660, %r8719, %r2458;
- add.s32 %r8719, %r7659, %r7660;
- mul.wide.s32 %rd2428, %r2455, 4;
- add.s64 %rd2429, %rd1, %rd2428;
- ld.local.u32 %r7661, [%rd2429];
- shr.u32 %r7662, %r7661, %r7658;
- shl.b32 %r7663, %r8720, %r2458;
- add.s32 %r8720, %r7662, %r7663;
-
-$L__BB0_1846:
- and.b32 %r7664, %r2449, -2147483648;
- shr.u32 %r7665, %r8720, 30;
- shl.b32 %r7666, %r8719, 2;
- or.b32 %r7667, %r7665, %r7666;
- shr.u32 %r7668, %r7667, 31;
- shr.u32 %r7669, %r8719, 30;
- add.s32 %r7670, %r7668, %r7669;
- neg.s32 %r7671, %r7670;
- setp.eq.s32 %p1561, %r7664, 0;
- selp.b32 %r8721, %r7670, %r7671, %p1561;
- setp.ne.s32 %p1562, %r7668, 0;
- xor.b32 %r7672, %r7664, -2147483648;
- selp.b32 %r7673, %r7672, %r7664, %p1562;
- selp.b32 %r7674, -1, 0, %p1562;
- xor.b32 %r7675, %r7667, %r7674;
- shl.b32 %r7676, %r8720, 2;
- xor.b32 %r7677, %r7676, %r7674;
- cvt.u64.u32 %rd2430, %r7675;
- cvt.u64.u32 %rd2431, %r7677;
- bfi.b64 %rd2432, %rd2430, %rd2431, 32, 32;
- cvt.rn.f64.s64 %fd247, %rd2432;
- mul.f64 %fd248, %fd247, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4913, %fd248;
- setp.eq.s32 %p1563, %r7673, 0;
- neg.f32 %f4914, %f4913;
- selp.f32 %f5883, %f4913, %f4914, %p1563;
-
-$L__BB0_1848:
- add.s32 %r2465, %r8721, 1;
- and.b32 %r2466, %r2465, 1;
- setp.eq.s32 %p1564, %r2466, 0;
- selp.f32 %f2083, %f5883, 0f3F800000, %p1564;
- mul.rn.f32 %f2084, %f5883, %f5883;
- mov.f32 %f5884, 0fB94D4153;
- @%p1564 bra $L__BB0_1850;
-
- mov.f32 %f4917, 0fBAB607ED;
- mov.f32 %f4918, 0f37CBAC00;
- fma.rn.f32 %f5884, %f4918, %f2084, %f4917;
-
-$L__BB0_1850:
- selp.f32 %f4919, 0f3C0885E4, 0f3D2AAABB, %p1564;
- fma.rn.f32 %f4920, %f5884, %f2084, %f4919;
- selp.f32 %f4921, 0fBE2AAAA8, 0fBEFFFFFF, %p1564;
- fma.rn.f32 %f4922, %f4920, %f2084, %f4921;
- mov.f32 %f4923, 0f00000000;
- fma.rn.f32 %f4924, %f2084, %f2083, %f4923;
- fma.rn.f32 %f5885, %f4922, %f4924, %f2083;
- and.b32 %r7679, %r2465, 2;
- setp.eq.s32 %p1566, %r7679, 0;
- @%p1566 bra $L__BB0_1852;
-
- mov.f32 %f4926, 0fBF800000;
- fma.rn.f32 %f5885, %f5885, %f4926, %f4923;
-
-$L__BB0_1852:
- add.f32 %f5900, %f5882, %f5885;
- mul.f32 %f4927, %f1939, 0f3F22F983;
- cvt.rni.s32.f32 %r8725, %f4927;
- cvt.rn.f32.s32 %f4928, %r8725;
- mov.f32 %f4929, 0fBFC90FDA;
- fma.rn.f32 %f4930, %f4928, %f4929, %f1939;
- mov.f32 %f4931, 0fB3A22168;
- fma.rn.f32 %f4932, %f4928, %f4931, %f4930;
- mov.f32 %f4933, 0fA7C234C5;
- fma.rn.f32 %f5886, %f4928, %f4933, %f4932;
- abs.f32 %f2092, %f1939;
- setp.ltu.f32 %p1567, %f2092, 0f47CE4780;
- @%p1567 bra $L__BB0_1860;
-
- setp.eq.f32 %p1568, %f2092, 0f7F800000;
- @%p1568 bra $L__BB0_1859;
- bra.uni $L__BB0_1854;
-
-$L__BB0_1859:
- mov.f32 %f4936, 0f00000000;
- mul.rn.f32 %f5886, %f1939, %f4936;
- mov.u32 %r8725, 0;
- bra.uni $L__BB0_1860;
-
-$L__BB0_1854:
- mov.b32 %r2468, %f1939;
- shr.u32 %r7681, %r2468, 23;
- and.b32 %r7682, %r7681, 255;
- add.s32 %r2469, %r7682, -128;
- shl.b32 %r7683, %r2468, 8;
- or.b32 %r2470, %r7683, -2147483648;
- shr.u32 %r2471, %r2469, 5;
- mov.u64 %rd2786, 0;
- mov.u32 %r8722, 0;
- mov.u64 %rd2784, __cudart_i2opi_f;
- mov.u64 %rd2785, %rd1;
-
-$L__BB0_1855:
- .pragma "nounroll";
- ld.global.nc.u32 %r7684, [%rd2784];
- mad.wide.u32 %rd2435, %r7684, %r2470, %rd2786;
- shr.u64 %rd2786, %rd2435, 32;
- st.local.u32 [%rd2785], %rd2435;
- add.s64 %rd2785, %rd2785, 4;
- add.s64 %rd2784, %rd2784, 4;
- add.s32 %r8722, %r8722, 1;
- setp.ne.s32 %p1569, %r8722, 6;
- @%p1569 bra $L__BB0_1855;
-
- st.local.u32 [%rd5], %rd2786;
- mov.u32 %r7685, 4;
- sub.s32 %r2474, %r7685, %r2471;
- mov.u32 %r7686, 6;
- sub.s32 %r7687, %r7686, %r2471;
- mul.wide.s32 %rd2436, %r7687, 4;
- add.s64 %rd2437, %rd1, %rd2436;
- ld.local.u32 %r8723, [%rd2437];
- ld.local.u32 %r8724, [%rd2437+-4];
- and.b32 %r2477, %r2469, 31;
- setp.eq.s32 %p1570, %r2477, 0;
- @%p1570 bra $L__BB0_1858;
-
- mov.u32 %r7688, 32;
- sub.s32 %r7689, %r7688, %r2477;
- shr.u32 %r7690, %r8724, %r7689;
- shl.b32 %r7691, %r8723, %r2477;
- add.s32 %r8723, %r7690, %r7691;
- mul.wide.s32 %rd2438, %r2474, 4;
- add.s64 %rd2439, %rd1, %rd2438;
- ld.local.u32 %r7692, [%rd2439];
- shr.u32 %r7693, %r7692, %r7689;
- shl.b32 %r7694, %r8724, %r2477;
- add.s32 %r8724, %r7693, %r7694;
-
-$L__BB0_1858:
- and.b32 %r7695, %r2468, -2147483648;
- shr.u32 %r7696, %r8724, 30;
- shl.b32 %r7697, %r8723, 2;
- or.b32 %r7698, %r7696, %r7697;
- shr.u32 %r7699, %r7698, 31;
- shr.u32 %r7700, %r8723, 30;
- add.s32 %r7701, %r7699, %r7700;
- neg.s32 %r7702, %r7701;
- setp.eq.s32 %p1571, %r7695, 0;
- selp.b32 %r8725, %r7701, %r7702, %p1571;
- setp.ne.s32 %p1572, %r7699, 0;
- xor.b32 %r7703, %r7695, -2147483648;
- selp.b32 %r7704, %r7703, %r7695, %p1572;
- selp.b32 %r7705, -1, 0, %p1572;
- xor.b32 %r7706, %r7698, %r7705;
- shl.b32 %r7707, %r8724, 2;
- xor.b32 %r7708, %r7707, %r7705;
- cvt.u64.u32 %rd2440, %r7706;
- cvt.u64.u32 %rd2441, %r7708;
- bfi.b64 %rd2442, %rd2440, %rd2441, 32, 32;
- cvt.rn.f64.s64 %fd249, %rd2442;
- mul.f64 %fd250, %fd249, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4934, %fd250;
- setp.eq.s32 %p1573, %r7704, 0;
- neg.f32 %f4935, %f4934;
- selp.f32 %f5886, %f4934, %f4935, %p1573;
-
-$L__BB0_1860:
- and.b32 %r2484, %r8725, 1;
- setp.eq.s32 %p1574, %r2484, 0;
- selp.f32 %f2096, %f5886, 0f3F800000, %p1574;
- mul.rn.f32 %f2097, %f5886, %f5886;
- mov.f32 %f5887, 0fB94D4153;
- @%p1574 bra $L__BB0_1862;
-
- mov.f32 %f4938, 0fBAB607ED;
- mov.f32 %f4939, 0f37CBAC00;
- fma.rn.f32 %f5887, %f4939, %f2097, %f4938;
-
-$L__BB0_1862:
- selp.f32 %f4940, 0f3C0885E4, 0f3D2AAABB, %p1574;
- fma.rn.f32 %f4941, %f5887, %f2097, %f4940;
- selp.f32 %f4942, 0fBE2AAAA8, 0fBEFFFFFF, %p1574;
- fma.rn.f32 %f4943, %f4941, %f2097, %f4942;
- mov.f32 %f4944, 0f00000000;
- fma.rn.f32 %f4945, %f2097, %f2096, %f4944;
- fma.rn.f32 %f5888, %f4943, %f4945, %f2096;
- and.b32 %r7710, %r8725, 2;
- setp.eq.s32 %p1576, %r7710, 0;
- @%p1576 bra $L__BB0_1864;
-
- mov.f32 %f4947, 0fBF800000;
- fma.rn.f32 %f5888, %f5888, %f4947, %f4944;
-
-$L__BB0_1864:
- mul.f32 %f4948, %f1931, 0f3F22F983;
- cvt.rni.s32.f32 %r8729, %f4948;
- cvt.rn.f32.s32 %f4949, %r8729;
- mov.f32 %f4950, 0fBFC90FDA;
- fma.rn.f32 %f4951, %f4949, %f4950, %f1931;
- mov.f32 %f4952, 0fB3A22168;
- fma.rn.f32 %f4953, %f4949, %f4952, %f4951;
- mov.f32 %f4954, 0fA7C234C5;
- fma.rn.f32 %f5889, %f4949, %f4954, %f4953;
- abs.f32 %f2104, %f1931;
- setp.ltu.f32 %p1577, %f2104, 0f47CE4780;
- @%p1577 bra $L__BB0_1872;
-
- setp.eq.f32 %p1578, %f2104, 0f7F800000;
- @%p1578 bra $L__BB0_1871;
- bra.uni $L__BB0_1866;
-
-$L__BB0_1871:
- mov.f32 %f4957, 0f00000000;
- mul.rn.f32 %f5889, %f1931, %f4957;
- mov.u32 %r8729, 0;
- bra.uni $L__BB0_1872;
-
-$L__BB0_1866:
- mov.b32 %r2486, %f1931;
- shr.u32 %r7712, %r2486, 23;
- and.b32 %r7713, %r7712, 255;
- add.s32 %r2487, %r7713, -128;
- shl.b32 %r7714, %r2486, 8;
- or.b32 %r2488, %r7714, -2147483648;
- shr.u32 %r2489, %r2487, 5;
- mov.u64 %rd2789, 0;
- mov.u32 %r8726, 0;
- mov.u64 %rd2787, __cudart_i2opi_f;
- mov.u64 %rd2788, %rd1;
-
-$L__BB0_1867:
- .pragma "nounroll";
- ld.global.nc.u32 %r7715, [%rd2787];
- mad.wide.u32 %rd2445, %r7715, %r2488, %rd2789;
- shr.u64 %rd2789, %rd2445, 32;
- st.local.u32 [%rd2788], %rd2445;
- add.s64 %rd2788, %rd2788, 4;
- add.s64 %rd2787, %rd2787, 4;
- add.s32 %r8726, %r8726, 1;
- setp.ne.s32 %p1579, %r8726, 6;
- @%p1579 bra $L__BB0_1867;
-
- st.local.u32 [%rd5], %rd2789;
- mov.u32 %r7716, 4;
- sub.s32 %r2492, %r7716, %r2489;
- mov.u32 %r7717, 6;
- sub.s32 %r7718, %r7717, %r2489;
- mul.wide.s32 %rd2446, %r7718, 4;
- add.s64 %rd2447, %rd1, %rd2446;
- ld.local.u32 %r8727, [%rd2447];
- ld.local.u32 %r8728, [%rd2447+-4];
- and.b32 %r2495, %r2487, 31;
- setp.eq.s32 %p1580, %r2495, 0;
- @%p1580 bra $L__BB0_1870;
-
- mov.u32 %r7719, 32;
- sub.s32 %r7720, %r7719, %r2495;
- shr.u32 %r7721, %r8728, %r7720;
- shl.b32 %r7722, %r8727, %r2495;
- add.s32 %r8727, %r7721, %r7722;
- mul.wide.s32 %rd2448, %r2492, 4;
- add.s64 %rd2449, %rd1, %rd2448;
- ld.local.u32 %r7723, [%rd2449];
- shr.u32 %r7724, %r7723, %r7720;
- shl.b32 %r7725, %r8728, %r2495;
- add.s32 %r8728, %r7724, %r7725;
-
-$L__BB0_1870:
- and.b32 %r7726, %r2486, -2147483648;
- shr.u32 %r7727, %r8728, 30;
- shl.b32 %r7728, %r8727, 2;
- or.b32 %r7729, %r7727, %r7728;
- shr.u32 %r7730, %r7729, 31;
- shr.u32 %r7731, %r8727, 30;
- add.s32 %r7732, %r7730, %r7731;
- neg.s32 %r7733, %r7732;
- setp.eq.s32 %p1581, %r7726, 0;
- selp.b32 %r8729, %r7732, %r7733, %p1581;
- setp.ne.s32 %p1582, %r7730, 0;
- xor.b32 %r7734, %r7726, -2147483648;
- selp.b32 %r7735, %r7734, %r7726, %p1582;
- selp.b32 %r7736, -1, 0, %p1582;
- xor.b32 %r7737, %r7729, %r7736;
- shl.b32 %r7738, %r8728, 2;
- xor.b32 %r7739, %r7738, %r7736;
- cvt.u64.u32 %rd2450, %r7737;
- cvt.u64.u32 %rd2451, %r7739;
- bfi.b64 %rd2452, %rd2450, %rd2451, 32, 32;
- cvt.rn.f64.s64 %fd251, %rd2452;
- mul.f64 %fd252, %fd251, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4955, %fd252;
- setp.eq.s32 %p1583, %r7735, 0;
- neg.f32 %f4956, %f4955;
- selp.f32 %f5889, %f4955, %f4956, %p1583;
-
-$L__BB0_1872:
- add.s32 %r2502, %r8729, 1;
- and.b32 %r2503, %r2502, 1;
- setp.eq.s32 %p1584, %r2503, 0;
- selp.f32 %f2108, %f5889, 0f3F800000, %p1584;
- mul.rn.f32 %f2109, %f5889, %f5889;
- mov.f32 %f5890, 0fB94D4153;
- @%p1584 bra $L__BB0_1874;
-
- mov.f32 %f4959, 0fBAB607ED;
- mov.f32 %f4960, 0f37CBAC00;
- fma.rn.f32 %f5890, %f4960, %f2109, %f4959;
-
-$L__BB0_1874:
- selp.f32 %f4961, 0f3C0885E4, 0f3D2AAABB, %p1584;
- fma.rn.f32 %f4962, %f5890, %f2109, %f4961;
- selp.f32 %f4963, 0fBE2AAAA8, 0fBEFFFFFF, %p1584;
- fma.rn.f32 %f4964, %f4962, %f2109, %f4963;
- mov.f32 %f4965, 0f00000000;
- fma.rn.f32 %f4966, %f2109, %f2108, %f4965;
- fma.rn.f32 %f5891, %f4964, %f4966, %f2108;
- and.b32 %r7741, %r2502, 2;
- setp.eq.s32 %p1586, %r7741, 0;
- @%p1586 bra $L__BB0_1876;
-
- mov.f32 %f4968, 0fBF800000;
- fma.rn.f32 %f5891, %f5891, %f4968, %f4965;
-
-$L__BB0_1876:
- add.f32 %f5899, %f5888, %f5891;
- mul.f32 %f4969, %f1940, 0f3F22F983;
- cvt.rni.s32.f32 %r8733, %f4969;
- cvt.rn.f32.s32 %f4970, %r8733;
- mov.f32 %f4971, 0fBFC90FDA;
- fma.rn.f32 %f4972, %f4970, %f4971, %f1940;
- mov.f32 %f4973, 0fB3A22168;
- fma.rn.f32 %f4974, %f4970, %f4973, %f4972;
- mov.f32 %f4975, 0fA7C234C5;
- fma.rn.f32 %f5892, %f4970, %f4975, %f4974;
- abs.f32 %f2117, %f1940;
- setp.ltu.f32 %p1587, %f2117, 0f47CE4780;
- @%p1587 bra $L__BB0_1884;
-
- setp.eq.f32 %p1588, %f2117, 0f7F800000;
- @%p1588 bra $L__BB0_1883;
- bra.uni $L__BB0_1878;
-
-$L__BB0_1883:
- mov.f32 %f4978, 0f00000000;
- mul.rn.f32 %f5892, %f1940, %f4978;
- mov.u32 %r8733, 0;
- bra.uni $L__BB0_1884;
-
-$L__BB0_1878:
- mov.b32 %r2505, %f1940;
- shr.u32 %r7743, %r2505, 23;
- and.b32 %r7744, %r7743, 255;
- add.s32 %r2506, %r7744, -128;
- shl.b32 %r7745, %r2505, 8;
- or.b32 %r2507, %r7745, -2147483648;
- shr.u32 %r2508, %r2506, 5;
- mov.u64 %rd2792, 0;
- mov.u32 %r8730, 0;
- mov.u64 %rd2790, __cudart_i2opi_f;
- mov.u64 %rd2791, %rd1;
-
-$L__BB0_1879:
- .pragma "nounroll";
- ld.global.nc.u32 %r7746, [%rd2790];
- mad.wide.u32 %rd2455, %r7746, %r2507, %rd2792;
- shr.u64 %rd2792, %rd2455, 32;
- st.local.u32 [%rd2791], %rd2455;
- add.s64 %rd2791, %rd2791, 4;
- add.s64 %rd2790, %rd2790, 4;
- add.s32 %r8730, %r8730, 1;
- setp.ne.s32 %p1589, %r8730, 6;
- @%p1589 bra $L__BB0_1879;
-
- st.local.u32 [%rd5], %rd2792;
- mov.u32 %r7747, 4;
- sub.s32 %r2511, %r7747, %r2508;
- mov.u32 %r7748, 6;
- sub.s32 %r7749, %r7748, %r2508;
- mul.wide.s32 %rd2456, %r7749, 4;
- add.s64 %rd2457, %rd1, %rd2456;
- ld.local.u32 %r8731, [%rd2457];
- ld.local.u32 %r8732, [%rd2457+-4];
- and.b32 %r2514, %r2506, 31;
- setp.eq.s32 %p1590, %r2514, 0;
- @%p1590 bra $L__BB0_1882;
-
- mov.u32 %r7750, 32;
- sub.s32 %r7751, %r7750, %r2514;
- shr.u32 %r7752, %r8732, %r7751;
- shl.b32 %r7753, %r8731, %r2514;
- add.s32 %r8731, %r7752, %r7753;
- mul.wide.s32 %rd2458, %r2511, 4;
- add.s64 %rd2459, %rd1, %rd2458;
- ld.local.u32 %r7754, [%rd2459];
- shr.u32 %r7755, %r7754, %r7751;
- shl.b32 %r7756, %r8732, %r2514;
- add.s32 %r8732, %r7755, %r7756;
-
-$L__BB0_1882:
- and.b32 %r7757, %r2505, -2147483648;
- shr.u32 %r7758, %r8732, 30;
- shl.b32 %r7759, %r8731, 2;
- or.b32 %r7760, %r7758, %r7759;
- shr.u32 %r7761, %r7760, 31;
- shr.u32 %r7762, %r8731, 30;
- add.s32 %r7763, %r7761, %r7762;
- neg.s32 %r7764, %r7763;
- setp.eq.s32 %p1591, %r7757, 0;
- selp.b32 %r8733, %r7763, %r7764, %p1591;
- setp.ne.s32 %p1592, %r7761, 0;
- xor.b32 %r7765, %r7757, -2147483648;
- selp.b32 %r7766, %r7765, %r7757, %p1592;
- selp.b32 %r7767, -1, 0, %p1592;
- xor.b32 %r7768, %r7760, %r7767;
- shl.b32 %r7769, %r8732, 2;
- xor.b32 %r7770, %r7769, %r7767;
- cvt.u64.u32 %rd2460, %r7768;
- cvt.u64.u32 %rd2461, %r7770;
- bfi.b64 %rd2462, %rd2460, %rd2461, 32, 32;
- cvt.rn.f64.s64 %fd253, %rd2462;
- mul.f64 %fd254, %fd253, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4976, %fd254;
- setp.eq.s32 %p1593, %r7766, 0;
- neg.f32 %f4977, %f4976;
- selp.f32 %f5892, %f4976, %f4977, %p1593;
-
-$L__BB0_1884:
- and.b32 %r2521, %r8733, 1;
- setp.eq.s32 %p1594, %r2521, 0;
- selp.f32 %f2121, %f5892, 0f3F800000, %p1594;
- mul.rn.f32 %f2122, %f5892, %f5892;
- mov.f32 %f5893, 0fB94D4153;
- @%p1594 bra $L__BB0_1886;
-
- mov.f32 %f4980, 0fBAB607ED;
- mov.f32 %f4981, 0f37CBAC00;
- fma.rn.f32 %f5893, %f4981, %f2122, %f4980;
-
-$L__BB0_1886:
- selp.f32 %f4982, 0f3C0885E4, 0f3D2AAABB, %p1594;
- fma.rn.f32 %f4983, %f5893, %f2122, %f4982;
- selp.f32 %f4984, 0fBE2AAAA8, 0fBEFFFFFF, %p1594;
- fma.rn.f32 %f4985, %f4983, %f2122, %f4984;
- mov.f32 %f4986, 0f00000000;
- fma.rn.f32 %f4987, %f2122, %f2121, %f4986;
- fma.rn.f32 %f5894, %f4985, %f4987, %f2121;
- and.b32 %r7772, %r8733, 2;
- setp.eq.s32 %p1596, %r7772, 0;
- @%p1596 bra $L__BB0_1888;
-
- mov.f32 %f4989, 0fBF800000;
- fma.rn.f32 %f5894, %f5894, %f4989, %f4986;
-
-$L__BB0_1888:
- mul.f32 %f4990, %f1932, 0f3F22F983;
- cvt.rni.s32.f32 %r8737, %f4990;
- cvt.rn.f32.s32 %f4991, %r8737;
- mov.f32 %f4992, 0fBFC90FDA;
- fma.rn.f32 %f4993, %f4991, %f4992, %f1932;
- mov.f32 %f4994, 0fB3A22168;
- fma.rn.f32 %f4995, %f4991, %f4994, %f4993;
- mov.f32 %f4996, 0fA7C234C5;
- fma.rn.f32 %f5895, %f4991, %f4996, %f4995;
- abs.f32 %f2129, %f1932;
- setp.ltu.f32 %p1597, %f2129, 0f47CE4780;
- @%p1597 bra $L__BB0_1896;
-
- setp.eq.f32 %p1598, %f2129, 0f7F800000;
- @%p1598 bra $L__BB0_1895;
- bra.uni $L__BB0_1890;
-
-$L__BB0_1895:
- mov.f32 %f4999, 0f00000000;
- mul.rn.f32 %f5895, %f1932, %f4999;
- mov.u32 %r8737, 0;
- bra.uni $L__BB0_1896;
-
-$L__BB0_1890:
- mov.b32 %r2523, %f1932;
- shr.u32 %r7774, %r2523, 23;
- and.b32 %r7775, %r7774, 255;
- add.s32 %r2524, %r7775, -128;
- shl.b32 %r7776, %r2523, 8;
- or.b32 %r2525, %r7776, -2147483648;
- shr.u32 %r2526, %r2524, 5;
- mov.u64 %rd2795, 0;
- mov.u32 %r8734, 0;
- mov.u64 %rd2793, __cudart_i2opi_f;
- mov.u64 %rd2794, %rd1;
-
-$L__BB0_1891:
- .pragma "nounroll";
- ld.global.nc.u32 %r7777, [%rd2793];
- mad.wide.u32 %rd2465, %r7777, %r2525, %rd2795;
- shr.u64 %rd2795, %rd2465, 32;
- st.local.u32 [%rd2794], %rd2465;
- add.s64 %rd2794, %rd2794, 4;
- add.s64 %rd2793, %rd2793, 4;
- add.s32 %r8734, %r8734, 1;
- setp.ne.s32 %p1599, %r8734, 6;
- @%p1599 bra $L__BB0_1891;
-
- st.local.u32 [%rd5], %rd2795;
- mov.u32 %r7778, 4;
- sub.s32 %r2529, %r7778, %r2526;
- mov.u32 %r7779, 6;
- sub.s32 %r7780, %r7779, %r2526;
- mul.wide.s32 %rd2466, %r7780, 4;
- add.s64 %rd2467, %rd1, %rd2466;
- ld.local.u32 %r8735, [%rd2467];
- ld.local.u32 %r8736, [%rd2467+-4];
- and.b32 %r2532, %r2524, 31;
- setp.eq.s32 %p1600, %r2532, 0;
- @%p1600 bra $L__BB0_1894;
-
- mov.u32 %r7781, 32;
- sub.s32 %r7782, %r7781, %r2532;
- shr.u32 %r7783, %r8736, %r7782;
- shl.b32 %r7784, %r8735, %r2532;
- add.s32 %r8735, %r7783, %r7784;
- mul.wide.s32 %rd2468, %r2529, 4;
- add.s64 %rd2469, %rd1, %rd2468;
- ld.local.u32 %r7785, [%rd2469];
- shr.u32 %r7786, %r7785, %r7782;
- shl.b32 %r7787, %r8736, %r2532;
- add.s32 %r8736, %r7786, %r7787;
-
-$L__BB0_1894:
- and.b32 %r7788, %r2523, -2147483648;
- shr.u32 %r7789, %r8736, 30;
- shl.b32 %r7790, %r8735, 2;
- or.b32 %r7791, %r7789, %r7790;
- shr.u32 %r7792, %r7791, 31;
- shr.u32 %r7793, %r8735, 30;
- add.s32 %r7794, %r7792, %r7793;
- neg.s32 %r7795, %r7794;
- setp.eq.s32 %p1601, %r7788, 0;
- selp.b32 %r8737, %r7794, %r7795, %p1601;
- setp.ne.s32 %p1602, %r7792, 0;
- xor.b32 %r7796, %r7788, -2147483648;
- selp.b32 %r7797, %r7796, %r7788, %p1602;
- selp.b32 %r7798, -1, 0, %p1602;
- xor.b32 %r7799, %r7791, %r7798;
- shl.b32 %r7800, %r8736, 2;
- xor.b32 %r7801, %r7800, %r7798;
- cvt.u64.u32 %rd2470, %r7799;
- cvt.u64.u32 %rd2471, %r7801;
- bfi.b64 %rd2472, %rd2470, %rd2471, 32, 32;
- cvt.rn.f64.s64 %fd255, %rd2472;
- mul.f64 %fd256, %fd255, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4997, %fd256;
- setp.eq.s32 %p1603, %r7797, 0;
- neg.f32 %f4998, %f4997;
- selp.f32 %f5895, %f4997, %f4998, %p1603;
-
-$L__BB0_1896:
- add.s32 %r2539, %r8737, 1;
- and.b32 %r2540, %r2539, 1;
- setp.eq.s32 %p1604, %r2540, 0;
- selp.f32 %f2133, %f5895, 0f3F800000, %p1604;
- mul.rn.f32 %f2134, %f5895, %f5895;
- mov.f32 %f5896, 0fB94D4153;
- @%p1604 bra $L__BB0_1898;
-
- mov.f32 %f5001, 0fBAB607ED;
- mov.f32 %f5002, 0f37CBAC00;
- fma.rn.f32 %f5896, %f5002, %f2134, %f5001;
-
-$L__BB0_1898:
- selp.f32 %f5003, 0f3C0885E4, 0f3D2AAABB, %p1604;
- fma.rn.f32 %f5004, %f5896, %f2134, %f5003;
- selp.f32 %f5005, 0fBE2AAAA8, 0fBEFFFFFF, %p1604;
- fma.rn.f32 %f5006, %f5004, %f2134, %f5005;
- mov.f32 %f5007, 0f00000000;
- fma.rn.f32 %f5008, %f2134, %f2133, %f5007;
- fma.rn.f32 %f5897, %f5006, %f5008, %f2133;
- and.b32 %r7803, %r2539, 2;
- setp.eq.s32 %p1606, %r7803, 0;
- @%p1606 bra $L__BB0_1900;
-
- mov.f32 %f5010, 0fBF800000;
- fma.rn.f32 %f5897, %f5897, %f5010, %f5007;
-
-$L__BB0_1900:
- add.f32 %f5898, %f5894, %f5897;
- bra.uni $L__BB0_1901;
-
-$L__BB0_1480:
- mov.b32 %r1943, %f5348;
- shr.u32 %r6727, %r1943, 23;
- and.b32 %r6728, %r6727, 255;
- add.s32 %r1944, %r6728, -128;
- shl.b32 %r6729, %r1943, 8;
- or.b32 %r1945, %r6729, -2147483648;
- shr.u32 %r1946, %r1944, 5;
+ selp.f32 %f4393, 0f3C0885E4, 0f3D2AAABB, %p1227;
+ fma.rn.f32 %f4394, %f5844, %f1696, %f4393;
+ selp.f32 %f4395, 0fBE2AAAA8, 0fBEFFFFFF, %p1227;
+ fma.rn.f32 %f4396, %f4394, %f1696, %f4395;
+ mov.f32 %f4397, 0f00000000;
+ fma.rn.f32 %f4398, %f1696, %f1695, %f4397;
+ fma.rn.f32 %f5281, %f4396, %f4398, %f1695;
+ and.b32 %r6593, %r8354, 2;
+ setp.eq.s32 %p1229, %r6593, 0;
+ @%p1229 bra $L__BB0_1454;
+
+ mov.f32 %f4400, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f4400, %f4397;
+
+$L__BB0_1454:
+ setp.lt.s32 %p24, %r11, %r1893;
+ @%p1219 bra $L__BB0_1467;
+
+ mul.f32 %f4401, %f5607, 0f3F22F983;
+ cvt.rni.s32.f32 %r8358, %f4401;
+ cvt.rn.f32.s32 %f4402, %r8358;
+ mov.f32 %f4403, 0fBFC90FDA;
+ fma.rn.f32 %f4404, %f4402, %f4403, %f5607;
+ mov.f32 %f4405, 0fB3A22168;
+ fma.rn.f32 %f4406, %f4402, %f4405, %f4404;
+ mov.f32 %f4407, 0fA7C234C5;
+ fma.rn.f32 %f5847, %f4402, %f4407, %f4406;
+ abs.f32 %f1704, %f5607;
+ setp.ltu.f32 %p1231, %f1704, 0f47CE4780;
+ @%p1231 bra $L__BB0_1463;
+
+ setp.eq.f32 %p1232, %f1704, 0f7F800000;
+ @%p1232 bra $L__BB0_1462;
+ bra.uni $L__BB0_1457;
+
+$L__BB0_1462:
+ mov.f32 %f4410, 0f00000000;
+ mul.rn.f32 %f5847, %f5607, %f4410;
+ mov.u32 %r8358, 0;
+ bra.uni $L__BB0_1463;
+
+$L__BB0_1457:
+ mov.b32 %r1913, %f5607;
+ shr.u32 %r6595, %r1913, 23;
+ and.b32 %r6596, %r6595, 255;
+ add.s32 %r1914, %r6596, -128;
+ shl.b32 %r6597, %r1913, 8;
+ or.b32 %r1915, %r6597, -2147483648;
+ shr.u32 %r1916, %r1914, 5;
mov.u64 %rd2702, 0;
- mov.u32 %r8610, 0;
+ mov.u32 %r8355, 0;
mov.u64 %rd2700, __cudart_i2opi_f;
mov.u64 %rd2701, %rd1;
-$L__BB0_1481:
+$L__BB0_1458:
.pragma "nounroll";
- ld.global.nc.u32 %r6730, [%rd2700];
- mad.wide.u32 %rd2129, %r6730, %r1945, %rd2702;
- shr.u64 %rd2702, %rd2129, 32;
- st.local.u32 [%rd2701], %rd2129;
+ ld.global.nc.u32 %r6598, [%rd2700];
+ mad.wide.u32 %rd2155, %r6598, %r1915, %rd2702;
+ shr.u64 %rd2702, %rd2155, 32;
+ st.local.u32 [%rd2701], %rd2155;
add.s64 %rd2701, %rd2701, 4;
add.s64 %rd2700, %rd2700, 4;
- add.s32 %r8610, %r8610, 1;
- setp.ne.s32 %p1250, %r8610, 6;
- @%p1250 bra $L__BB0_1481;
-
- st.local.u32 [%rd5], %rd2702;
- mov.u32 %r6731, 4;
- sub.s32 %r1949, %r6731, %r1946;
- mov.u32 %r6732, 6;
- sub.s32 %r6733, %r6732, %r1946;
- mul.wide.s32 %rd2130, %r6733, 4;
- add.s64 %rd2131, %rd1, %rd2130;
- ld.local.u32 %r8611, [%rd2131];
- ld.local.u32 %r8612, [%rd2131+-4];
- and.b32 %r1952, %r1944, 31;
- setp.eq.s32 %p1251, %r1952, 0;
- @%p1251 bra $L__BB0_1484;
-
- mov.u32 %r6734, 32;
- sub.s32 %r6735, %r6734, %r1952;
- shr.u32 %r6736, %r8612, %r6735;
- shl.b32 %r6737, %r8611, %r1952;
- add.s32 %r8611, %r6736, %r6737;
- mul.wide.s32 %rd2132, %r1949, 4;
- add.s64 %rd2133, %rd1, %rd2132;
- ld.local.u32 %r6738, [%rd2133];
- shr.u32 %r6739, %r6738, %r6735;
- shl.b32 %r6740, %r8612, %r1952;
- add.s32 %r8612, %r6739, %r6740;
-
-$L__BB0_1484:
- and.b32 %r6741, %r1943, -2147483648;
- shr.u32 %r6742, %r8612, 30;
- shl.b32 %r6743, %r8611, 2;
- or.b32 %r6744, %r6742, %r6743;
- shr.u32 %r6745, %r6744, 31;
- shr.u32 %r6746, %r8611, 30;
- add.s32 %r6747, %r6745, %r6746;
- neg.s32 %r6748, %r6747;
- setp.eq.s32 %p1252, %r6741, 0;
- selp.b32 %r8613, %r6747, %r6748, %p1252;
- setp.ne.s32 %p1253, %r6745, 0;
- xor.b32 %r6749, %r6741, -2147483648;
- selp.b32 %r6750, %r6749, %r6741, %p1253;
- selp.b32 %r6751, -1, 0, %p1253;
- xor.b32 %r6752, %r6744, %r6751;
- shl.b32 %r6753, %r8612, 2;
- xor.b32 %r6754, %r6753, %r6751;
- cvt.u64.u32 %rd2134, %r6752;
- cvt.u64.u32 %rd2135, %r6754;
- bfi.b64 %rd2136, %rd2134, %rd2135, 32, 32;
- cvt.rn.f64.s64 %fd193, %rd2136;
- mul.f64 %fd194, %fd193, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4331, %fd194;
- setp.eq.s32 %p1254, %r6750, 0;
- neg.f32 %f4332, %f4331;
- selp.f32 %f5751, %f4331, %f4332, %p1254;
-
-$L__BB0_1486:
- and.b32 %r1959, %r8613, 1;
- setp.eq.s32 %p1255, %r1959, 0;
- selp.f32 %f1663, %f5751, 0f3F800000, %p1255;
- mul.rn.f32 %f1664, %f5751, %f5751;
- mov.f32 %f5752, 0fB94D4153;
- @%p1255 bra $L__BB0_1488;
-
- mov.f32 %f4335, 0fBAB607ED;
- mov.f32 %f4336, 0f37CBAC00;
- fma.rn.f32 %f5752, %f4336, %f1664, %f4335;
-
-$L__BB0_1488:
- selp.f32 %f4337, 0f3C0885E4, 0f3D2AAABB, %p1255;
- fma.rn.f32 %f4338, %f5752, %f1664, %f4337;
- selp.f32 %f4339, 0fBE2AAAA8, 0fBEFFFFFF, %p1255;
- fma.rn.f32 %f4340, %f4338, %f1664, %f4339;
- mov.f32 %f4341, 0f00000000;
- fma.rn.f32 %f4342, %f1664, %f1663, %f4341;
- fma.rn.f32 %f5213, %f4340, %f4342, %f1663;
- and.b32 %r6756, %r8613, 2;
- setp.eq.s32 %p1257, %r6756, 0;
- @%p1257 bra $L__BB0_1490;
-
- mov.f32 %f4344, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f4344, %f4341;
-
-$L__BB0_1490:
- setp.lt.s32 %p24, %r14, %r1941;
- @%p1247 bra $L__BB0_1503;
-
- mul.f32 %f4345, %f5531, 0f3F22F983;
- cvt.rni.s32.f32 %r8617, %f4345;
- cvt.rn.f32.s32 %f4346, %r8617;
- mov.f32 %f4347, 0fBFC90FDA;
- fma.rn.f32 %f4348, %f4346, %f4347, %f5531;
- mov.f32 %f4349, 0fB3A22168;
- fma.rn.f32 %f4350, %f4346, %f4349, %f4348;
- mov.f32 %f4351, 0fA7C234C5;
- fma.rn.f32 %f5755, %f4346, %f4351, %f4350;
- abs.f32 %f1672, %f5531;
- setp.ltu.f32 %p1259, %f1672, 0f47CE4780;
- @%p1259 bra $L__BB0_1499;
-
- setp.eq.f32 %p1260, %f1672, 0f7F800000;
- @%p1260 bra $L__BB0_1498;
- bra.uni $L__BB0_1493;
-
-$L__BB0_1498:
- mov.f32 %f4354, 0f00000000;
- mul.rn.f32 %f5755, %f5531, %f4354;
- mov.u32 %r8617, 0;
- bra.uni $L__BB0_1499;
-
-$L__BB0_1493:
- mov.b32 %r1961, %f5531;
- shr.u32 %r6758, %r1961, 23;
- and.b32 %r6759, %r6758, 255;
- add.s32 %r1962, %r6759, -128;
- shl.b32 %r6760, %r1961, 8;
- or.b32 %r1963, %r6760, -2147483648;
- shr.u32 %r1964, %r1962, 5;
+ add.s32 %r8355, %r8355, 1;
+ setp.ne.s32 %p1233, %r8355, 6;
+ @%p1233 bra $L__BB0_1458;
+
+ st.local.u32 [%rd4], %rd2702;
+ mov.u32 %r6599, 4;
+ sub.s32 %r1919, %r6599, %r1916;
+ mov.u32 %r6600, 6;
+ sub.s32 %r6601, %r6600, %r1916;
+ mul.wide.s32 %rd2156, %r6601, 4;
+ add.s64 %rd2157, %rd1, %rd2156;
+ ld.local.u32 %r8356, [%rd2157];
+ ld.local.u32 %r8357, [%rd2157+-4];
+ and.b32 %r1922, %r1914, 31;
+ setp.eq.s32 %p1234, %r1922, 0;
+ @%p1234 bra $L__BB0_1461;
+
+ mov.u32 %r6602, 32;
+ sub.s32 %r6603, %r6602, %r1922;
+ shr.u32 %r6604, %r8357, %r6603;
+ shl.b32 %r6605, %r8356, %r1922;
+ add.s32 %r8356, %r6604, %r6605;
+ mul.wide.s32 %rd2158, %r1919, 4;
+ add.s64 %rd2159, %rd1, %rd2158;
+ ld.local.u32 %r6606, [%rd2159];
+ shr.u32 %r6607, %r6606, %r6603;
+ shl.b32 %r6608, %r8357, %r1922;
+ add.s32 %r8357, %r6607, %r6608;
+
+$L__BB0_1461:
+ and.b32 %r6609, %r1913, -2147483648;
+ shr.u32 %r6610, %r8357, 30;
+ shl.b32 %r6611, %r8356, 2;
+ or.b32 %r6612, %r6610, %r6611;
+ shr.u32 %r6613, %r6612, 31;
+ shr.u32 %r6614, %r8356, 30;
+ add.s32 %r6615, %r6613, %r6614;
+ neg.s32 %r6616, %r6615;
+ setp.eq.s32 %p1235, %r6609, 0;
+ selp.b32 %r8358, %r6615, %r6616, %p1235;
+ setp.ne.s32 %p1236, %r6613, 0;
+ xor.b32 %r6617, %r6609, -2147483648;
+ selp.b32 %r6618, %r6617, %r6609, %p1236;
+ selp.b32 %r6619, -1, 0, %p1236;
+ xor.b32 %r6620, %r6612, %r6619;
+ shl.b32 %r6621, %r8357, 2;
+ xor.b32 %r6622, %r6621, %r6619;
+ cvt.u64.u32 %rd2160, %r6620;
+ cvt.u64.u32 %rd2161, %r6622;
+ bfi.b64 %rd2162, %rd2160, %rd2161, 32, 32;
+ cvt.rn.f64.s64 %fd195, %rd2162;
+ mul.f64 %fd196, %fd195, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4408, %fd196;
+ setp.eq.s32 %p1237, %r6618, 0;
+ neg.f32 %f4409, %f4408;
+ selp.f32 %f5847, %f4408, %f4409, %p1237;
+
+$L__BB0_1463:
+ add.s32 %r1929, %r8358, 1;
+ and.b32 %r1930, %r1929, 1;
+ setp.eq.s32 %p1238, %r1930, 0;
+ selp.f32 %f1708, %f5847, 0f3F800000, %p1238;
+ mul.rn.f32 %f1709, %f5847, %f5847;
+ mov.f32 %f5848, 0fB94D4153;
+ @%p1238 bra $L__BB0_1465;
+
+ mov.f32 %f4412, 0fBAB607ED;
+ mov.f32 %f4413, 0f37CBAC00;
+ fma.rn.f32 %f5848, %f4413, %f1709, %f4412;
+
+$L__BB0_1465:
+ selp.f32 %f4414, 0f3C0885E4, 0f3D2AAABB, %p1238;
+ fma.rn.f32 %f4415, %f5848, %f1709, %f4414;
+ selp.f32 %f4416, 0fBE2AAAA8, 0fBEFFFFFF, %p1238;
+ fma.rn.f32 %f4417, %f4415, %f1709, %f4416;
+ mov.f32 %f4418, 0f00000000;
+ fma.rn.f32 %f4419, %f1709, %f1708, %f4418;
+ fma.rn.f32 %f5283, %f4417, %f4419, %f1708;
+ and.b32 %r6624, %r1929, 2;
+ setp.eq.s32 %p1240, %r6624, 0;
+ @%p1240 bra $L__BB0_1467;
+
+ mov.f32 %f4421, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f4421, %f4418;
+
+$L__BB0_1467:
+ selp.f32 %f1716, %f5283, %f5284, %p24;
+ selp.f32 %f1717, %f5281, %f5282, %p24;
+ @%p1219 bra $L__BB0_1469;
+
+ add.f32 %f5997, %f1717, %f1716;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1469:
+ @%p1194 bra $L__BB0_1498;
+
+ shl.b32 %r6625, %r12, 5;
+ mov.u32 %r6626, -32;
+ sub.s32 %r1931, %r6626, %r6625;
+ setp.ge.s32 %p1244, %r11, %r1931;
+ @%p1244 bra $L__BB0_1483;
+
+ mul.f32 %f4424, %f5415, 0f3F22F983;
+ cvt.rni.s32.f32 %r8362, %f4424;
+ cvt.rn.f32.s32 %f4425, %r8362;
+ mov.f32 %f4426, 0fBFC90FDA;
+ fma.rn.f32 %f4427, %f4425, %f4426, %f5415;
+ mov.f32 %f4428, 0fB3A22168;
+ fma.rn.f32 %f4429, %f4425, %f4428, %f4427;
+ mov.f32 %f4430, 0fA7C234C5;
+ fma.rn.f32 %f5856, %f4425, %f4430, %f4429;
+ abs.f32 %f1725, %f5415;
+ setp.ltu.f32 %p1245, %f1725, 0f47CE4780;
+ @%p1245 bra $L__BB0_1479;
+
+ setp.eq.f32 %p1246, %f1725, 0f7F800000;
+ @%p1246 bra $L__BB0_1478;
+ bra.uni $L__BB0_1473;
+
+$L__BB0_1478:
+ mov.f32 %f4433, 0f00000000;
+ mul.rn.f32 %f5856, %f5415, %f4433;
+ mov.u32 %r8362, 0;
+ bra.uni $L__BB0_1479;
+
+$L__BB0_1473:
+ mov.b32 %r1933, %f5415;
+ shr.u32 %r6628, %r1933, 23;
+ and.b32 %r6629, %r6628, 255;
+ add.s32 %r1934, %r6629, -128;
+ shl.b32 %r6630, %r1933, 8;
+ or.b32 %r1935, %r6630, -2147483648;
+ shr.u32 %r1936, %r1934, 5;
mov.u64 %rd2705, 0;
- mov.u32 %r8614, 0;
+ mov.u32 %r8359, 0;
mov.u64 %rd2703, __cudart_i2opi_f;
mov.u64 %rd2704, %rd1;
-$L__BB0_1494:
+$L__BB0_1474:
.pragma "nounroll";
- ld.global.nc.u32 %r6761, [%rd2703];
- mad.wide.u32 %rd2139, %r6761, %r1963, %rd2705;
- shr.u64 %rd2705, %rd2139, 32;
- st.local.u32 [%rd2704], %rd2139;
+ ld.global.nc.u32 %r6631, [%rd2703];
+ mad.wide.u32 %rd2165, %r6631, %r1935, %rd2705;
+ shr.u64 %rd2705, %rd2165, 32;
+ st.local.u32 [%rd2704], %rd2165;
add.s64 %rd2704, %rd2704, 4;
add.s64 %rd2703, %rd2703, 4;
- add.s32 %r8614, %r8614, 1;
- setp.ne.s32 %p1261, %r8614, 6;
- @%p1261 bra $L__BB0_1494;
-
- st.local.u32 [%rd5], %rd2705;
- mov.u32 %r6762, 4;
- sub.s32 %r1967, %r6762, %r1964;
- mov.u32 %r6763, 6;
- sub.s32 %r6764, %r6763, %r1964;
- mul.wide.s32 %rd2140, %r6764, 4;
- add.s64 %rd2141, %rd1, %rd2140;
- ld.local.u32 %r8615, [%rd2141];
- ld.local.u32 %r8616, [%rd2141+-4];
- and.b32 %r1970, %r1962, 31;
- setp.eq.s32 %p1262, %r1970, 0;
- @%p1262 bra $L__BB0_1497;
-
- mov.u32 %r6765, 32;
- sub.s32 %r6766, %r6765, %r1970;
- shr.u32 %r6767, %r8616, %r6766;
- shl.b32 %r6768, %r8615, %r1970;
- add.s32 %r8615, %r6767, %r6768;
- mul.wide.s32 %rd2142, %r1967, 4;
- add.s64 %rd2143, %rd1, %rd2142;
- ld.local.u32 %r6769, [%rd2143];
- shr.u32 %r6770, %r6769, %r6766;
- shl.b32 %r6771, %r8616, %r1970;
- add.s32 %r8616, %r6770, %r6771;
-
-$L__BB0_1497:
- and.b32 %r6772, %r1961, -2147483648;
- shr.u32 %r6773, %r8616, 30;
- shl.b32 %r6774, %r8615, 2;
- or.b32 %r6775, %r6773, %r6774;
- shr.u32 %r6776, %r6775, 31;
- shr.u32 %r6777, %r8615, 30;
- add.s32 %r6778, %r6776, %r6777;
- neg.s32 %r6779, %r6778;
- setp.eq.s32 %p1263, %r6772, 0;
- selp.b32 %r8617, %r6778, %r6779, %p1263;
- setp.ne.s32 %p1264, %r6776, 0;
- xor.b32 %r6780, %r6772, -2147483648;
- selp.b32 %r6781, %r6780, %r6772, %p1264;
- selp.b32 %r6782, -1, 0, %p1264;
- xor.b32 %r6783, %r6775, %r6782;
- shl.b32 %r6784, %r8616, 2;
- xor.b32 %r6785, %r6784, %r6782;
- cvt.u64.u32 %rd2144, %r6783;
- cvt.u64.u32 %rd2145, %r6785;
- bfi.b64 %rd2146, %rd2144, %rd2145, 32, 32;
- cvt.rn.f64.s64 %fd195, %rd2146;
- mul.f64 %fd196, %fd195, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4352, %fd196;
- setp.eq.s32 %p1265, %r6781, 0;
- neg.f32 %f4353, %f4352;
- selp.f32 %f5755, %f4352, %f4353, %p1265;
-
-$L__BB0_1499:
- add.s32 %r1977, %r8617, 1;
- and.b32 %r1978, %r1977, 1;
- setp.eq.s32 %p1266, %r1978, 0;
- selp.f32 %f1676, %f5755, 0f3F800000, %p1266;
- mul.rn.f32 %f1677, %f5755, %f5755;
- mov.f32 %f5756, 0fB94D4153;
- @%p1266 bra $L__BB0_1501;
-
- mov.f32 %f4356, 0fBAB607ED;
- mov.f32 %f4357, 0f37CBAC00;
- fma.rn.f32 %f5756, %f4357, %f1677, %f4356;
-
-$L__BB0_1501:
- selp.f32 %f4358, 0f3C0885E4, 0f3D2AAABB, %p1266;
- fma.rn.f32 %f4359, %f5756, %f1677, %f4358;
- selp.f32 %f4360, 0fBE2AAAA8, 0fBEFFFFFF, %p1266;
- fma.rn.f32 %f4361, %f4359, %f1677, %f4360;
- mov.f32 %f4362, 0f00000000;
- fma.rn.f32 %f4363, %f1677, %f1676, %f4362;
- fma.rn.f32 %f5215, %f4361, %f4363, %f1676;
- and.b32 %r6787, %r1977, 2;
- setp.eq.s32 %p1268, %r6787, 0;
- @%p1268 bra $L__BB0_1503;
-
- mov.f32 %f4365, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f4365, %f4362;
-
-$L__BB0_1503:
- selp.f32 %f1684, %f5215, %f5216, %p24;
- selp.f32 %f1685, %f5213, %f5214, %p24;
- @%p1247 bra $L__BB0_1505;
-
- add.f32 %f5905, %f1685, %f1684;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1505:
- @%p1214 bra $L__BB0_1534;
-
- shl.b32 %r6789, %r12, 5;
- mov.u32 %r6790, -32;
- sub.s32 %r1979, %r6790, %r6789;
- setp.ge.s32 %p1272, %r14, %r1979;
- @%p1272 bra $L__BB0_1519;
-
- mul.f32 %f4368, %f5347, 0f3F22F983;
- cvt.rni.s32.f32 %r8621, %f4368;
- cvt.rn.f32.s32 %f4369, %r8621;
- mov.f32 %f4370, 0fBFC90FDA;
- fma.rn.f32 %f4371, %f4369, %f4370, %f5347;
- mov.f32 %f4372, 0fB3A22168;
- fma.rn.f32 %f4373, %f4369, %f4372, %f4371;
- mov.f32 %f4374, 0fA7C234C5;
- fma.rn.f32 %f5764, %f4369, %f4374, %f4373;
- abs.f32 %f1693, %f5347;
- setp.ltu.f32 %p1273, %f1693, 0f47CE4780;
- @%p1273 bra $L__BB0_1515;
-
- setp.eq.f32 %p1274, %f1693, 0f7F800000;
- @%p1274 bra $L__BB0_1514;
- bra.uni $L__BB0_1509;
-
-$L__BB0_1514:
- mov.f32 %f4377, 0f00000000;
- mul.rn.f32 %f5764, %f5347, %f4377;
- mov.u32 %r8621, 0;
- bra.uni $L__BB0_1515;
-
-$L__BB0_1509:
- mov.b32 %r1981, %f5347;
- shr.u32 %r6792, %r1981, 23;
- and.b32 %r6793, %r6792, 255;
- add.s32 %r1982, %r6793, -128;
- shl.b32 %r6794, %r1981, 8;
- or.b32 %r1983, %r6794, -2147483648;
- shr.u32 %r1984, %r1982, 5;
+ add.s32 %r8359, %r8359, 1;
+ setp.ne.s32 %p1247, %r8359, 6;
+ @%p1247 bra $L__BB0_1474;
+
+ st.local.u32 [%rd4], %rd2705;
+ mov.u32 %r6632, 4;
+ sub.s32 %r1939, %r6632, %r1936;
+ mov.u32 %r6633, 6;
+ sub.s32 %r6634, %r6633, %r1936;
+ mul.wide.s32 %rd2166, %r6634, 4;
+ add.s64 %rd2167, %rd1, %rd2166;
+ ld.local.u32 %r8360, [%rd2167];
+ ld.local.u32 %r8361, [%rd2167+-4];
+ and.b32 %r1942, %r1934, 31;
+ setp.eq.s32 %p1248, %r1942, 0;
+ @%p1248 bra $L__BB0_1477;
+
+ mov.u32 %r6635, 32;
+ sub.s32 %r6636, %r6635, %r1942;
+ shr.u32 %r6637, %r8361, %r6636;
+ shl.b32 %r6638, %r8360, %r1942;
+ add.s32 %r8360, %r6637, %r6638;
+ mul.wide.s32 %rd2168, %r1939, 4;
+ add.s64 %rd2169, %rd1, %rd2168;
+ ld.local.u32 %r6639, [%rd2169];
+ shr.u32 %r6640, %r6639, %r6636;
+ shl.b32 %r6641, %r8361, %r1942;
+ add.s32 %r8361, %r6640, %r6641;
+
+$L__BB0_1477:
+ and.b32 %r6642, %r1933, -2147483648;
+ shr.u32 %r6643, %r8361, 30;
+ shl.b32 %r6644, %r8360, 2;
+ or.b32 %r6645, %r6643, %r6644;
+ shr.u32 %r6646, %r6645, 31;
+ shr.u32 %r6647, %r8360, 30;
+ add.s32 %r6648, %r6646, %r6647;
+ neg.s32 %r6649, %r6648;
+ setp.eq.s32 %p1249, %r6642, 0;
+ selp.b32 %r8362, %r6648, %r6649, %p1249;
+ setp.ne.s32 %p1250, %r6646, 0;
+ xor.b32 %r6650, %r6642, -2147483648;
+ selp.b32 %r6651, %r6650, %r6642, %p1250;
+ selp.b32 %r6652, -1, 0, %p1250;
+ xor.b32 %r6653, %r6645, %r6652;
+ shl.b32 %r6654, %r8361, 2;
+ xor.b32 %r6655, %r6654, %r6652;
+ cvt.u64.u32 %rd2170, %r6653;
+ cvt.u64.u32 %rd2171, %r6655;
+ bfi.b64 %rd2172, %rd2170, %rd2171, 32, 32;
+ cvt.rn.f64.s64 %fd197, %rd2172;
+ mul.f64 %fd198, %fd197, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4431, %fd198;
+ setp.eq.s32 %p1251, %r6651, 0;
+ neg.f32 %f4432, %f4431;
+ selp.f32 %f5856, %f4431, %f4432, %p1251;
+
+$L__BB0_1479:
+ and.b32 %r1949, %r8362, 1;
+ setp.eq.s32 %p1252, %r1949, 0;
+ selp.f32 %f1729, %f5856, 0f3F800000, %p1252;
+ mul.rn.f32 %f1730, %f5856, %f5856;
+ mov.f32 %f5857, 0fB94D4153;
+ @%p1252 bra $L__BB0_1481;
+
+ mov.f32 %f4435, 0fBAB607ED;
+ mov.f32 %f4436, 0f37CBAC00;
+ fma.rn.f32 %f5857, %f4436, %f1730, %f4435;
+
+$L__BB0_1481:
+ selp.f32 %f4437, 0f3C0885E4, 0f3D2AAABB, %p1252;
+ fma.rn.f32 %f4438, %f5857, %f1730, %f4437;
+ selp.f32 %f4439, 0fBE2AAAA8, 0fBEFFFFFF, %p1252;
+ fma.rn.f32 %f4440, %f4438, %f1730, %f4439;
+ mov.f32 %f4441, 0f00000000;
+ fma.rn.f32 %f4442, %f1730, %f1729, %f4441;
+ fma.rn.f32 %f5281, %f4440, %f4442, %f1729;
+ and.b32 %r6657, %r8362, 2;
+ setp.eq.s32 %p1254, %r6657, 0;
+ @%p1254 bra $L__BB0_1483;
+
+ mov.f32 %f4444, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f4444, %f4441;
+
+$L__BB0_1483:
+ setp.lt.s32 %p25, %r11, %r1931;
+ @%p1244 bra $L__BB0_1496;
+
+ mul.f32 %f4445, %f5606, 0f3F22F983;
+ cvt.rni.s32.f32 %r8366, %f4445;
+ cvt.rn.f32.s32 %f4446, %r8366;
+ mov.f32 %f4447, 0fBFC90FDA;
+ fma.rn.f32 %f4448, %f4446, %f4447, %f5606;
+ mov.f32 %f4449, 0fB3A22168;
+ fma.rn.f32 %f4450, %f4446, %f4449, %f4448;
+ mov.f32 %f4451, 0fA7C234C5;
+ fma.rn.f32 %f5860, %f4446, %f4451, %f4450;
+ abs.f32 %f1738, %f5606;
+ setp.ltu.f32 %p1256, %f1738, 0f47CE4780;
+ @%p1256 bra $L__BB0_1492;
+
+ setp.eq.f32 %p1257, %f1738, 0f7F800000;
+ @%p1257 bra $L__BB0_1491;
+ bra.uni $L__BB0_1486;
+
+$L__BB0_1491:
+ mov.f32 %f4454, 0f00000000;
+ mul.rn.f32 %f5860, %f5606, %f4454;
+ mov.u32 %r8366, 0;
+ bra.uni $L__BB0_1492;
+
+$L__BB0_1486:
+ mov.b32 %r1951, %f5606;
+ shr.u32 %r6659, %r1951, 23;
+ and.b32 %r6660, %r6659, 255;
+ add.s32 %r1952, %r6660, -128;
+ shl.b32 %r6661, %r1951, 8;
+ or.b32 %r1953, %r6661, -2147483648;
+ shr.u32 %r1954, %r1952, 5;
mov.u64 %rd2708, 0;
- mov.u32 %r8618, 0;
+ mov.u32 %r8363, 0;
mov.u64 %rd2706, __cudart_i2opi_f;
mov.u64 %rd2707, %rd1;
-$L__BB0_1510:
+$L__BB0_1487:
.pragma "nounroll";
- ld.global.nc.u32 %r6795, [%rd2706];
- mad.wide.u32 %rd2149, %r6795, %r1983, %rd2708;
- shr.u64 %rd2708, %rd2149, 32;
- st.local.u32 [%rd2707], %rd2149;
+ ld.global.nc.u32 %r6662, [%rd2706];
+ mad.wide.u32 %rd2175, %r6662, %r1953, %rd2708;
+ shr.u64 %rd2708, %rd2175, 32;
+ st.local.u32 [%rd2707], %rd2175;
add.s64 %rd2707, %rd2707, 4;
add.s64 %rd2706, %rd2706, 4;
- add.s32 %r8618, %r8618, 1;
- setp.ne.s32 %p1275, %r8618, 6;
- @%p1275 bra $L__BB0_1510;
-
- st.local.u32 [%rd5], %rd2708;
- mov.u32 %r6796, 4;
- sub.s32 %r1987, %r6796, %r1984;
- mov.u32 %r6797, 6;
- sub.s32 %r6798, %r6797, %r1984;
- mul.wide.s32 %rd2150, %r6798, 4;
- add.s64 %rd2151, %rd1, %rd2150;
- ld.local.u32 %r8619, [%rd2151];
- ld.local.u32 %r8620, [%rd2151+-4];
- and.b32 %r1990, %r1982, 31;
- setp.eq.s32 %p1276, %r1990, 0;
- @%p1276 bra $L__BB0_1513;
-
- mov.u32 %r6799, 32;
- sub.s32 %r6800, %r6799, %r1990;
- shr.u32 %r6801, %r8620, %r6800;
- shl.b32 %r6802, %r8619, %r1990;
- add.s32 %r8619, %r6801, %r6802;
- mul.wide.s32 %rd2152, %r1987, 4;
- add.s64 %rd2153, %rd1, %rd2152;
- ld.local.u32 %r6803, [%rd2153];
- shr.u32 %r6804, %r6803, %r6800;
- shl.b32 %r6805, %r8620, %r1990;
- add.s32 %r8620, %r6804, %r6805;
-
-$L__BB0_1513:
- and.b32 %r6806, %r1981, -2147483648;
- shr.u32 %r6807, %r8620, 30;
- shl.b32 %r6808, %r8619, 2;
- or.b32 %r6809, %r6807, %r6808;
- shr.u32 %r6810, %r6809, 31;
- shr.u32 %r6811, %r8619, 30;
- add.s32 %r6812, %r6810, %r6811;
- neg.s32 %r6813, %r6812;
- setp.eq.s32 %p1277, %r6806, 0;
- selp.b32 %r8621, %r6812, %r6813, %p1277;
- setp.ne.s32 %p1278, %r6810, 0;
- xor.b32 %r6814, %r6806, -2147483648;
- selp.b32 %r6815, %r6814, %r6806, %p1278;
- selp.b32 %r6816, -1, 0, %p1278;
- xor.b32 %r6817, %r6809, %r6816;
- shl.b32 %r6818, %r8620, 2;
- xor.b32 %r6819, %r6818, %r6816;
- cvt.u64.u32 %rd2154, %r6817;
- cvt.u64.u32 %rd2155, %r6819;
- bfi.b64 %rd2156, %rd2154, %rd2155, 32, 32;
- cvt.rn.f64.s64 %fd197, %rd2156;
- mul.f64 %fd198, %fd197, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4375, %fd198;
- setp.eq.s32 %p1279, %r6815, 0;
- neg.f32 %f4376, %f4375;
- selp.f32 %f5764, %f4375, %f4376, %p1279;
-
-$L__BB0_1515:
- and.b32 %r1997, %r8621, 1;
- setp.eq.s32 %p1280, %r1997, 0;
- selp.f32 %f1697, %f5764, 0f3F800000, %p1280;
- mul.rn.f32 %f1698, %f5764, %f5764;
- mov.f32 %f5765, 0fB94D4153;
- @%p1280 bra $L__BB0_1517;
-
- mov.f32 %f4379, 0fBAB607ED;
- mov.f32 %f4380, 0f37CBAC00;
- fma.rn.f32 %f5765, %f4380, %f1698, %f4379;
-
-$L__BB0_1517:
- selp.f32 %f4381, 0f3C0885E4, 0f3D2AAABB, %p1280;
- fma.rn.f32 %f4382, %f5765, %f1698, %f4381;
- selp.f32 %f4383, 0fBE2AAAA8, 0fBEFFFFFF, %p1280;
- fma.rn.f32 %f4384, %f4382, %f1698, %f4383;
- mov.f32 %f4385, 0f00000000;
- fma.rn.f32 %f4386, %f1698, %f1697, %f4385;
- fma.rn.f32 %f5213, %f4384, %f4386, %f1697;
- and.b32 %r6821, %r8621, 2;
- setp.eq.s32 %p1282, %r6821, 0;
- @%p1282 bra $L__BB0_1519;
-
- mov.f32 %f4388, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f4388, %f4385;
-
-$L__BB0_1519:
- setp.lt.s32 %p25, %r14, %r1979;
- @%p1272 bra $L__BB0_1532;
-
- mul.f32 %f4389, %f5339, 0f3F22F983;
- cvt.rni.s32.f32 %r8625, %f4389;
- cvt.rn.f32.s32 %f4390, %r8625;
- mov.f32 %f4391, 0fBFC90FDA;
- fma.rn.f32 %f4392, %f4390, %f4391, %f5339;
- mov.f32 %f4393, 0fB3A22168;
- fma.rn.f32 %f4394, %f4390, %f4393, %f4392;
- mov.f32 %f4395, 0fA7C234C5;
- fma.rn.f32 %f5768, %f4390, %f4395, %f4394;
- abs.f32 %f1706, %f5339;
- setp.ltu.f32 %p1284, %f1706, 0f47CE4780;
- @%p1284 bra $L__BB0_1528;
-
- setp.eq.f32 %p1285, %f1706, 0f7F800000;
- @%p1285 bra $L__BB0_1527;
- bra.uni $L__BB0_1522;
-
-$L__BB0_1527:
- mov.f32 %f4398, 0f00000000;
- mul.rn.f32 %f5768, %f5339, %f4398;
- mov.u32 %r8625, 0;
- bra.uni $L__BB0_1528;
-
-$L__BB0_1522:
- mov.b32 %r1999, %f5339;
- shr.u32 %r6823, %r1999, 23;
- and.b32 %r6824, %r6823, 255;
- add.s32 %r2000, %r6824, -128;
- shl.b32 %r6825, %r1999, 8;
- or.b32 %r2001, %r6825, -2147483648;
- shr.u32 %r2002, %r2000, 5;
+ add.s32 %r8363, %r8363, 1;
+ setp.ne.s32 %p1258, %r8363, 6;
+ @%p1258 bra $L__BB0_1487;
+
+ st.local.u32 [%rd4], %rd2708;
+ mov.u32 %r6663, 4;
+ sub.s32 %r1957, %r6663, %r1954;
+ mov.u32 %r6664, 6;
+ sub.s32 %r6665, %r6664, %r1954;
+ mul.wide.s32 %rd2176, %r6665, 4;
+ add.s64 %rd2177, %rd1, %rd2176;
+ ld.local.u32 %r8364, [%rd2177];
+ ld.local.u32 %r8365, [%rd2177+-4];
+ and.b32 %r1960, %r1952, 31;
+ setp.eq.s32 %p1259, %r1960, 0;
+ @%p1259 bra $L__BB0_1490;
+
+ mov.u32 %r6666, 32;
+ sub.s32 %r6667, %r6666, %r1960;
+ shr.u32 %r6668, %r8365, %r6667;
+ shl.b32 %r6669, %r8364, %r1960;
+ add.s32 %r8364, %r6668, %r6669;
+ mul.wide.s32 %rd2178, %r1957, 4;
+ add.s64 %rd2179, %rd1, %rd2178;
+ ld.local.u32 %r6670, [%rd2179];
+ shr.u32 %r6671, %r6670, %r6667;
+ shl.b32 %r6672, %r8365, %r1960;
+ add.s32 %r8365, %r6671, %r6672;
+
+$L__BB0_1490:
+ and.b32 %r6673, %r1951, -2147483648;
+ shr.u32 %r6674, %r8365, 30;
+ shl.b32 %r6675, %r8364, 2;
+ or.b32 %r6676, %r6674, %r6675;
+ shr.u32 %r6677, %r6676, 31;
+ shr.u32 %r6678, %r8364, 30;
+ add.s32 %r6679, %r6677, %r6678;
+ neg.s32 %r6680, %r6679;
+ setp.eq.s32 %p1260, %r6673, 0;
+ selp.b32 %r8366, %r6679, %r6680, %p1260;
+ setp.ne.s32 %p1261, %r6677, 0;
+ xor.b32 %r6681, %r6673, -2147483648;
+ selp.b32 %r6682, %r6681, %r6673, %p1261;
+ selp.b32 %r6683, -1, 0, %p1261;
+ xor.b32 %r6684, %r6676, %r6683;
+ shl.b32 %r6685, %r8365, 2;
+ xor.b32 %r6686, %r6685, %r6683;
+ cvt.u64.u32 %rd2180, %r6684;
+ cvt.u64.u32 %rd2181, %r6686;
+ bfi.b64 %rd2182, %rd2180, %rd2181, 32, 32;
+ cvt.rn.f64.s64 %fd199, %rd2182;
+ mul.f64 %fd200, %fd199, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4452, %fd200;
+ setp.eq.s32 %p1262, %r6682, 0;
+ neg.f32 %f4453, %f4452;
+ selp.f32 %f5860, %f4452, %f4453, %p1262;
+
+$L__BB0_1492:
+ add.s32 %r1967, %r8366, 1;
+ and.b32 %r1968, %r1967, 1;
+ setp.eq.s32 %p1263, %r1968, 0;
+ selp.f32 %f1742, %f5860, 0f3F800000, %p1263;
+ mul.rn.f32 %f1743, %f5860, %f5860;
+ mov.f32 %f5861, 0fB94D4153;
+ @%p1263 bra $L__BB0_1494;
+
+ mov.f32 %f4456, 0fBAB607ED;
+ mov.f32 %f4457, 0f37CBAC00;
+ fma.rn.f32 %f5861, %f4457, %f1743, %f4456;
+
+$L__BB0_1494:
+ selp.f32 %f4458, 0f3C0885E4, 0f3D2AAABB, %p1263;
+ fma.rn.f32 %f4459, %f5861, %f1743, %f4458;
+ selp.f32 %f4460, 0fBE2AAAA8, 0fBEFFFFFF, %p1263;
+ fma.rn.f32 %f4461, %f4459, %f1743, %f4460;
+ mov.f32 %f4462, 0f00000000;
+ fma.rn.f32 %f4463, %f1743, %f1742, %f4462;
+ fma.rn.f32 %f5283, %f4461, %f4463, %f1742;
+ and.b32 %r6688, %r1967, 2;
+ setp.eq.s32 %p1265, %r6688, 0;
+ @%p1265 bra $L__BB0_1496;
+
+ mov.f32 %f4465, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f4465, %f4462;
+
+$L__BB0_1496:
+ selp.f32 %f1750, %f5283, %f5284, %p25;
+ selp.f32 %f1751, %f5281, %f5282, %p25;
+ @%p1244 bra $L__BB0_1498;
+
+ add.f32 %f5996, %f1751, %f1750;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1498:
+ @%p1197 bra $L__BB0_1527;
+
+ shl.b32 %r6689, %r12, 5;
+ neg.s32 %r1969, %r6689;
+ setp.ge.s32 %p1269, %r11, %r1969;
+ @%p1269 bra $L__BB0_1512;
+
+ mul.f32 %f4468, %f5414, 0f3F22F983;
+ cvt.rni.s32.f32 %r8370, %f4468;
+ cvt.rn.f32.s32 %f4469, %r8370;
+ mov.f32 %f4470, 0fBFC90FDA;
+ fma.rn.f32 %f4471, %f4469, %f4470, %f5414;
+ mov.f32 %f4472, 0fB3A22168;
+ fma.rn.f32 %f4473, %f4469, %f4472, %f4471;
+ mov.f32 %f4474, 0fA7C234C5;
+ fma.rn.f32 %f5869, %f4469, %f4474, %f4473;
+ abs.f32 %f1759, %f5414;
+ setp.ltu.f32 %p1270, %f1759, 0f47CE4780;
+ @%p1270 bra $L__BB0_1508;
+
+ setp.eq.f32 %p1271, %f1759, 0f7F800000;
+ @%p1271 bra $L__BB0_1507;
+ bra.uni $L__BB0_1502;
+
+$L__BB0_1507:
+ mov.f32 %f4477, 0f00000000;
+ mul.rn.f32 %f5869, %f5414, %f4477;
+ mov.u32 %r8370, 0;
+ bra.uni $L__BB0_1508;
+
+$L__BB0_1502:
+ mov.b32 %r1971, %f5414;
+ shr.u32 %r6691, %r1971, 23;
+ and.b32 %r6692, %r6691, 255;
+ add.s32 %r1972, %r6692, -128;
+ shl.b32 %r6693, %r1971, 8;
+ or.b32 %r1973, %r6693, -2147483648;
+ shr.u32 %r1974, %r1972, 5;
mov.u64 %rd2711, 0;
- mov.u32 %r8622, 0;
+ mov.u32 %r8367, 0;
mov.u64 %rd2709, __cudart_i2opi_f;
mov.u64 %rd2710, %rd1;
-$L__BB0_1523:
+$L__BB0_1503:
.pragma "nounroll";
- ld.global.nc.u32 %r6826, [%rd2709];
- mad.wide.u32 %rd2159, %r6826, %r2001, %rd2711;
- shr.u64 %rd2711, %rd2159, 32;
- st.local.u32 [%rd2710], %rd2159;
+ ld.global.nc.u32 %r6694, [%rd2709];
+ mad.wide.u32 %rd2185, %r6694, %r1973, %rd2711;
+ shr.u64 %rd2711, %rd2185, 32;
+ st.local.u32 [%rd2710], %rd2185;
add.s64 %rd2710, %rd2710, 4;
add.s64 %rd2709, %rd2709, 4;
- add.s32 %r8622, %r8622, 1;
- setp.ne.s32 %p1286, %r8622, 6;
- @%p1286 bra $L__BB0_1523;
-
- st.local.u32 [%rd5], %rd2711;
- mov.u32 %r6827, 4;
- sub.s32 %r2005, %r6827, %r2002;
- mov.u32 %r6828, 6;
- sub.s32 %r6829, %r6828, %r2002;
- mul.wide.s32 %rd2160, %r6829, 4;
- add.s64 %rd2161, %rd1, %rd2160;
- ld.local.u32 %r8623, [%rd2161];
- ld.local.u32 %r8624, [%rd2161+-4];
- and.b32 %r2008, %r2000, 31;
- setp.eq.s32 %p1287, %r2008, 0;
- @%p1287 bra $L__BB0_1526;
-
- mov.u32 %r6830, 32;
- sub.s32 %r6831, %r6830, %r2008;
- shr.u32 %r6832, %r8624, %r6831;
- shl.b32 %r6833, %r8623, %r2008;
- add.s32 %r8623, %r6832, %r6833;
- mul.wide.s32 %rd2162, %r2005, 4;
- add.s64 %rd2163, %rd1, %rd2162;
- ld.local.u32 %r6834, [%rd2163];
- shr.u32 %r6835, %r6834, %r6831;
- shl.b32 %r6836, %r8624, %r2008;
- add.s32 %r8624, %r6835, %r6836;
-
-$L__BB0_1526:
- and.b32 %r6837, %r1999, -2147483648;
- shr.u32 %r6838, %r8624, 30;
- shl.b32 %r6839, %r8623, 2;
- or.b32 %r6840, %r6838, %r6839;
- shr.u32 %r6841, %r6840, 31;
- shr.u32 %r6842, %r8623, 30;
- add.s32 %r6843, %r6841, %r6842;
- neg.s32 %r6844, %r6843;
- setp.eq.s32 %p1288, %r6837, 0;
- selp.b32 %r8625, %r6843, %r6844, %p1288;
- setp.ne.s32 %p1289, %r6841, 0;
- xor.b32 %r6845, %r6837, -2147483648;
- selp.b32 %r6846, %r6845, %r6837, %p1289;
- selp.b32 %r6847, -1, 0, %p1289;
- xor.b32 %r6848, %r6840, %r6847;
- shl.b32 %r6849, %r8624, 2;
- xor.b32 %r6850, %r6849, %r6847;
- cvt.u64.u32 %rd2164, %r6848;
- cvt.u64.u32 %rd2165, %r6850;
- bfi.b64 %rd2166, %rd2164, %rd2165, 32, 32;
- cvt.rn.f64.s64 %fd199, %rd2166;
- mul.f64 %fd200, %fd199, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4396, %fd200;
- setp.eq.s32 %p1290, %r6846, 0;
- neg.f32 %f4397, %f4396;
- selp.f32 %f5768, %f4396, %f4397, %p1290;
-
-$L__BB0_1528:
- add.s32 %r2015, %r8625, 1;
- and.b32 %r2016, %r2015, 1;
- setp.eq.s32 %p1291, %r2016, 0;
- selp.f32 %f1710, %f5768, 0f3F800000, %p1291;
- mul.rn.f32 %f1711, %f5768, %f5768;
- mov.f32 %f5769, 0fB94D4153;
- @%p1291 bra $L__BB0_1530;
-
- mov.f32 %f4400, 0fBAB607ED;
- mov.f32 %f4401, 0f37CBAC00;
- fma.rn.f32 %f5769, %f4401, %f1711, %f4400;
-
-$L__BB0_1530:
- selp.f32 %f4402, 0f3C0885E4, 0f3D2AAABB, %p1291;
- fma.rn.f32 %f4403, %f5769, %f1711, %f4402;
- selp.f32 %f4404, 0fBE2AAAA8, 0fBEFFFFFF, %p1291;
- fma.rn.f32 %f4405, %f4403, %f1711, %f4404;
- mov.f32 %f4406, 0f00000000;
- fma.rn.f32 %f4407, %f1711, %f1710, %f4406;
- fma.rn.f32 %f5215, %f4405, %f4407, %f1710;
- and.b32 %r6852, %r2015, 2;
- setp.eq.s32 %p1293, %r6852, 0;
- @%p1293 bra $L__BB0_1532;
-
- mov.f32 %f4409, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f4409, %f4406;
-
-$L__BB0_1532:
- selp.f32 %f1718, %f5215, %f5216, %p25;
- selp.f32 %f1719, %f5213, %f5214, %p25;
- @%p1272 bra $L__BB0_1534;
-
- add.f32 %f5904, %f1719, %f1718;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1534:
- @%p1218 bra $L__BB0_1563;
-
- shl.b32 %r6854, %r12, 5;
- neg.s32 %r2017, %r6854;
- setp.ge.s32 %p1297, %r14, %r2017;
- @%p1297 bra $L__BB0_1548;
-
- mul.f32 %f4412, %f5346, 0f3F22F983;
- cvt.rni.s32.f32 %r8629, %f4412;
- cvt.rn.f32.s32 %f4413, %r8629;
- mov.f32 %f4414, 0fBFC90FDA;
- fma.rn.f32 %f4415, %f4413, %f4414, %f5346;
- mov.f32 %f4416, 0fB3A22168;
- fma.rn.f32 %f4417, %f4413, %f4416, %f4415;
- mov.f32 %f4418, 0fA7C234C5;
- fma.rn.f32 %f5777, %f4413, %f4418, %f4417;
- abs.f32 %f1727, %f5346;
- setp.ltu.f32 %p1298, %f1727, 0f47CE4780;
- @%p1298 bra $L__BB0_1544;
-
- setp.eq.f32 %p1299, %f1727, 0f7F800000;
- @%p1299 bra $L__BB0_1543;
- bra.uni $L__BB0_1538;
-
-$L__BB0_1543:
- mov.f32 %f4421, 0f00000000;
- mul.rn.f32 %f5777, %f5346, %f4421;
- mov.u32 %r8629, 0;
- bra.uni $L__BB0_1544;
-
-$L__BB0_1538:
- mov.b32 %r2019, %f5346;
- shr.u32 %r6856, %r2019, 23;
- and.b32 %r6857, %r6856, 255;
- add.s32 %r2020, %r6857, -128;
- shl.b32 %r6858, %r2019, 8;
- or.b32 %r2021, %r6858, -2147483648;
- shr.u32 %r2022, %r2020, 5;
+ add.s32 %r8367, %r8367, 1;
+ setp.ne.s32 %p1272, %r8367, 6;
+ @%p1272 bra $L__BB0_1503;
+
+ st.local.u32 [%rd4], %rd2711;
+ mov.u32 %r6695, 4;
+ sub.s32 %r1977, %r6695, %r1974;
+ mov.u32 %r6696, 6;
+ sub.s32 %r6697, %r6696, %r1974;
+ mul.wide.s32 %rd2186, %r6697, 4;
+ add.s64 %rd2187, %rd1, %rd2186;
+ ld.local.u32 %r8368, [%rd2187];
+ ld.local.u32 %r8369, [%rd2187+-4];
+ and.b32 %r1980, %r1972, 31;
+ setp.eq.s32 %p1273, %r1980, 0;
+ @%p1273 bra $L__BB0_1506;
+
+ mov.u32 %r6698, 32;
+ sub.s32 %r6699, %r6698, %r1980;
+ shr.u32 %r6700, %r8369, %r6699;
+ shl.b32 %r6701, %r8368, %r1980;
+ add.s32 %r8368, %r6700, %r6701;
+ mul.wide.s32 %rd2188, %r1977, 4;
+ add.s64 %rd2189, %rd1, %rd2188;
+ ld.local.u32 %r6702, [%rd2189];
+ shr.u32 %r6703, %r6702, %r6699;
+ shl.b32 %r6704, %r8369, %r1980;
+ add.s32 %r8369, %r6703, %r6704;
+
+$L__BB0_1506:
+ and.b32 %r6705, %r1971, -2147483648;
+ shr.u32 %r6706, %r8369, 30;
+ shl.b32 %r6707, %r8368, 2;
+ or.b32 %r6708, %r6706, %r6707;
+ shr.u32 %r6709, %r6708, 31;
+ shr.u32 %r6710, %r8368, 30;
+ add.s32 %r6711, %r6709, %r6710;
+ neg.s32 %r6712, %r6711;
+ setp.eq.s32 %p1274, %r6705, 0;
+ selp.b32 %r8370, %r6711, %r6712, %p1274;
+ setp.ne.s32 %p1275, %r6709, 0;
+ xor.b32 %r6713, %r6705, -2147483648;
+ selp.b32 %r6714, %r6713, %r6705, %p1275;
+ selp.b32 %r6715, -1, 0, %p1275;
+ xor.b32 %r6716, %r6708, %r6715;
+ shl.b32 %r6717, %r8369, 2;
+ xor.b32 %r6718, %r6717, %r6715;
+ cvt.u64.u32 %rd2190, %r6716;
+ cvt.u64.u32 %rd2191, %r6718;
+ bfi.b64 %rd2192, %rd2190, %rd2191, 32, 32;
+ cvt.rn.f64.s64 %fd201, %rd2192;
+ mul.f64 %fd202, %fd201, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4475, %fd202;
+ setp.eq.s32 %p1276, %r6714, 0;
+ neg.f32 %f4476, %f4475;
+ selp.f32 %f5869, %f4475, %f4476, %p1276;
+
+$L__BB0_1508:
+ and.b32 %r1987, %r8370, 1;
+ setp.eq.s32 %p1277, %r1987, 0;
+ selp.f32 %f1763, %f5869, 0f3F800000, %p1277;
+ mul.rn.f32 %f1764, %f5869, %f5869;
+ mov.f32 %f5870, 0fB94D4153;
+ @%p1277 bra $L__BB0_1510;
+
+ mov.f32 %f4479, 0fBAB607ED;
+ mov.f32 %f4480, 0f37CBAC00;
+ fma.rn.f32 %f5870, %f4480, %f1764, %f4479;
+
+$L__BB0_1510:
+ selp.f32 %f4481, 0f3C0885E4, 0f3D2AAABB, %p1277;
+ fma.rn.f32 %f4482, %f5870, %f1764, %f4481;
+ selp.f32 %f4483, 0fBE2AAAA8, 0fBEFFFFFF, %p1277;
+ fma.rn.f32 %f4484, %f4482, %f1764, %f4483;
+ mov.f32 %f4485, 0f00000000;
+ fma.rn.f32 %f4486, %f1764, %f1763, %f4485;
+ fma.rn.f32 %f5281, %f4484, %f4486, %f1763;
+ and.b32 %r6720, %r8370, 2;
+ setp.eq.s32 %p1279, %r6720, 0;
+ @%p1279 bra $L__BB0_1512;
+
+ mov.f32 %f4488, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f4488, %f4485;
+
+$L__BB0_1512:
+ setp.lt.s32 %p26, %r11, %r1969;
+ @%p1269 bra $L__BB0_1525;
+
+ mul.f32 %f4489, %f5406, 0f3F22F983;
+ cvt.rni.s32.f32 %r8374, %f4489;
+ cvt.rn.f32.s32 %f4490, %r8374;
+ mov.f32 %f4491, 0fBFC90FDA;
+ fma.rn.f32 %f4492, %f4490, %f4491, %f5406;
+ mov.f32 %f4493, 0fB3A22168;
+ fma.rn.f32 %f4494, %f4490, %f4493, %f4492;
+ mov.f32 %f4495, 0fA7C234C5;
+ fma.rn.f32 %f5873, %f4490, %f4495, %f4494;
+ abs.f32 %f1772, %f5406;
+ setp.ltu.f32 %p1281, %f1772, 0f47CE4780;
+ @%p1281 bra $L__BB0_1521;
+
+ setp.eq.f32 %p1282, %f1772, 0f7F800000;
+ @%p1282 bra $L__BB0_1520;
+ bra.uni $L__BB0_1515;
+
+$L__BB0_1520:
+ mov.f32 %f4498, 0f00000000;
+ mul.rn.f32 %f5873, %f5406, %f4498;
+ mov.u32 %r8374, 0;
+ bra.uni $L__BB0_1521;
+
+$L__BB0_1515:
+ mov.b32 %r1989, %f5406;
+ shr.u32 %r6722, %r1989, 23;
+ and.b32 %r6723, %r6722, 255;
+ add.s32 %r1990, %r6723, -128;
+ shl.b32 %r6724, %r1989, 8;
+ or.b32 %r1991, %r6724, -2147483648;
+ shr.u32 %r1992, %r1990, 5;
mov.u64 %rd2714, 0;
- mov.u32 %r8626, 0;
+ mov.u32 %r8371, 0;
mov.u64 %rd2712, __cudart_i2opi_f;
mov.u64 %rd2713, %rd1;
-$L__BB0_1539:
+$L__BB0_1516:
.pragma "nounroll";
- ld.global.nc.u32 %r6859, [%rd2712];
- mad.wide.u32 %rd2169, %r6859, %r2021, %rd2714;
- shr.u64 %rd2714, %rd2169, 32;
- st.local.u32 [%rd2713], %rd2169;
+ ld.global.nc.u32 %r6725, [%rd2712];
+ mad.wide.u32 %rd2195, %r6725, %r1991, %rd2714;
+ shr.u64 %rd2714, %rd2195, 32;
+ st.local.u32 [%rd2713], %rd2195;
add.s64 %rd2713, %rd2713, 4;
add.s64 %rd2712, %rd2712, 4;
- add.s32 %r8626, %r8626, 1;
- setp.ne.s32 %p1300, %r8626, 6;
- @%p1300 bra $L__BB0_1539;
-
- st.local.u32 [%rd5], %rd2714;
- mov.u32 %r6860, 4;
- sub.s32 %r2025, %r6860, %r2022;
- mov.u32 %r6861, 6;
- sub.s32 %r6862, %r6861, %r2022;
- mul.wide.s32 %rd2170, %r6862, 4;
- add.s64 %rd2171, %rd1, %rd2170;
- ld.local.u32 %r8627, [%rd2171];
- ld.local.u32 %r8628, [%rd2171+-4];
- and.b32 %r2028, %r2020, 31;
- setp.eq.s32 %p1301, %r2028, 0;
- @%p1301 bra $L__BB0_1542;
-
- mov.u32 %r6863, 32;
- sub.s32 %r6864, %r6863, %r2028;
- shr.u32 %r6865, %r8628, %r6864;
- shl.b32 %r6866, %r8627, %r2028;
- add.s32 %r8627, %r6865, %r6866;
- mul.wide.s32 %rd2172, %r2025, 4;
- add.s64 %rd2173, %rd1, %rd2172;
- ld.local.u32 %r6867, [%rd2173];
- shr.u32 %r6868, %r6867, %r6864;
- shl.b32 %r6869, %r8628, %r2028;
- add.s32 %r8628, %r6868, %r6869;
-
-$L__BB0_1542:
- and.b32 %r6870, %r2019, -2147483648;
- shr.u32 %r6871, %r8628, 30;
- shl.b32 %r6872, %r8627, 2;
- or.b32 %r6873, %r6871, %r6872;
- shr.u32 %r6874, %r6873, 31;
- shr.u32 %r6875, %r8627, 30;
- add.s32 %r6876, %r6874, %r6875;
- neg.s32 %r6877, %r6876;
- setp.eq.s32 %p1302, %r6870, 0;
- selp.b32 %r8629, %r6876, %r6877, %p1302;
- setp.ne.s32 %p1303, %r6874, 0;
- xor.b32 %r6878, %r6870, -2147483648;
- selp.b32 %r6879, %r6878, %r6870, %p1303;
- selp.b32 %r6880, -1, 0, %p1303;
- xor.b32 %r6881, %r6873, %r6880;
- shl.b32 %r6882, %r8628, 2;
- xor.b32 %r6883, %r6882, %r6880;
- cvt.u64.u32 %rd2174, %r6881;
- cvt.u64.u32 %rd2175, %r6883;
- bfi.b64 %rd2176, %rd2174, %rd2175, 32, 32;
- cvt.rn.f64.s64 %fd201, %rd2176;
- mul.f64 %fd202, %fd201, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4419, %fd202;
- setp.eq.s32 %p1304, %r6879, 0;
- neg.f32 %f4420, %f4419;
- selp.f32 %f5777, %f4419, %f4420, %p1304;
-
-$L__BB0_1544:
- and.b32 %r2035, %r8629, 1;
- setp.eq.s32 %p1305, %r2035, 0;
- selp.f32 %f1731, %f5777, 0f3F800000, %p1305;
- mul.rn.f32 %f1732, %f5777, %f5777;
- mov.f32 %f5778, 0fB94D4153;
- @%p1305 bra $L__BB0_1546;
-
- mov.f32 %f4423, 0fBAB607ED;
- mov.f32 %f4424, 0f37CBAC00;
- fma.rn.f32 %f5778, %f4424, %f1732, %f4423;
-
-$L__BB0_1546:
- selp.f32 %f4425, 0f3C0885E4, 0f3D2AAABB, %p1305;
- fma.rn.f32 %f4426, %f5778, %f1732, %f4425;
- selp.f32 %f4427, 0fBE2AAAA8, 0fBEFFFFFF, %p1305;
- fma.rn.f32 %f4428, %f4426, %f1732, %f4427;
- mov.f32 %f4429, 0f00000000;
- fma.rn.f32 %f4430, %f1732, %f1731, %f4429;
- fma.rn.f32 %f5213, %f4428, %f4430, %f1731;
- and.b32 %r6885, %r8629, 2;
- setp.eq.s32 %p1307, %r6885, 0;
- @%p1307 bra $L__BB0_1548;
-
- mov.f32 %f4432, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f4432, %f4429;
-
-$L__BB0_1548:
- setp.lt.s32 %p26, %r14, %r2017;
- @%p1297 bra $L__BB0_1561;
-
- mul.f32 %f4433, %f5338, 0f3F22F983;
- cvt.rni.s32.f32 %r8633, %f4433;
- cvt.rn.f32.s32 %f4434, %r8633;
- mov.f32 %f4435, 0fBFC90FDA;
- fma.rn.f32 %f4436, %f4434, %f4435, %f5338;
- mov.f32 %f4437, 0fB3A22168;
- fma.rn.f32 %f4438, %f4434, %f4437, %f4436;
- mov.f32 %f4439, 0fA7C234C5;
- fma.rn.f32 %f5781, %f4434, %f4439, %f4438;
- abs.f32 %f1740, %f5338;
- setp.ltu.f32 %p1309, %f1740, 0f47CE4780;
- @%p1309 bra $L__BB0_1557;
-
- setp.eq.f32 %p1310, %f1740, 0f7F800000;
- @%p1310 bra $L__BB0_1556;
- bra.uni $L__BB0_1551;
-
-$L__BB0_1556:
- mov.f32 %f4442, 0f00000000;
- mul.rn.f32 %f5781, %f5338, %f4442;
- mov.u32 %r8633, 0;
- bra.uni $L__BB0_1557;
-
-$L__BB0_1551:
- mov.b32 %r2037, %f5338;
- shr.u32 %r6887, %r2037, 23;
- and.b32 %r6888, %r6887, 255;
- add.s32 %r2038, %r6888, -128;
- shl.b32 %r6889, %r2037, 8;
- or.b32 %r2039, %r6889, -2147483648;
- shr.u32 %r2040, %r2038, 5;
+ add.s32 %r8371, %r8371, 1;
+ setp.ne.s32 %p1283, %r8371, 6;
+ @%p1283 bra $L__BB0_1516;
+
+ st.local.u32 [%rd4], %rd2714;
+ mov.u32 %r6726, 4;
+ sub.s32 %r1995, %r6726, %r1992;
+ mov.u32 %r6727, 6;
+ sub.s32 %r6728, %r6727, %r1992;
+ mul.wide.s32 %rd2196, %r6728, 4;
+ add.s64 %rd2197, %rd1, %rd2196;
+ ld.local.u32 %r8372, [%rd2197];
+ ld.local.u32 %r8373, [%rd2197+-4];
+ and.b32 %r1998, %r1990, 31;
+ setp.eq.s32 %p1284, %r1998, 0;
+ @%p1284 bra $L__BB0_1519;
+
+ mov.u32 %r6729, 32;
+ sub.s32 %r6730, %r6729, %r1998;
+ shr.u32 %r6731, %r8373, %r6730;
+ shl.b32 %r6732, %r8372, %r1998;
+ add.s32 %r8372, %r6731, %r6732;
+ mul.wide.s32 %rd2198, %r1995, 4;
+ add.s64 %rd2199, %rd1, %rd2198;
+ ld.local.u32 %r6733, [%rd2199];
+ shr.u32 %r6734, %r6733, %r6730;
+ shl.b32 %r6735, %r8373, %r1998;
+ add.s32 %r8373, %r6734, %r6735;
+
+$L__BB0_1519:
+ and.b32 %r6736, %r1989, -2147483648;
+ shr.u32 %r6737, %r8373, 30;
+ shl.b32 %r6738, %r8372, 2;
+ or.b32 %r6739, %r6737, %r6738;
+ shr.u32 %r6740, %r6739, 31;
+ shr.u32 %r6741, %r8372, 30;
+ add.s32 %r6742, %r6740, %r6741;
+ neg.s32 %r6743, %r6742;
+ setp.eq.s32 %p1285, %r6736, 0;
+ selp.b32 %r8374, %r6742, %r6743, %p1285;
+ setp.ne.s32 %p1286, %r6740, 0;
+ xor.b32 %r6744, %r6736, -2147483648;
+ selp.b32 %r6745, %r6744, %r6736, %p1286;
+ selp.b32 %r6746, -1, 0, %p1286;
+ xor.b32 %r6747, %r6739, %r6746;
+ shl.b32 %r6748, %r8373, 2;
+ xor.b32 %r6749, %r6748, %r6746;
+ cvt.u64.u32 %rd2200, %r6747;
+ cvt.u64.u32 %rd2201, %r6749;
+ bfi.b64 %rd2202, %rd2200, %rd2201, 32, 32;
+ cvt.rn.f64.s64 %fd203, %rd2202;
+ mul.f64 %fd204, %fd203, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4496, %fd204;
+ setp.eq.s32 %p1287, %r6745, 0;
+ neg.f32 %f4497, %f4496;
+ selp.f32 %f5873, %f4496, %f4497, %p1287;
+
+$L__BB0_1521:
+ add.s32 %r2005, %r8374, 1;
+ and.b32 %r2006, %r2005, 1;
+ setp.eq.s32 %p1288, %r2006, 0;
+ selp.f32 %f1776, %f5873, 0f3F800000, %p1288;
+ mul.rn.f32 %f1777, %f5873, %f5873;
+ mov.f32 %f5874, 0fB94D4153;
+ @%p1288 bra $L__BB0_1523;
+
+ mov.f32 %f4500, 0fBAB607ED;
+ mov.f32 %f4501, 0f37CBAC00;
+ fma.rn.f32 %f5874, %f4501, %f1777, %f4500;
+
+$L__BB0_1523:
+ selp.f32 %f4502, 0f3C0885E4, 0f3D2AAABB, %p1288;
+ fma.rn.f32 %f4503, %f5874, %f1777, %f4502;
+ selp.f32 %f4504, 0fBE2AAAA8, 0fBEFFFFFF, %p1288;
+ fma.rn.f32 %f4505, %f4503, %f1777, %f4504;
+ mov.f32 %f4506, 0f00000000;
+ fma.rn.f32 %f4507, %f1777, %f1776, %f4506;
+ fma.rn.f32 %f5283, %f4505, %f4507, %f1776;
+ and.b32 %r6751, %r2005, 2;
+ setp.eq.s32 %p1290, %r6751, 0;
+ @%p1290 bra $L__BB0_1525;
+
+ mov.f32 %f4509, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f4509, %f4506;
+
+$L__BB0_1525:
+ selp.f32 %f1784, %f5283, %f5284, %p26;
+ selp.f32 %f1785, %f5281, %f5282, %p26;
+ @%p1269 bra $L__BB0_1527;
+
+ add.f32 %f5995, %f1785, %f1784;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1527:
+ @%p1197 bra $L__BB0_1556;
+
+ shl.b32 %r6752, %r12, 5;
+ mov.u32 %r6753, -32;
+ sub.s32 %r2007, %r6753, %r6752;
+ setp.ge.s32 %p1294, %r11, %r2007;
+ @%p1294 bra $L__BB0_1541;
+
+ mul.f32 %f4512, %f5413, 0f3F22F983;
+ cvt.rni.s32.f32 %r8378, %f4512;
+ cvt.rn.f32.s32 %f4513, %r8378;
+ mov.f32 %f4514, 0fBFC90FDA;
+ fma.rn.f32 %f4515, %f4513, %f4514, %f5413;
+ mov.f32 %f4516, 0fB3A22168;
+ fma.rn.f32 %f4517, %f4513, %f4516, %f4515;
+ mov.f32 %f4518, 0fA7C234C5;
+ fma.rn.f32 %f5882, %f4513, %f4518, %f4517;
+ abs.f32 %f1793, %f5413;
+ setp.ltu.f32 %p1295, %f1793, 0f47CE4780;
+ @%p1295 bra $L__BB0_1537;
+
+ setp.eq.f32 %p1296, %f1793, 0f7F800000;
+ @%p1296 bra $L__BB0_1536;
+ bra.uni $L__BB0_1531;
+
+$L__BB0_1536:
+ mov.f32 %f4521, 0f00000000;
+ mul.rn.f32 %f5882, %f5413, %f4521;
+ mov.u32 %r8378, 0;
+ bra.uni $L__BB0_1537;
+
+$L__BB0_1531:
+ mov.b32 %r2009, %f5413;
+ shr.u32 %r6755, %r2009, 23;
+ and.b32 %r6756, %r6755, 255;
+ add.s32 %r2010, %r6756, -128;
+ shl.b32 %r6757, %r2009, 8;
+ or.b32 %r2011, %r6757, -2147483648;
+ shr.u32 %r2012, %r2010, 5;
mov.u64 %rd2717, 0;
- mov.u32 %r8630, 0;
+ mov.u32 %r8375, 0;
mov.u64 %rd2715, __cudart_i2opi_f;
mov.u64 %rd2716, %rd1;
-$L__BB0_1552:
+$L__BB0_1532:
.pragma "nounroll";
- ld.global.nc.u32 %r6890, [%rd2715];
- mad.wide.u32 %rd2179, %r6890, %r2039, %rd2717;
- shr.u64 %rd2717, %rd2179, 32;
- st.local.u32 [%rd2716], %rd2179;
+ ld.global.nc.u32 %r6758, [%rd2715];
+ mad.wide.u32 %rd2205, %r6758, %r2011, %rd2717;
+ shr.u64 %rd2717, %rd2205, 32;
+ st.local.u32 [%rd2716], %rd2205;
add.s64 %rd2716, %rd2716, 4;
add.s64 %rd2715, %rd2715, 4;
- add.s32 %r8630, %r8630, 1;
- setp.ne.s32 %p1311, %r8630, 6;
- @%p1311 bra $L__BB0_1552;
-
- st.local.u32 [%rd5], %rd2717;
- mov.u32 %r6891, 4;
- sub.s32 %r2043, %r6891, %r2040;
- mov.u32 %r6892, 6;
- sub.s32 %r6893, %r6892, %r2040;
- mul.wide.s32 %rd2180, %r6893, 4;
- add.s64 %rd2181, %rd1, %rd2180;
- ld.local.u32 %r8631, [%rd2181];
- ld.local.u32 %r8632, [%rd2181+-4];
- and.b32 %r2046, %r2038, 31;
- setp.eq.s32 %p1312, %r2046, 0;
- @%p1312 bra $L__BB0_1555;
-
- mov.u32 %r6894, 32;
- sub.s32 %r6895, %r6894, %r2046;
- shr.u32 %r6896, %r8632, %r6895;
- shl.b32 %r6897, %r8631, %r2046;
- add.s32 %r8631, %r6896, %r6897;
- mul.wide.s32 %rd2182, %r2043, 4;
- add.s64 %rd2183, %rd1, %rd2182;
- ld.local.u32 %r6898, [%rd2183];
- shr.u32 %r6899, %r6898, %r6895;
- shl.b32 %r6900, %r8632, %r2046;
- add.s32 %r8632, %r6899, %r6900;
-
-$L__BB0_1555:
- and.b32 %r6901, %r2037, -2147483648;
- shr.u32 %r6902, %r8632, 30;
- shl.b32 %r6903, %r8631, 2;
- or.b32 %r6904, %r6902, %r6903;
- shr.u32 %r6905, %r6904, 31;
- shr.u32 %r6906, %r8631, 30;
- add.s32 %r6907, %r6905, %r6906;
- neg.s32 %r6908, %r6907;
- setp.eq.s32 %p1313, %r6901, 0;
- selp.b32 %r8633, %r6907, %r6908, %p1313;
- setp.ne.s32 %p1314, %r6905, 0;
- xor.b32 %r6909, %r6901, -2147483648;
- selp.b32 %r6910, %r6909, %r6901, %p1314;
- selp.b32 %r6911, -1, 0, %p1314;
- xor.b32 %r6912, %r6904, %r6911;
- shl.b32 %r6913, %r8632, 2;
- xor.b32 %r6914, %r6913, %r6911;
- cvt.u64.u32 %rd2184, %r6912;
- cvt.u64.u32 %rd2185, %r6914;
- bfi.b64 %rd2186, %rd2184, %rd2185, 32, 32;
- cvt.rn.f64.s64 %fd203, %rd2186;
- mul.f64 %fd204, %fd203, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4440, %fd204;
- setp.eq.s32 %p1315, %r6910, 0;
- neg.f32 %f4441, %f4440;
- selp.f32 %f5781, %f4440, %f4441, %p1315;
-
-$L__BB0_1557:
- add.s32 %r2053, %r8633, 1;
- and.b32 %r2054, %r2053, 1;
- setp.eq.s32 %p1316, %r2054, 0;
- selp.f32 %f1744, %f5781, 0f3F800000, %p1316;
- mul.rn.f32 %f1745, %f5781, %f5781;
- mov.f32 %f5782, 0fB94D4153;
- @%p1316 bra $L__BB0_1559;
-
- mov.f32 %f4444, 0fBAB607ED;
- mov.f32 %f4445, 0f37CBAC00;
- fma.rn.f32 %f5782, %f4445, %f1745, %f4444;
-
-$L__BB0_1559:
- selp.f32 %f4446, 0f3C0885E4, 0f3D2AAABB, %p1316;
- fma.rn.f32 %f4447, %f5782, %f1745, %f4446;
- selp.f32 %f4448, 0fBE2AAAA8, 0fBEFFFFFF, %p1316;
- fma.rn.f32 %f4449, %f4447, %f1745, %f4448;
- mov.f32 %f4450, 0f00000000;
- fma.rn.f32 %f4451, %f1745, %f1744, %f4450;
- fma.rn.f32 %f5215, %f4449, %f4451, %f1744;
- and.b32 %r6916, %r2053, 2;
- setp.eq.s32 %p1318, %r6916, 0;
- @%p1318 bra $L__BB0_1561;
-
- mov.f32 %f4453, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f4453, %f4450;
-
-$L__BB0_1561:
- selp.f32 %f1752, %f5215, %f5216, %p26;
- selp.f32 %f1753, %f5213, %f5214, %p26;
- @%p1297 bra $L__BB0_1563;
-
- add.f32 %f5903, %f1753, %f1752;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1563:
- @%p1218 bra $L__BB0_1592;
-
- shl.b32 %r6918, %r12, 5;
- mov.u32 %r6919, -32;
- sub.s32 %r2055, %r6919, %r6918;
- setp.ge.s32 %p1322, %r14, %r2055;
- @%p1322 bra $L__BB0_1577;
-
- mul.f32 %f4456, %f5345, 0f3F22F983;
- cvt.rni.s32.f32 %r8637, %f4456;
- cvt.rn.f32.s32 %f4457, %r8637;
- mov.f32 %f4458, 0fBFC90FDA;
- fma.rn.f32 %f4459, %f4457, %f4458, %f5345;
- mov.f32 %f4460, 0fB3A22168;
- fma.rn.f32 %f4461, %f4457, %f4460, %f4459;
- mov.f32 %f4462, 0fA7C234C5;
- fma.rn.f32 %f5790, %f4457, %f4462, %f4461;
- abs.f32 %f1761, %f5345;
- setp.ltu.f32 %p1323, %f1761, 0f47CE4780;
- @%p1323 bra $L__BB0_1573;
-
- setp.eq.f32 %p1324, %f1761, 0f7F800000;
- @%p1324 bra $L__BB0_1572;
- bra.uni $L__BB0_1567;
-
-$L__BB0_1572:
- mov.f32 %f4465, 0f00000000;
- mul.rn.f32 %f5790, %f5345, %f4465;
- mov.u32 %r8637, 0;
- bra.uni $L__BB0_1573;
-
-$L__BB0_1567:
- mov.b32 %r2057, %f5345;
- shr.u32 %r6921, %r2057, 23;
- and.b32 %r6922, %r6921, 255;
- add.s32 %r2058, %r6922, -128;
- shl.b32 %r6923, %r2057, 8;
- or.b32 %r2059, %r6923, -2147483648;
- shr.u32 %r2060, %r2058, 5;
+ add.s32 %r8375, %r8375, 1;
+ setp.ne.s32 %p1297, %r8375, 6;
+ @%p1297 bra $L__BB0_1532;
+
+ st.local.u32 [%rd4], %rd2717;
+ mov.u32 %r6759, 4;
+ sub.s32 %r2015, %r6759, %r2012;
+ mov.u32 %r6760, 6;
+ sub.s32 %r6761, %r6760, %r2012;
+ mul.wide.s32 %rd2206, %r6761, 4;
+ add.s64 %rd2207, %rd1, %rd2206;
+ ld.local.u32 %r8376, [%rd2207];
+ ld.local.u32 %r8377, [%rd2207+-4];
+ and.b32 %r2018, %r2010, 31;
+ setp.eq.s32 %p1298, %r2018, 0;
+ @%p1298 bra $L__BB0_1535;
+
+ mov.u32 %r6762, 32;
+ sub.s32 %r6763, %r6762, %r2018;
+ shr.u32 %r6764, %r8377, %r6763;
+ shl.b32 %r6765, %r8376, %r2018;
+ add.s32 %r8376, %r6764, %r6765;
+ mul.wide.s32 %rd2208, %r2015, 4;
+ add.s64 %rd2209, %rd1, %rd2208;
+ ld.local.u32 %r6766, [%rd2209];
+ shr.u32 %r6767, %r6766, %r6763;
+ shl.b32 %r6768, %r8377, %r2018;
+ add.s32 %r8377, %r6767, %r6768;
+
+$L__BB0_1535:
+ and.b32 %r6769, %r2009, -2147483648;
+ shr.u32 %r6770, %r8377, 30;
+ shl.b32 %r6771, %r8376, 2;
+ or.b32 %r6772, %r6770, %r6771;
+ shr.u32 %r6773, %r6772, 31;
+ shr.u32 %r6774, %r8376, 30;
+ add.s32 %r6775, %r6773, %r6774;
+ neg.s32 %r6776, %r6775;
+ setp.eq.s32 %p1299, %r6769, 0;
+ selp.b32 %r8378, %r6775, %r6776, %p1299;
+ setp.ne.s32 %p1300, %r6773, 0;
+ xor.b32 %r6777, %r6769, -2147483648;
+ selp.b32 %r6778, %r6777, %r6769, %p1300;
+ selp.b32 %r6779, -1, 0, %p1300;
+ xor.b32 %r6780, %r6772, %r6779;
+ shl.b32 %r6781, %r8377, 2;
+ xor.b32 %r6782, %r6781, %r6779;
+ cvt.u64.u32 %rd2210, %r6780;
+ cvt.u64.u32 %rd2211, %r6782;
+ bfi.b64 %rd2212, %rd2210, %rd2211, 32, 32;
+ cvt.rn.f64.s64 %fd205, %rd2212;
+ mul.f64 %fd206, %fd205, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4519, %fd206;
+ setp.eq.s32 %p1301, %r6778, 0;
+ neg.f32 %f4520, %f4519;
+ selp.f32 %f5882, %f4519, %f4520, %p1301;
+
+$L__BB0_1537:
+ and.b32 %r2025, %r8378, 1;
+ setp.eq.s32 %p1302, %r2025, 0;
+ selp.f32 %f1797, %f5882, 0f3F800000, %p1302;
+ mul.rn.f32 %f1798, %f5882, %f5882;
+ mov.f32 %f5883, 0fB94D4153;
+ @%p1302 bra $L__BB0_1539;
+
+ mov.f32 %f4523, 0fBAB607ED;
+ mov.f32 %f4524, 0f37CBAC00;
+ fma.rn.f32 %f5883, %f4524, %f1798, %f4523;
+
+$L__BB0_1539:
+ selp.f32 %f4525, 0f3C0885E4, 0f3D2AAABB, %p1302;
+ fma.rn.f32 %f4526, %f5883, %f1798, %f4525;
+ selp.f32 %f4527, 0fBE2AAAA8, 0fBEFFFFFF, %p1302;
+ fma.rn.f32 %f4528, %f4526, %f1798, %f4527;
+ mov.f32 %f4529, 0f00000000;
+ fma.rn.f32 %f4530, %f1798, %f1797, %f4529;
+ fma.rn.f32 %f5281, %f4528, %f4530, %f1797;
+ and.b32 %r6784, %r8378, 2;
+ setp.eq.s32 %p1304, %r6784, 0;
+ @%p1304 bra $L__BB0_1541;
+
+ mov.f32 %f4532, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f4532, %f4529;
+
+$L__BB0_1541:
+ setp.lt.s32 %p27, %r11, %r2007;
+ @%p1294 bra $L__BB0_1554;
+
+ mul.f32 %f4533, %f5405, 0f3F22F983;
+ cvt.rni.s32.f32 %r8382, %f4533;
+ cvt.rn.f32.s32 %f4534, %r8382;
+ mov.f32 %f4535, 0fBFC90FDA;
+ fma.rn.f32 %f4536, %f4534, %f4535, %f5405;
+ mov.f32 %f4537, 0fB3A22168;
+ fma.rn.f32 %f4538, %f4534, %f4537, %f4536;
+ mov.f32 %f4539, 0fA7C234C5;
+ fma.rn.f32 %f5886, %f4534, %f4539, %f4538;
+ abs.f32 %f1806, %f5405;
+ setp.ltu.f32 %p1306, %f1806, 0f47CE4780;
+ @%p1306 bra $L__BB0_1550;
+
+ setp.eq.f32 %p1307, %f1806, 0f7F800000;
+ @%p1307 bra $L__BB0_1549;
+ bra.uni $L__BB0_1544;
+
+$L__BB0_1549:
+ mov.f32 %f4542, 0f00000000;
+ mul.rn.f32 %f5886, %f5405, %f4542;
+ mov.u32 %r8382, 0;
+ bra.uni $L__BB0_1550;
+
+$L__BB0_1544:
+ mov.b32 %r2027, %f5405;
+ shr.u32 %r6786, %r2027, 23;
+ and.b32 %r6787, %r6786, 255;
+ add.s32 %r2028, %r6787, -128;
+ shl.b32 %r6788, %r2027, 8;
+ or.b32 %r2029, %r6788, -2147483648;
+ shr.u32 %r2030, %r2028, 5;
mov.u64 %rd2720, 0;
- mov.u32 %r8634, 0;
+ mov.u32 %r8379, 0;
mov.u64 %rd2718, __cudart_i2opi_f;
mov.u64 %rd2719, %rd1;
-$L__BB0_1568:
+$L__BB0_1545:
.pragma "nounroll";
- ld.global.nc.u32 %r6924, [%rd2718];
- mad.wide.u32 %rd2189, %r6924, %r2059, %rd2720;
- shr.u64 %rd2720, %rd2189, 32;
- st.local.u32 [%rd2719], %rd2189;
+ ld.global.nc.u32 %r6789, [%rd2718];
+ mad.wide.u32 %rd2215, %r6789, %r2029, %rd2720;
+ shr.u64 %rd2720, %rd2215, 32;
+ st.local.u32 [%rd2719], %rd2215;
add.s64 %rd2719, %rd2719, 4;
add.s64 %rd2718, %rd2718, 4;
- add.s32 %r8634, %r8634, 1;
- setp.ne.s32 %p1325, %r8634, 6;
- @%p1325 bra $L__BB0_1568;
-
- st.local.u32 [%rd5], %rd2720;
- mov.u32 %r6925, 4;
- sub.s32 %r2063, %r6925, %r2060;
- mov.u32 %r6926, 6;
- sub.s32 %r6927, %r6926, %r2060;
- mul.wide.s32 %rd2190, %r6927, 4;
- add.s64 %rd2191, %rd1, %rd2190;
- ld.local.u32 %r8635, [%rd2191];
- ld.local.u32 %r8636, [%rd2191+-4];
- and.b32 %r2066, %r2058, 31;
- setp.eq.s32 %p1326, %r2066, 0;
- @%p1326 bra $L__BB0_1571;
-
- mov.u32 %r6928, 32;
- sub.s32 %r6929, %r6928, %r2066;
- shr.u32 %r6930, %r8636, %r6929;
- shl.b32 %r6931, %r8635, %r2066;
- add.s32 %r8635, %r6930, %r6931;
- mul.wide.s32 %rd2192, %r2063, 4;
- add.s64 %rd2193, %rd1, %rd2192;
- ld.local.u32 %r6932, [%rd2193];
- shr.u32 %r6933, %r6932, %r6929;
- shl.b32 %r6934, %r8636, %r2066;
- add.s32 %r8636, %r6933, %r6934;
-
-$L__BB0_1571:
- and.b32 %r6935, %r2057, -2147483648;
- shr.u32 %r6936, %r8636, 30;
- shl.b32 %r6937, %r8635, 2;
- or.b32 %r6938, %r6936, %r6937;
- shr.u32 %r6939, %r6938, 31;
- shr.u32 %r6940, %r8635, 30;
- add.s32 %r6941, %r6939, %r6940;
- neg.s32 %r6942, %r6941;
- setp.eq.s32 %p1327, %r6935, 0;
- selp.b32 %r8637, %r6941, %r6942, %p1327;
- setp.ne.s32 %p1328, %r6939, 0;
- xor.b32 %r6943, %r6935, -2147483648;
- selp.b32 %r6944, %r6943, %r6935, %p1328;
- selp.b32 %r6945, -1, 0, %p1328;
- xor.b32 %r6946, %r6938, %r6945;
- shl.b32 %r6947, %r8636, 2;
- xor.b32 %r6948, %r6947, %r6945;
- cvt.u64.u32 %rd2194, %r6946;
- cvt.u64.u32 %rd2195, %r6948;
- bfi.b64 %rd2196, %rd2194, %rd2195, 32, 32;
- cvt.rn.f64.s64 %fd205, %rd2196;
- mul.f64 %fd206, %fd205, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4463, %fd206;
- setp.eq.s32 %p1329, %r6944, 0;
- neg.f32 %f4464, %f4463;
- selp.f32 %f5790, %f4463, %f4464, %p1329;
-
-$L__BB0_1573:
- and.b32 %r2073, %r8637, 1;
- setp.eq.s32 %p1330, %r2073, 0;
- selp.f32 %f1765, %f5790, 0f3F800000, %p1330;
- mul.rn.f32 %f1766, %f5790, %f5790;
- mov.f32 %f5791, 0fB94D4153;
- @%p1330 bra $L__BB0_1575;
-
- mov.f32 %f4467, 0fBAB607ED;
- mov.f32 %f4468, 0f37CBAC00;
- fma.rn.f32 %f5791, %f4468, %f1766, %f4467;
-
-$L__BB0_1575:
- selp.f32 %f4469, 0f3C0885E4, 0f3D2AAABB, %p1330;
- fma.rn.f32 %f4470, %f5791, %f1766, %f4469;
- selp.f32 %f4471, 0fBE2AAAA8, 0fBEFFFFFF, %p1330;
- fma.rn.f32 %f4472, %f4470, %f1766, %f4471;
- mov.f32 %f4473, 0f00000000;
- fma.rn.f32 %f4474, %f1766, %f1765, %f4473;
- fma.rn.f32 %f5213, %f4472, %f4474, %f1765;
- and.b32 %r6950, %r8637, 2;
- setp.eq.s32 %p1332, %r6950, 0;
- @%p1332 bra $L__BB0_1577;
-
- mov.f32 %f4476, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f4476, %f4473;
-
-$L__BB0_1577:
- setp.lt.s32 %p27, %r14, %r2055;
- @%p1322 bra $L__BB0_1590;
-
- mul.f32 %f4477, %f5337, 0f3F22F983;
- cvt.rni.s32.f32 %r8641, %f4477;
- cvt.rn.f32.s32 %f4478, %r8641;
- mov.f32 %f4479, 0fBFC90FDA;
- fma.rn.f32 %f4480, %f4478, %f4479, %f5337;
- mov.f32 %f4481, 0fB3A22168;
- fma.rn.f32 %f4482, %f4478, %f4481, %f4480;
- mov.f32 %f4483, 0fA7C234C5;
- fma.rn.f32 %f5794, %f4478, %f4483, %f4482;
- abs.f32 %f1774, %f5337;
- setp.ltu.f32 %p1334, %f1774, 0f47CE4780;
- @%p1334 bra $L__BB0_1586;
-
- setp.eq.f32 %p1335, %f1774, 0f7F800000;
- @%p1335 bra $L__BB0_1585;
- bra.uni $L__BB0_1580;
-
-$L__BB0_1585:
- mov.f32 %f4486, 0f00000000;
- mul.rn.f32 %f5794, %f5337, %f4486;
- mov.u32 %r8641, 0;
- bra.uni $L__BB0_1586;
-
-$L__BB0_1580:
- mov.b32 %r2075, %f5337;
- shr.u32 %r6952, %r2075, 23;
- and.b32 %r6953, %r6952, 255;
- add.s32 %r2076, %r6953, -128;
- shl.b32 %r6954, %r2075, 8;
- or.b32 %r2077, %r6954, -2147483648;
- shr.u32 %r2078, %r2076, 5;
+ add.s32 %r8379, %r8379, 1;
+ setp.ne.s32 %p1308, %r8379, 6;
+ @%p1308 bra $L__BB0_1545;
+
+ st.local.u32 [%rd4], %rd2720;
+ mov.u32 %r6790, 4;
+ sub.s32 %r2033, %r6790, %r2030;
+ mov.u32 %r6791, 6;
+ sub.s32 %r6792, %r6791, %r2030;
+ mul.wide.s32 %rd2216, %r6792, 4;
+ add.s64 %rd2217, %rd1, %rd2216;
+ ld.local.u32 %r8380, [%rd2217];
+ ld.local.u32 %r8381, [%rd2217+-4];
+ and.b32 %r2036, %r2028, 31;
+ setp.eq.s32 %p1309, %r2036, 0;
+ @%p1309 bra $L__BB0_1548;
+
+ mov.u32 %r6793, 32;
+ sub.s32 %r6794, %r6793, %r2036;
+ shr.u32 %r6795, %r8381, %r6794;
+ shl.b32 %r6796, %r8380, %r2036;
+ add.s32 %r8380, %r6795, %r6796;
+ mul.wide.s32 %rd2218, %r2033, 4;
+ add.s64 %rd2219, %rd1, %rd2218;
+ ld.local.u32 %r6797, [%rd2219];
+ shr.u32 %r6798, %r6797, %r6794;
+ shl.b32 %r6799, %r8381, %r2036;
+ add.s32 %r8381, %r6798, %r6799;
+
+$L__BB0_1548:
+ and.b32 %r6800, %r2027, -2147483648;
+ shr.u32 %r6801, %r8381, 30;
+ shl.b32 %r6802, %r8380, 2;
+ or.b32 %r6803, %r6801, %r6802;
+ shr.u32 %r6804, %r6803, 31;
+ shr.u32 %r6805, %r8380, 30;
+ add.s32 %r6806, %r6804, %r6805;
+ neg.s32 %r6807, %r6806;
+ setp.eq.s32 %p1310, %r6800, 0;
+ selp.b32 %r8382, %r6806, %r6807, %p1310;
+ setp.ne.s32 %p1311, %r6804, 0;
+ xor.b32 %r6808, %r6800, -2147483648;
+ selp.b32 %r6809, %r6808, %r6800, %p1311;
+ selp.b32 %r6810, -1, 0, %p1311;
+ xor.b32 %r6811, %r6803, %r6810;
+ shl.b32 %r6812, %r8381, 2;
+ xor.b32 %r6813, %r6812, %r6810;
+ cvt.u64.u32 %rd2220, %r6811;
+ cvt.u64.u32 %rd2221, %r6813;
+ bfi.b64 %rd2222, %rd2220, %rd2221, 32, 32;
+ cvt.rn.f64.s64 %fd207, %rd2222;
+ mul.f64 %fd208, %fd207, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4540, %fd208;
+ setp.eq.s32 %p1312, %r6809, 0;
+ neg.f32 %f4541, %f4540;
+ selp.f32 %f5886, %f4540, %f4541, %p1312;
+
+$L__BB0_1550:
+ add.s32 %r2043, %r8382, 1;
+ and.b32 %r2044, %r2043, 1;
+ setp.eq.s32 %p1313, %r2044, 0;
+ selp.f32 %f1810, %f5886, 0f3F800000, %p1313;
+ mul.rn.f32 %f1811, %f5886, %f5886;
+ mov.f32 %f5887, 0fB94D4153;
+ @%p1313 bra $L__BB0_1552;
+
+ mov.f32 %f4544, 0fBAB607ED;
+ mov.f32 %f4545, 0f37CBAC00;
+ fma.rn.f32 %f5887, %f4545, %f1811, %f4544;
+
+$L__BB0_1552:
+ selp.f32 %f4546, 0f3C0885E4, 0f3D2AAABB, %p1313;
+ fma.rn.f32 %f4547, %f5887, %f1811, %f4546;
+ selp.f32 %f4548, 0fBE2AAAA8, 0fBEFFFFFF, %p1313;
+ fma.rn.f32 %f4549, %f4547, %f1811, %f4548;
+ mov.f32 %f4550, 0f00000000;
+ fma.rn.f32 %f4551, %f1811, %f1810, %f4550;
+ fma.rn.f32 %f5283, %f4549, %f4551, %f1810;
+ and.b32 %r6815, %r2043, 2;
+ setp.eq.s32 %p1315, %r6815, 0;
+ @%p1315 bra $L__BB0_1554;
+
+ mov.f32 %f4553, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f4553, %f4550;
+
+$L__BB0_1554:
+ selp.f32 %f1818, %f5283, %f5284, %p27;
+ selp.f32 %f1819, %f5281, %f5282, %p27;
+ @%p1294 bra $L__BB0_1556;
+
+ add.f32 %f5994, %f1819, %f1818;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1556:
+ @%p1200 bra $L__BB0_1585;
+
+ shl.b32 %r6816, %r12, 5;
+ neg.s32 %r2045, %r6816;
+ setp.ge.s32 %p1319, %r11, %r2045;
+ @%p1319 bra $L__BB0_1570;
+
+ mul.f32 %f4556, %f5412, 0f3F22F983;
+ cvt.rni.s32.f32 %r8386, %f4556;
+ cvt.rn.f32.s32 %f4557, %r8386;
+ mov.f32 %f4558, 0fBFC90FDA;
+ fma.rn.f32 %f4559, %f4557, %f4558, %f5412;
+ mov.f32 %f4560, 0fB3A22168;
+ fma.rn.f32 %f4561, %f4557, %f4560, %f4559;
+ mov.f32 %f4562, 0fA7C234C5;
+ fma.rn.f32 %f5895, %f4557, %f4562, %f4561;
+ abs.f32 %f1827, %f5412;
+ setp.ltu.f32 %p1320, %f1827, 0f47CE4780;
+ @%p1320 bra $L__BB0_1566;
+
+ setp.eq.f32 %p1321, %f1827, 0f7F800000;
+ @%p1321 bra $L__BB0_1565;
+ bra.uni $L__BB0_1560;
+
+$L__BB0_1565:
+ mov.f32 %f4565, 0f00000000;
+ mul.rn.f32 %f5895, %f5412, %f4565;
+ mov.u32 %r8386, 0;
+ bra.uni $L__BB0_1566;
+
+$L__BB0_1560:
+ mov.b32 %r2047, %f5412;
+ shr.u32 %r6818, %r2047, 23;
+ and.b32 %r6819, %r6818, 255;
+ add.s32 %r2048, %r6819, -128;
+ shl.b32 %r6820, %r2047, 8;
+ or.b32 %r2049, %r6820, -2147483648;
+ shr.u32 %r2050, %r2048, 5;
mov.u64 %rd2723, 0;
- mov.u32 %r8638, 0;
+ mov.u32 %r8383, 0;
mov.u64 %rd2721, __cudart_i2opi_f;
mov.u64 %rd2722, %rd1;
-$L__BB0_1581:
+$L__BB0_1561:
.pragma "nounroll";
- ld.global.nc.u32 %r6955, [%rd2721];
- mad.wide.u32 %rd2199, %r6955, %r2077, %rd2723;
- shr.u64 %rd2723, %rd2199, 32;
- st.local.u32 [%rd2722], %rd2199;
+ ld.global.nc.u32 %r6821, [%rd2721];
+ mad.wide.u32 %rd2225, %r6821, %r2049, %rd2723;
+ shr.u64 %rd2723, %rd2225, 32;
+ st.local.u32 [%rd2722], %rd2225;
add.s64 %rd2722, %rd2722, 4;
add.s64 %rd2721, %rd2721, 4;
- add.s32 %r8638, %r8638, 1;
- setp.ne.s32 %p1336, %r8638, 6;
- @%p1336 bra $L__BB0_1581;
-
- st.local.u32 [%rd5], %rd2723;
- mov.u32 %r6956, 4;
- sub.s32 %r2081, %r6956, %r2078;
- mov.u32 %r6957, 6;
- sub.s32 %r6958, %r6957, %r2078;
- mul.wide.s32 %rd2200, %r6958, 4;
- add.s64 %rd2201, %rd1, %rd2200;
- ld.local.u32 %r8639, [%rd2201];
- ld.local.u32 %r8640, [%rd2201+-4];
- and.b32 %r2084, %r2076, 31;
- setp.eq.s32 %p1337, %r2084, 0;
- @%p1337 bra $L__BB0_1584;
-
- mov.u32 %r6959, 32;
- sub.s32 %r6960, %r6959, %r2084;
- shr.u32 %r6961, %r8640, %r6960;
- shl.b32 %r6962, %r8639, %r2084;
- add.s32 %r8639, %r6961, %r6962;
- mul.wide.s32 %rd2202, %r2081, 4;
- add.s64 %rd2203, %rd1, %rd2202;
- ld.local.u32 %r6963, [%rd2203];
- shr.u32 %r6964, %r6963, %r6960;
- shl.b32 %r6965, %r8640, %r2084;
- add.s32 %r8640, %r6964, %r6965;
-
-$L__BB0_1584:
- and.b32 %r6966, %r2075, -2147483648;
- shr.u32 %r6967, %r8640, 30;
- shl.b32 %r6968, %r8639, 2;
- or.b32 %r6969, %r6967, %r6968;
- shr.u32 %r6970, %r6969, 31;
- shr.u32 %r6971, %r8639, 30;
- add.s32 %r6972, %r6970, %r6971;
- neg.s32 %r6973, %r6972;
- setp.eq.s32 %p1338, %r6966, 0;
- selp.b32 %r8641, %r6972, %r6973, %p1338;
- setp.ne.s32 %p1339, %r6970, 0;
- xor.b32 %r6974, %r6966, -2147483648;
- selp.b32 %r6975, %r6974, %r6966, %p1339;
- selp.b32 %r6976, -1, 0, %p1339;
- xor.b32 %r6977, %r6969, %r6976;
- shl.b32 %r6978, %r8640, 2;
- xor.b32 %r6979, %r6978, %r6976;
- cvt.u64.u32 %rd2204, %r6977;
- cvt.u64.u32 %rd2205, %r6979;
- bfi.b64 %rd2206, %rd2204, %rd2205, 32, 32;
- cvt.rn.f64.s64 %fd207, %rd2206;
- mul.f64 %fd208, %fd207, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4484, %fd208;
- setp.eq.s32 %p1340, %r6975, 0;
- neg.f32 %f4485, %f4484;
- selp.f32 %f5794, %f4484, %f4485, %p1340;
-
-$L__BB0_1586:
- add.s32 %r2091, %r8641, 1;
- and.b32 %r2092, %r2091, 1;
- setp.eq.s32 %p1341, %r2092, 0;
- selp.f32 %f1778, %f5794, 0f3F800000, %p1341;
- mul.rn.f32 %f1779, %f5794, %f5794;
- mov.f32 %f5795, 0fB94D4153;
- @%p1341 bra $L__BB0_1588;
-
- mov.f32 %f4488, 0fBAB607ED;
- mov.f32 %f4489, 0f37CBAC00;
- fma.rn.f32 %f5795, %f4489, %f1779, %f4488;
-
-$L__BB0_1588:
- selp.f32 %f4490, 0f3C0885E4, 0f3D2AAABB, %p1341;
- fma.rn.f32 %f4491, %f5795, %f1779, %f4490;
- selp.f32 %f4492, 0fBE2AAAA8, 0fBEFFFFFF, %p1341;
- fma.rn.f32 %f4493, %f4491, %f1779, %f4492;
- mov.f32 %f4494, 0f00000000;
- fma.rn.f32 %f4495, %f1779, %f1778, %f4494;
- fma.rn.f32 %f5215, %f4493, %f4495, %f1778;
- and.b32 %r6981, %r2091, 2;
- setp.eq.s32 %p1343, %r6981, 0;
- @%p1343 bra $L__BB0_1590;
-
- mov.f32 %f4497, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f4497, %f4494;
-
-$L__BB0_1590:
- selp.f32 %f1786, %f5215, %f5216, %p27;
- selp.f32 %f1787, %f5213, %f5214, %p27;
- @%p1322 bra $L__BB0_1592;
-
- add.f32 %f5902, %f1787, %f1786;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1592:
- @%p1222 bra $L__BB0_1621;
-
- shl.b32 %r6983, %r12, 5;
- neg.s32 %r2093, %r6983;
- setp.ge.s32 %p1347, %r14, %r2093;
- @%p1347 bra $L__BB0_1606;
-
- mul.f32 %f4500, %f5344, 0f3F22F983;
- cvt.rni.s32.f32 %r8645, %f4500;
- cvt.rn.f32.s32 %f4501, %r8645;
- mov.f32 %f4502, 0fBFC90FDA;
- fma.rn.f32 %f4503, %f4501, %f4502, %f5344;
- mov.f32 %f4504, 0fB3A22168;
- fma.rn.f32 %f4505, %f4501, %f4504, %f4503;
- mov.f32 %f4506, 0fA7C234C5;
- fma.rn.f32 %f5803, %f4501, %f4506, %f4505;
- abs.f32 %f1795, %f5344;
- setp.ltu.f32 %p1348, %f1795, 0f47CE4780;
- @%p1348 bra $L__BB0_1602;
-
- setp.eq.f32 %p1349, %f1795, 0f7F800000;
- @%p1349 bra $L__BB0_1601;
- bra.uni $L__BB0_1596;
-
-$L__BB0_1601:
- mov.f32 %f4509, 0f00000000;
- mul.rn.f32 %f5803, %f5344, %f4509;
- mov.u32 %r8645, 0;
- bra.uni $L__BB0_1602;
-
-$L__BB0_1596:
- mov.b32 %r2095, %f5344;
- shr.u32 %r6985, %r2095, 23;
- and.b32 %r6986, %r6985, 255;
- add.s32 %r2096, %r6986, -128;
- shl.b32 %r6987, %r2095, 8;
- or.b32 %r2097, %r6987, -2147483648;
- shr.u32 %r2098, %r2096, 5;
+ add.s32 %r8383, %r8383, 1;
+ setp.ne.s32 %p1322, %r8383, 6;
+ @%p1322 bra $L__BB0_1561;
+
+ st.local.u32 [%rd4], %rd2723;
+ mov.u32 %r6822, 4;
+ sub.s32 %r2053, %r6822, %r2050;
+ mov.u32 %r6823, 6;
+ sub.s32 %r6824, %r6823, %r2050;
+ mul.wide.s32 %rd2226, %r6824, 4;
+ add.s64 %rd2227, %rd1, %rd2226;
+ ld.local.u32 %r8384, [%rd2227];
+ ld.local.u32 %r8385, [%rd2227+-4];
+ and.b32 %r2056, %r2048, 31;
+ setp.eq.s32 %p1323, %r2056, 0;
+ @%p1323 bra $L__BB0_1564;
+
+ mov.u32 %r6825, 32;
+ sub.s32 %r6826, %r6825, %r2056;
+ shr.u32 %r6827, %r8385, %r6826;
+ shl.b32 %r6828, %r8384, %r2056;
+ add.s32 %r8384, %r6827, %r6828;
+ mul.wide.s32 %rd2228, %r2053, 4;
+ add.s64 %rd2229, %rd1, %rd2228;
+ ld.local.u32 %r6829, [%rd2229];
+ shr.u32 %r6830, %r6829, %r6826;
+ shl.b32 %r6831, %r8385, %r2056;
+ add.s32 %r8385, %r6830, %r6831;
+
+$L__BB0_1564:
+ and.b32 %r6832, %r2047, -2147483648;
+ shr.u32 %r6833, %r8385, 30;
+ shl.b32 %r6834, %r8384, 2;
+ or.b32 %r6835, %r6833, %r6834;
+ shr.u32 %r6836, %r6835, 31;
+ shr.u32 %r6837, %r8384, 30;
+ add.s32 %r6838, %r6836, %r6837;
+ neg.s32 %r6839, %r6838;
+ setp.eq.s32 %p1324, %r6832, 0;
+ selp.b32 %r8386, %r6838, %r6839, %p1324;
+ setp.ne.s32 %p1325, %r6836, 0;
+ xor.b32 %r6840, %r6832, -2147483648;
+ selp.b32 %r6841, %r6840, %r6832, %p1325;
+ selp.b32 %r6842, -1, 0, %p1325;
+ xor.b32 %r6843, %r6835, %r6842;
+ shl.b32 %r6844, %r8385, 2;
+ xor.b32 %r6845, %r6844, %r6842;
+ cvt.u64.u32 %rd2230, %r6843;
+ cvt.u64.u32 %rd2231, %r6845;
+ bfi.b64 %rd2232, %rd2230, %rd2231, 32, 32;
+ cvt.rn.f64.s64 %fd209, %rd2232;
+ mul.f64 %fd210, %fd209, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4563, %fd210;
+ setp.eq.s32 %p1326, %r6841, 0;
+ neg.f32 %f4564, %f4563;
+ selp.f32 %f5895, %f4563, %f4564, %p1326;
+
+$L__BB0_1566:
+ and.b32 %r2063, %r8386, 1;
+ setp.eq.s32 %p1327, %r2063, 0;
+ selp.f32 %f1831, %f5895, 0f3F800000, %p1327;
+ mul.rn.f32 %f1832, %f5895, %f5895;
+ mov.f32 %f5896, 0fB94D4153;
+ @%p1327 bra $L__BB0_1568;
+
+ mov.f32 %f4567, 0fBAB607ED;
+ mov.f32 %f4568, 0f37CBAC00;
+ fma.rn.f32 %f5896, %f4568, %f1832, %f4567;
+
+$L__BB0_1568:
+ selp.f32 %f4569, 0f3C0885E4, 0f3D2AAABB, %p1327;
+ fma.rn.f32 %f4570, %f5896, %f1832, %f4569;
+ selp.f32 %f4571, 0fBE2AAAA8, 0fBEFFFFFF, %p1327;
+ fma.rn.f32 %f4572, %f4570, %f1832, %f4571;
+ mov.f32 %f4573, 0f00000000;
+ fma.rn.f32 %f4574, %f1832, %f1831, %f4573;
+ fma.rn.f32 %f5281, %f4572, %f4574, %f1831;
+ and.b32 %r6847, %r8386, 2;
+ setp.eq.s32 %p1329, %r6847, 0;
+ @%p1329 bra $L__BB0_1570;
+
+ mov.f32 %f4576, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f4576, %f4573;
+
+$L__BB0_1570:
+ setp.lt.s32 %p28, %r11, %r2045;
+ @%p1319 bra $L__BB0_1583;
+
+ mul.f32 %f4577, %f5404, 0f3F22F983;
+ cvt.rni.s32.f32 %r8390, %f4577;
+ cvt.rn.f32.s32 %f4578, %r8390;
+ mov.f32 %f4579, 0fBFC90FDA;
+ fma.rn.f32 %f4580, %f4578, %f4579, %f5404;
+ mov.f32 %f4581, 0fB3A22168;
+ fma.rn.f32 %f4582, %f4578, %f4581, %f4580;
+ mov.f32 %f4583, 0fA7C234C5;
+ fma.rn.f32 %f5899, %f4578, %f4583, %f4582;
+ abs.f32 %f1840, %f5404;
+ setp.ltu.f32 %p1331, %f1840, 0f47CE4780;
+ @%p1331 bra $L__BB0_1579;
+
+ setp.eq.f32 %p1332, %f1840, 0f7F800000;
+ @%p1332 bra $L__BB0_1578;
+ bra.uni $L__BB0_1573;
+
+$L__BB0_1578:
+ mov.f32 %f4586, 0f00000000;
+ mul.rn.f32 %f5899, %f5404, %f4586;
+ mov.u32 %r8390, 0;
+ bra.uni $L__BB0_1579;
+
+$L__BB0_1573:
+ mov.b32 %r2065, %f5404;
+ shr.u32 %r6849, %r2065, 23;
+ and.b32 %r6850, %r6849, 255;
+ add.s32 %r2066, %r6850, -128;
+ shl.b32 %r6851, %r2065, 8;
+ or.b32 %r2067, %r6851, -2147483648;
+ shr.u32 %r2068, %r2066, 5;
mov.u64 %rd2726, 0;
- mov.u32 %r8642, 0;
+ mov.u32 %r8387, 0;
mov.u64 %rd2724, __cudart_i2opi_f;
mov.u64 %rd2725, %rd1;
-$L__BB0_1597:
+$L__BB0_1574:
.pragma "nounroll";
- ld.global.nc.u32 %r6988, [%rd2724];
- mad.wide.u32 %rd2209, %r6988, %r2097, %rd2726;
- shr.u64 %rd2726, %rd2209, 32;
- st.local.u32 [%rd2725], %rd2209;
+ ld.global.nc.u32 %r6852, [%rd2724];
+ mad.wide.u32 %rd2235, %r6852, %r2067, %rd2726;
+ shr.u64 %rd2726, %rd2235, 32;
+ st.local.u32 [%rd2725], %rd2235;
add.s64 %rd2725, %rd2725, 4;
add.s64 %rd2724, %rd2724, 4;
- add.s32 %r8642, %r8642, 1;
- setp.ne.s32 %p1350, %r8642, 6;
- @%p1350 bra $L__BB0_1597;
-
- st.local.u32 [%rd5], %rd2726;
- mov.u32 %r6989, 4;
- sub.s32 %r2101, %r6989, %r2098;
- mov.u32 %r6990, 6;
- sub.s32 %r6991, %r6990, %r2098;
- mul.wide.s32 %rd2210, %r6991, 4;
- add.s64 %rd2211, %rd1, %rd2210;
- ld.local.u32 %r8643, [%rd2211];
- ld.local.u32 %r8644, [%rd2211+-4];
- and.b32 %r2104, %r2096, 31;
- setp.eq.s32 %p1351, %r2104, 0;
- @%p1351 bra $L__BB0_1600;
-
- mov.u32 %r6992, 32;
- sub.s32 %r6993, %r6992, %r2104;
- shr.u32 %r6994, %r8644, %r6993;
- shl.b32 %r6995, %r8643, %r2104;
- add.s32 %r8643, %r6994, %r6995;
- mul.wide.s32 %rd2212, %r2101, 4;
- add.s64 %rd2213, %rd1, %rd2212;
- ld.local.u32 %r6996, [%rd2213];
- shr.u32 %r6997, %r6996, %r6993;
- shl.b32 %r6998, %r8644, %r2104;
- add.s32 %r8644, %r6997, %r6998;
-
-$L__BB0_1600:
- and.b32 %r6999, %r2095, -2147483648;
- shr.u32 %r7000, %r8644, 30;
- shl.b32 %r7001, %r8643, 2;
- or.b32 %r7002, %r7000, %r7001;
- shr.u32 %r7003, %r7002, 31;
- shr.u32 %r7004, %r8643, 30;
- add.s32 %r7005, %r7003, %r7004;
- neg.s32 %r7006, %r7005;
- setp.eq.s32 %p1352, %r6999, 0;
- selp.b32 %r8645, %r7005, %r7006, %p1352;
- setp.ne.s32 %p1353, %r7003, 0;
- xor.b32 %r7007, %r6999, -2147483648;
- selp.b32 %r7008, %r7007, %r6999, %p1353;
- selp.b32 %r7009, -1, 0, %p1353;
- xor.b32 %r7010, %r7002, %r7009;
- shl.b32 %r7011, %r8644, 2;
- xor.b32 %r7012, %r7011, %r7009;
- cvt.u64.u32 %rd2214, %r7010;
- cvt.u64.u32 %rd2215, %r7012;
- bfi.b64 %rd2216, %rd2214, %rd2215, 32, 32;
- cvt.rn.f64.s64 %fd209, %rd2216;
- mul.f64 %fd210, %fd209, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4507, %fd210;
- setp.eq.s32 %p1354, %r7008, 0;
- neg.f32 %f4508, %f4507;
- selp.f32 %f5803, %f4507, %f4508, %p1354;
-
-$L__BB0_1602:
- and.b32 %r2111, %r8645, 1;
- setp.eq.s32 %p1355, %r2111, 0;
- selp.f32 %f1799, %f5803, 0f3F800000, %p1355;
- mul.rn.f32 %f1800, %f5803, %f5803;
- mov.f32 %f5804, 0fB94D4153;
- @%p1355 bra $L__BB0_1604;
-
- mov.f32 %f4511, 0fBAB607ED;
- mov.f32 %f4512, 0f37CBAC00;
- fma.rn.f32 %f5804, %f4512, %f1800, %f4511;
-
-$L__BB0_1604:
- selp.f32 %f4513, 0f3C0885E4, 0f3D2AAABB, %p1355;
- fma.rn.f32 %f4514, %f5804, %f1800, %f4513;
- selp.f32 %f4515, 0fBE2AAAA8, 0fBEFFFFFF, %p1355;
- fma.rn.f32 %f4516, %f4514, %f1800, %f4515;
- mov.f32 %f4517, 0f00000000;
- fma.rn.f32 %f4518, %f1800, %f1799, %f4517;
- fma.rn.f32 %f5213, %f4516, %f4518, %f1799;
- and.b32 %r7014, %r8645, 2;
- setp.eq.s32 %p1357, %r7014, 0;
- @%p1357 bra $L__BB0_1606;
-
- mov.f32 %f4520, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f4520, %f4517;
-
-$L__BB0_1606:
- setp.lt.s32 %p28, %r14, %r2093;
- @%p1347 bra $L__BB0_1619;
-
- mul.f32 %f4521, %f5336, 0f3F22F983;
- cvt.rni.s32.f32 %r8649, %f4521;
- cvt.rn.f32.s32 %f4522, %r8649;
- mov.f32 %f4523, 0fBFC90FDA;
- fma.rn.f32 %f4524, %f4522, %f4523, %f5336;
- mov.f32 %f4525, 0fB3A22168;
- fma.rn.f32 %f4526, %f4522, %f4525, %f4524;
- mov.f32 %f4527, 0fA7C234C5;
- fma.rn.f32 %f5807, %f4522, %f4527, %f4526;
- abs.f32 %f1808, %f5336;
- setp.ltu.f32 %p1359, %f1808, 0f47CE4780;
- @%p1359 bra $L__BB0_1615;
-
- setp.eq.f32 %p1360, %f1808, 0f7F800000;
- @%p1360 bra $L__BB0_1614;
- bra.uni $L__BB0_1609;
-
-$L__BB0_1614:
- mov.f32 %f4530, 0f00000000;
- mul.rn.f32 %f5807, %f5336, %f4530;
- mov.u32 %r8649, 0;
- bra.uni $L__BB0_1615;
-
-$L__BB0_1609:
- mov.b32 %r2113, %f5336;
- shr.u32 %r7016, %r2113, 23;
- and.b32 %r7017, %r7016, 255;
- add.s32 %r2114, %r7017, -128;
- shl.b32 %r7018, %r2113, 8;
- or.b32 %r2115, %r7018, -2147483648;
- shr.u32 %r2116, %r2114, 5;
+ add.s32 %r8387, %r8387, 1;
+ setp.ne.s32 %p1333, %r8387, 6;
+ @%p1333 bra $L__BB0_1574;
+
+ st.local.u32 [%rd4], %rd2726;
+ mov.u32 %r6853, 4;
+ sub.s32 %r2071, %r6853, %r2068;
+ mov.u32 %r6854, 6;
+ sub.s32 %r6855, %r6854, %r2068;
+ mul.wide.s32 %rd2236, %r6855, 4;
+ add.s64 %rd2237, %rd1, %rd2236;
+ ld.local.u32 %r8388, [%rd2237];
+ ld.local.u32 %r8389, [%rd2237+-4];
+ and.b32 %r2074, %r2066, 31;
+ setp.eq.s32 %p1334, %r2074, 0;
+ @%p1334 bra $L__BB0_1577;
+
+ mov.u32 %r6856, 32;
+ sub.s32 %r6857, %r6856, %r2074;
+ shr.u32 %r6858, %r8389, %r6857;
+ shl.b32 %r6859, %r8388, %r2074;
+ add.s32 %r8388, %r6858, %r6859;
+ mul.wide.s32 %rd2238, %r2071, 4;
+ add.s64 %rd2239, %rd1, %rd2238;
+ ld.local.u32 %r6860, [%rd2239];
+ shr.u32 %r6861, %r6860, %r6857;
+ shl.b32 %r6862, %r8389, %r2074;
+ add.s32 %r8389, %r6861, %r6862;
+
+$L__BB0_1577:
+ and.b32 %r6863, %r2065, -2147483648;
+ shr.u32 %r6864, %r8389, 30;
+ shl.b32 %r6865, %r8388, 2;
+ or.b32 %r6866, %r6864, %r6865;
+ shr.u32 %r6867, %r6866, 31;
+ shr.u32 %r6868, %r8388, 30;
+ add.s32 %r6869, %r6867, %r6868;
+ neg.s32 %r6870, %r6869;
+ setp.eq.s32 %p1335, %r6863, 0;
+ selp.b32 %r8390, %r6869, %r6870, %p1335;
+ setp.ne.s32 %p1336, %r6867, 0;
+ xor.b32 %r6871, %r6863, -2147483648;
+ selp.b32 %r6872, %r6871, %r6863, %p1336;
+ selp.b32 %r6873, -1, 0, %p1336;
+ xor.b32 %r6874, %r6866, %r6873;
+ shl.b32 %r6875, %r8389, 2;
+ xor.b32 %r6876, %r6875, %r6873;
+ cvt.u64.u32 %rd2240, %r6874;
+ cvt.u64.u32 %rd2241, %r6876;
+ bfi.b64 %rd2242, %rd2240, %rd2241, 32, 32;
+ cvt.rn.f64.s64 %fd211, %rd2242;
+ mul.f64 %fd212, %fd211, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4584, %fd212;
+ setp.eq.s32 %p1337, %r6872, 0;
+ neg.f32 %f4585, %f4584;
+ selp.f32 %f5899, %f4584, %f4585, %p1337;
+
+$L__BB0_1579:
+ add.s32 %r2081, %r8390, 1;
+ and.b32 %r2082, %r2081, 1;
+ setp.eq.s32 %p1338, %r2082, 0;
+ selp.f32 %f1844, %f5899, 0f3F800000, %p1338;
+ mul.rn.f32 %f1845, %f5899, %f5899;
+ mov.f32 %f5900, 0fB94D4153;
+ @%p1338 bra $L__BB0_1581;
+
+ mov.f32 %f4588, 0fBAB607ED;
+ mov.f32 %f4589, 0f37CBAC00;
+ fma.rn.f32 %f5900, %f4589, %f1845, %f4588;
+
+$L__BB0_1581:
+ selp.f32 %f4590, 0f3C0885E4, 0f3D2AAABB, %p1338;
+ fma.rn.f32 %f4591, %f5900, %f1845, %f4590;
+ selp.f32 %f4592, 0fBE2AAAA8, 0fBEFFFFFF, %p1338;
+ fma.rn.f32 %f4593, %f4591, %f1845, %f4592;
+ mov.f32 %f4594, 0f00000000;
+ fma.rn.f32 %f4595, %f1845, %f1844, %f4594;
+ fma.rn.f32 %f5283, %f4593, %f4595, %f1844;
+ and.b32 %r6878, %r2081, 2;
+ setp.eq.s32 %p1340, %r6878, 0;
+ @%p1340 bra $L__BB0_1583;
+
+ mov.f32 %f4597, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f4597, %f4594;
+
+$L__BB0_1583:
+ selp.f32 %f1852, %f5283, %f5284, %p28;
+ selp.f32 %f1853, %f5281, %f5282, %p28;
+ @%p1319 bra $L__BB0_1585;
+
+ add.f32 %f5993, %f1853, %f1852;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1585:
+ @%p1200 bra $L__BB0_1614;
+
+ shl.b32 %r6879, %r12, 5;
+ mov.u32 %r6880, -32;
+ sub.s32 %r2083, %r6880, %r6879;
+ setp.ge.s32 %p1344, %r11, %r2083;
+ @%p1344 bra $L__BB0_1599;
+
+ mul.f32 %f4600, %f5411, 0f3F22F983;
+ cvt.rni.s32.f32 %r8394, %f4600;
+ cvt.rn.f32.s32 %f4601, %r8394;
+ mov.f32 %f4602, 0fBFC90FDA;
+ fma.rn.f32 %f4603, %f4601, %f4602, %f5411;
+ mov.f32 %f4604, 0fB3A22168;
+ fma.rn.f32 %f4605, %f4601, %f4604, %f4603;
+ mov.f32 %f4606, 0fA7C234C5;
+ fma.rn.f32 %f5908, %f4601, %f4606, %f4605;
+ abs.f32 %f1861, %f5411;
+ setp.ltu.f32 %p1345, %f1861, 0f47CE4780;
+ @%p1345 bra $L__BB0_1595;
+
+ setp.eq.f32 %p1346, %f1861, 0f7F800000;
+ @%p1346 bra $L__BB0_1594;
+ bra.uni $L__BB0_1589;
+
+$L__BB0_1594:
+ mov.f32 %f4609, 0f00000000;
+ mul.rn.f32 %f5908, %f5411, %f4609;
+ mov.u32 %r8394, 0;
+ bra.uni $L__BB0_1595;
+
+$L__BB0_1589:
+ mov.b32 %r2085, %f5411;
+ shr.u32 %r6882, %r2085, 23;
+ and.b32 %r6883, %r6882, 255;
+ add.s32 %r2086, %r6883, -128;
+ shl.b32 %r6884, %r2085, 8;
+ or.b32 %r2087, %r6884, -2147483648;
+ shr.u32 %r2088, %r2086, 5;
mov.u64 %rd2729, 0;
- mov.u32 %r8646, 0;
+ mov.u32 %r8391, 0;
mov.u64 %rd2727, __cudart_i2opi_f;
mov.u64 %rd2728, %rd1;
-$L__BB0_1610:
+$L__BB0_1590:
.pragma "nounroll";
- ld.global.nc.u32 %r7019, [%rd2727];
- mad.wide.u32 %rd2219, %r7019, %r2115, %rd2729;
- shr.u64 %rd2729, %rd2219, 32;
- st.local.u32 [%rd2728], %rd2219;
+ ld.global.nc.u32 %r6885, [%rd2727];
+ mad.wide.u32 %rd2245, %r6885, %r2087, %rd2729;
+ shr.u64 %rd2729, %rd2245, 32;
+ st.local.u32 [%rd2728], %rd2245;
add.s64 %rd2728, %rd2728, 4;
add.s64 %rd2727, %rd2727, 4;
- add.s32 %r8646, %r8646, 1;
- setp.ne.s32 %p1361, %r8646, 6;
- @%p1361 bra $L__BB0_1610;
-
- st.local.u32 [%rd5], %rd2729;
- mov.u32 %r7020, 4;
- sub.s32 %r2119, %r7020, %r2116;
- mov.u32 %r7021, 6;
- sub.s32 %r7022, %r7021, %r2116;
- mul.wide.s32 %rd2220, %r7022, 4;
- add.s64 %rd2221, %rd1, %rd2220;
- ld.local.u32 %r8647, [%rd2221];
- ld.local.u32 %r8648, [%rd2221+-4];
- and.b32 %r2122, %r2114, 31;
- setp.eq.s32 %p1362, %r2122, 0;
- @%p1362 bra $L__BB0_1613;
-
- mov.u32 %r7023, 32;
- sub.s32 %r7024, %r7023, %r2122;
- shr.u32 %r7025, %r8648, %r7024;
- shl.b32 %r7026, %r8647, %r2122;
- add.s32 %r8647, %r7025, %r7026;
- mul.wide.s32 %rd2222, %r2119, 4;
- add.s64 %rd2223, %rd1, %rd2222;
- ld.local.u32 %r7027, [%rd2223];
- shr.u32 %r7028, %r7027, %r7024;
- shl.b32 %r7029, %r8648, %r2122;
- add.s32 %r8648, %r7028, %r7029;
-
-$L__BB0_1613:
- and.b32 %r7030, %r2113, -2147483648;
- shr.u32 %r7031, %r8648, 30;
- shl.b32 %r7032, %r8647, 2;
- or.b32 %r7033, %r7031, %r7032;
- shr.u32 %r7034, %r7033, 31;
- shr.u32 %r7035, %r8647, 30;
- add.s32 %r7036, %r7034, %r7035;
- neg.s32 %r7037, %r7036;
- setp.eq.s32 %p1363, %r7030, 0;
- selp.b32 %r8649, %r7036, %r7037, %p1363;
- setp.ne.s32 %p1364, %r7034, 0;
- xor.b32 %r7038, %r7030, -2147483648;
- selp.b32 %r7039, %r7038, %r7030, %p1364;
- selp.b32 %r7040, -1, 0, %p1364;
- xor.b32 %r7041, %r7033, %r7040;
- shl.b32 %r7042, %r8648, 2;
- xor.b32 %r7043, %r7042, %r7040;
- cvt.u64.u32 %rd2224, %r7041;
- cvt.u64.u32 %rd2225, %r7043;
- bfi.b64 %rd2226, %rd2224, %rd2225, 32, 32;
- cvt.rn.f64.s64 %fd211, %rd2226;
- mul.f64 %fd212, %fd211, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4528, %fd212;
- setp.eq.s32 %p1365, %r7039, 0;
- neg.f32 %f4529, %f4528;
- selp.f32 %f5807, %f4528, %f4529, %p1365;
-
-$L__BB0_1615:
- add.s32 %r2129, %r8649, 1;
- and.b32 %r2130, %r2129, 1;
- setp.eq.s32 %p1366, %r2130, 0;
- selp.f32 %f1812, %f5807, 0f3F800000, %p1366;
- mul.rn.f32 %f1813, %f5807, %f5807;
- mov.f32 %f5808, 0fB94D4153;
- @%p1366 bra $L__BB0_1617;
-
- mov.f32 %f4532, 0fBAB607ED;
- mov.f32 %f4533, 0f37CBAC00;
- fma.rn.f32 %f5808, %f4533, %f1813, %f4532;
-
-$L__BB0_1617:
- selp.f32 %f4534, 0f3C0885E4, 0f3D2AAABB, %p1366;
- fma.rn.f32 %f4535, %f5808, %f1813, %f4534;
- selp.f32 %f4536, 0fBE2AAAA8, 0fBEFFFFFF, %p1366;
- fma.rn.f32 %f4537, %f4535, %f1813, %f4536;
- mov.f32 %f4538, 0f00000000;
- fma.rn.f32 %f4539, %f1813, %f1812, %f4538;
- fma.rn.f32 %f5215, %f4537, %f4539, %f1812;
- and.b32 %r7045, %r2129, 2;
- setp.eq.s32 %p1368, %r7045, 0;
- @%p1368 bra $L__BB0_1619;
-
- mov.f32 %f4541, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f4541, %f4538;
-
-$L__BB0_1619:
- selp.f32 %f1820, %f5215, %f5216, %p28;
- selp.f32 %f1821, %f5213, %f5214, %p28;
- @%p1347 bra $L__BB0_1621;
-
- add.f32 %f5901, %f1821, %f1820;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1621:
- @%p1222 bra $L__BB0_1650;
-
- shl.b32 %r7047, %r12, 5;
- mov.u32 %r7048, -32;
- sub.s32 %r2131, %r7048, %r7047;
- setp.ge.s32 %p1372, %r14, %r2131;
- @%p1372 bra $L__BB0_1635;
-
- mul.f32 %f4544, %f5343, 0f3F22F983;
- cvt.rni.s32.f32 %r8653, %f4544;
- cvt.rn.f32.s32 %f4545, %r8653;
- mov.f32 %f4546, 0fBFC90FDA;
- fma.rn.f32 %f4547, %f4545, %f4546, %f5343;
- mov.f32 %f4548, 0fB3A22168;
- fma.rn.f32 %f4549, %f4545, %f4548, %f4547;
- mov.f32 %f4550, 0fA7C234C5;
- fma.rn.f32 %f5816, %f4545, %f4550, %f4549;
- abs.f32 %f1829, %f5343;
- setp.ltu.f32 %p1373, %f1829, 0f47CE4780;
- @%p1373 bra $L__BB0_1631;
-
- setp.eq.f32 %p1374, %f1829, 0f7F800000;
- @%p1374 bra $L__BB0_1630;
- bra.uni $L__BB0_1625;
-
-$L__BB0_1630:
- mov.f32 %f4553, 0f00000000;
- mul.rn.f32 %f5816, %f5343, %f4553;
- mov.u32 %r8653, 0;
- bra.uni $L__BB0_1631;
-
-$L__BB0_1625:
- mov.b32 %r2133, %f5343;
- shr.u32 %r7050, %r2133, 23;
- and.b32 %r7051, %r7050, 255;
- add.s32 %r2134, %r7051, -128;
- shl.b32 %r7052, %r2133, 8;
- or.b32 %r2135, %r7052, -2147483648;
- shr.u32 %r2136, %r2134, 5;
+ add.s32 %r8391, %r8391, 1;
+ setp.ne.s32 %p1347, %r8391, 6;
+ @%p1347 bra $L__BB0_1590;
+
+ st.local.u32 [%rd4], %rd2729;
+ mov.u32 %r6886, 4;
+ sub.s32 %r2091, %r6886, %r2088;
+ mov.u32 %r6887, 6;
+ sub.s32 %r6888, %r6887, %r2088;
+ mul.wide.s32 %rd2246, %r6888, 4;
+ add.s64 %rd2247, %rd1, %rd2246;
+ ld.local.u32 %r8392, [%rd2247];
+ ld.local.u32 %r8393, [%rd2247+-4];
+ and.b32 %r2094, %r2086, 31;
+ setp.eq.s32 %p1348, %r2094, 0;
+ @%p1348 bra $L__BB0_1593;
+
+ mov.u32 %r6889, 32;
+ sub.s32 %r6890, %r6889, %r2094;
+ shr.u32 %r6891, %r8393, %r6890;
+ shl.b32 %r6892, %r8392, %r2094;
+ add.s32 %r8392, %r6891, %r6892;
+ mul.wide.s32 %rd2248, %r2091, 4;
+ add.s64 %rd2249, %rd1, %rd2248;
+ ld.local.u32 %r6893, [%rd2249];
+ shr.u32 %r6894, %r6893, %r6890;
+ shl.b32 %r6895, %r8393, %r2094;
+ add.s32 %r8393, %r6894, %r6895;
+
+$L__BB0_1593:
+ and.b32 %r6896, %r2085, -2147483648;
+ shr.u32 %r6897, %r8393, 30;
+ shl.b32 %r6898, %r8392, 2;
+ or.b32 %r6899, %r6897, %r6898;
+ shr.u32 %r6900, %r6899, 31;
+ shr.u32 %r6901, %r8392, 30;
+ add.s32 %r6902, %r6900, %r6901;
+ neg.s32 %r6903, %r6902;
+ setp.eq.s32 %p1349, %r6896, 0;
+ selp.b32 %r8394, %r6902, %r6903, %p1349;
+ setp.ne.s32 %p1350, %r6900, 0;
+ xor.b32 %r6904, %r6896, -2147483648;
+ selp.b32 %r6905, %r6904, %r6896, %p1350;
+ selp.b32 %r6906, -1, 0, %p1350;
+ xor.b32 %r6907, %r6899, %r6906;
+ shl.b32 %r6908, %r8393, 2;
+ xor.b32 %r6909, %r6908, %r6906;
+ cvt.u64.u32 %rd2250, %r6907;
+ cvt.u64.u32 %rd2251, %r6909;
+ bfi.b64 %rd2252, %rd2250, %rd2251, 32, 32;
+ cvt.rn.f64.s64 %fd213, %rd2252;
+ mul.f64 %fd214, %fd213, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4607, %fd214;
+ setp.eq.s32 %p1351, %r6905, 0;
+ neg.f32 %f4608, %f4607;
+ selp.f32 %f5908, %f4607, %f4608, %p1351;
+
+$L__BB0_1595:
+ and.b32 %r2101, %r8394, 1;
+ setp.eq.s32 %p1352, %r2101, 0;
+ selp.f32 %f1865, %f5908, 0f3F800000, %p1352;
+ mul.rn.f32 %f1866, %f5908, %f5908;
+ mov.f32 %f5909, 0fB94D4153;
+ @%p1352 bra $L__BB0_1597;
+
+ mov.f32 %f4611, 0fBAB607ED;
+ mov.f32 %f4612, 0f37CBAC00;
+ fma.rn.f32 %f5909, %f4612, %f1866, %f4611;
+
+$L__BB0_1597:
+ selp.f32 %f4613, 0f3C0885E4, 0f3D2AAABB, %p1352;
+ fma.rn.f32 %f4614, %f5909, %f1866, %f4613;
+ selp.f32 %f4615, 0fBE2AAAA8, 0fBEFFFFFF, %p1352;
+ fma.rn.f32 %f4616, %f4614, %f1866, %f4615;
+ mov.f32 %f4617, 0f00000000;
+ fma.rn.f32 %f4618, %f1866, %f1865, %f4617;
+ fma.rn.f32 %f5281, %f4616, %f4618, %f1865;
+ and.b32 %r6911, %r8394, 2;
+ setp.eq.s32 %p1354, %r6911, 0;
+ @%p1354 bra $L__BB0_1599;
+
+ mov.f32 %f4620, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f4620, %f4617;
+
+$L__BB0_1599:
+ setp.lt.s32 %p29, %r11, %r2083;
+ @%p1344 bra $L__BB0_1612;
+
+ mul.f32 %f4621, %f5403, 0f3F22F983;
+ cvt.rni.s32.f32 %r8398, %f4621;
+ cvt.rn.f32.s32 %f4622, %r8398;
+ mov.f32 %f4623, 0fBFC90FDA;
+ fma.rn.f32 %f4624, %f4622, %f4623, %f5403;
+ mov.f32 %f4625, 0fB3A22168;
+ fma.rn.f32 %f4626, %f4622, %f4625, %f4624;
+ mov.f32 %f4627, 0fA7C234C5;
+ fma.rn.f32 %f5912, %f4622, %f4627, %f4626;
+ abs.f32 %f1874, %f5403;
+ setp.ltu.f32 %p1356, %f1874, 0f47CE4780;
+ @%p1356 bra $L__BB0_1608;
+
+ setp.eq.f32 %p1357, %f1874, 0f7F800000;
+ @%p1357 bra $L__BB0_1607;
+ bra.uni $L__BB0_1602;
+
+$L__BB0_1607:
+ mov.f32 %f4630, 0f00000000;
+ mul.rn.f32 %f5912, %f5403, %f4630;
+ mov.u32 %r8398, 0;
+ bra.uni $L__BB0_1608;
+
+$L__BB0_1602:
+ mov.b32 %r2103, %f5403;
+ shr.u32 %r6913, %r2103, 23;
+ and.b32 %r6914, %r6913, 255;
+ add.s32 %r2104, %r6914, -128;
+ shl.b32 %r6915, %r2103, 8;
+ or.b32 %r2105, %r6915, -2147483648;
+ shr.u32 %r2106, %r2104, 5;
mov.u64 %rd2732, 0;
- mov.u32 %r8650, 0;
+ mov.u32 %r8395, 0;
mov.u64 %rd2730, __cudart_i2opi_f;
mov.u64 %rd2731, %rd1;
-$L__BB0_1626:
+$L__BB0_1603:
.pragma "nounroll";
- ld.global.nc.u32 %r7053, [%rd2730];
- mad.wide.u32 %rd2229, %r7053, %r2135, %rd2732;
- shr.u64 %rd2732, %rd2229, 32;
- st.local.u32 [%rd2731], %rd2229;
+ ld.global.nc.u32 %r6916, [%rd2730];
+ mad.wide.u32 %rd2255, %r6916, %r2105, %rd2732;
+ shr.u64 %rd2732, %rd2255, 32;
+ st.local.u32 [%rd2731], %rd2255;
add.s64 %rd2731, %rd2731, 4;
add.s64 %rd2730, %rd2730, 4;
- add.s32 %r8650, %r8650, 1;
- setp.ne.s32 %p1375, %r8650, 6;
- @%p1375 bra $L__BB0_1626;
-
- st.local.u32 [%rd5], %rd2732;
- mov.u32 %r7054, 4;
- sub.s32 %r2139, %r7054, %r2136;
- mov.u32 %r7055, 6;
- sub.s32 %r7056, %r7055, %r2136;
- mul.wide.s32 %rd2230, %r7056, 4;
- add.s64 %rd2231, %rd1, %rd2230;
- ld.local.u32 %r8651, [%rd2231];
- ld.local.u32 %r8652, [%rd2231+-4];
- and.b32 %r2142, %r2134, 31;
- setp.eq.s32 %p1376, %r2142, 0;
- @%p1376 bra $L__BB0_1629;
-
- mov.u32 %r7057, 32;
- sub.s32 %r7058, %r7057, %r2142;
- shr.u32 %r7059, %r8652, %r7058;
- shl.b32 %r7060, %r8651, %r2142;
- add.s32 %r8651, %r7059, %r7060;
- mul.wide.s32 %rd2232, %r2139, 4;
- add.s64 %rd2233, %rd1, %rd2232;
- ld.local.u32 %r7061, [%rd2233];
- shr.u32 %r7062, %r7061, %r7058;
- shl.b32 %r7063, %r8652, %r2142;
- add.s32 %r8652, %r7062, %r7063;
-
-$L__BB0_1629:
- and.b32 %r7064, %r2133, -2147483648;
- shr.u32 %r7065, %r8652, 30;
- shl.b32 %r7066, %r8651, 2;
- or.b32 %r7067, %r7065, %r7066;
- shr.u32 %r7068, %r7067, 31;
- shr.u32 %r7069, %r8651, 30;
- add.s32 %r7070, %r7068, %r7069;
- neg.s32 %r7071, %r7070;
- setp.eq.s32 %p1377, %r7064, 0;
- selp.b32 %r8653, %r7070, %r7071, %p1377;
- setp.ne.s32 %p1378, %r7068, 0;
- xor.b32 %r7072, %r7064, -2147483648;
- selp.b32 %r7073, %r7072, %r7064, %p1378;
- selp.b32 %r7074, -1, 0, %p1378;
- xor.b32 %r7075, %r7067, %r7074;
- shl.b32 %r7076, %r8652, 2;
- xor.b32 %r7077, %r7076, %r7074;
- cvt.u64.u32 %rd2234, %r7075;
- cvt.u64.u32 %rd2235, %r7077;
- bfi.b64 %rd2236, %rd2234, %rd2235, 32, 32;
- cvt.rn.f64.s64 %fd213, %rd2236;
- mul.f64 %fd214, %fd213, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4551, %fd214;
- setp.eq.s32 %p1379, %r7073, 0;
- neg.f32 %f4552, %f4551;
- selp.f32 %f5816, %f4551, %f4552, %p1379;
-
-$L__BB0_1631:
- and.b32 %r2149, %r8653, 1;
- setp.eq.s32 %p1380, %r2149, 0;
- selp.f32 %f1833, %f5816, 0f3F800000, %p1380;
- mul.rn.f32 %f1834, %f5816, %f5816;
- mov.f32 %f5817, 0fB94D4153;
- @%p1380 bra $L__BB0_1633;
-
- mov.f32 %f4555, 0fBAB607ED;
- mov.f32 %f4556, 0f37CBAC00;
- fma.rn.f32 %f5817, %f4556, %f1834, %f4555;
-
-$L__BB0_1633:
- selp.f32 %f4557, 0f3C0885E4, 0f3D2AAABB, %p1380;
- fma.rn.f32 %f4558, %f5817, %f1834, %f4557;
- selp.f32 %f4559, 0fBE2AAAA8, 0fBEFFFFFF, %p1380;
- fma.rn.f32 %f4560, %f4558, %f1834, %f4559;
- mov.f32 %f4561, 0f00000000;
- fma.rn.f32 %f4562, %f1834, %f1833, %f4561;
- fma.rn.f32 %f5213, %f4560, %f4562, %f1833;
- and.b32 %r7079, %r8653, 2;
- setp.eq.s32 %p1382, %r7079, 0;
- @%p1382 bra $L__BB0_1635;
-
- mov.f32 %f4564, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f4564, %f4561;
-
-$L__BB0_1635:
- setp.lt.s32 %p29, %r14, %r2131;
- @%p1372 bra $L__BB0_1648;
-
- mul.f32 %f4565, %f5335, 0f3F22F983;
- cvt.rni.s32.f32 %r8657, %f4565;
- cvt.rn.f32.s32 %f4566, %r8657;
- mov.f32 %f4567, 0fBFC90FDA;
- fma.rn.f32 %f4568, %f4566, %f4567, %f5335;
- mov.f32 %f4569, 0fB3A22168;
- fma.rn.f32 %f4570, %f4566, %f4569, %f4568;
- mov.f32 %f4571, 0fA7C234C5;
- fma.rn.f32 %f5820, %f4566, %f4571, %f4570;
- abs.f32 %f1842, %f5335;
- setp.ltu.f32 %p1384, %f1842, 0f47CE4780;
- @%p1384 bra $L__BB0_1644;
-
- setp.eq.f32 %p1385, %f1842, 0f7F800000;
- @%p1385 bra $L__BB0_1643;
- bra.uni $L__BB0_1638;
-
-$L__BB0_1643:
- mov.f32 %f4574, 0f00000000;
- mul.rn.f32 %f5820, %f5335, %f4574;
- mov.u32 %r8657, 0;
- bra.uni $L__BB0_1644;
-
-$L__BB0_1638:
- mov.b32 %r2151, %f5335;
- shr.u32 %r7081, %r2151, 23;
- and.b32 %r7082, %r7081, 255;
- add.s32 %r2152, %r7082, -128;
- shl.b32 %r7083, %r2151, 8;
- or.b32 %r2153, %r7083, -2147483648;
- shr.u32 %r2154, %r2152, 5;
+ add.s32 %r8395, %r8395, 1;
+ setp.ne.s32 %p1358, %r8395, 6;
+ @%p1358 bra $L__BB0_1603;
+
+ st.local.u32 [%rd4], %rd2732;
+ mov.u32 %r6917, 4;
+ sub.s32 %r2109, %r6917, %r2106;
+ mov.u32 %r6918, 6;
+ sub.s32 %r6919, %r6918, %r2106;
+ mul.wide.s32 %rd2256, %r6919, 4;
+ add.s64 %rd2257, %rd1, %rd2256;
+ ld.local.u32 %r8396, [%rd2257];
+ ld.local.u32 %r8397, [%rd2257+-4];
+ and.b32 %r2112, %r2104, 31;
+ setp.eq.s32 %p1359, %r2112, 0;
+ @%p1359 bra $L__BB0_1606;
+
+ mov.u32 %r6920, 32;
+ sub.s32 %r6921, %r6920, %r2112;
+ shr.u32 %r6922, %r8397, %r6921;
+ shl.b32 %r6923, %r8396, %r2112;
+ add.s32 %r8396, %r6922, %r6923;
+ mul.wide.s32 %rd2258, %r2109, 4;
+ add.s64 %rd2259, %rd1, %rd2258;
+ ld.local.u32 %r6924, [%rd2259];
+ shr.u32 %r6925, %r6924, %r6921;
+ shl.b32 %r6926, %r8397, %r2112;
+ add.s32 %r8397, %r6925, %r6926;
+
+$L__BB0_1606:
+ and.b32 %r6927, %r2103, -2147483648;
+ shr.u32 %r6928, %r8397, 30;
+ shl.b32 %r6929, %r8396, 2;
+ or.b32 %r6930, %r6928, %r6929;
+ shr.u32 %r6931, %r6930, 31;
+ shr.u32 %r6932, %r8396, 30;
+ add.s32 %r6933, %r6931, %r6932;
+ neg.s32 %r6934, %r6933;
+ setp.eq.s32 %p1360, %r6927, 0;
+ selp.b32 %r8398, %r6933, %r6934, %p1360;
+ setp.ne.s32 %p1361, %r6931, 0;
+ xor.b32 %r6935, %r6927, -2147483648;
+ selp.b32 %r6936, %r6935, %r6927, %p1361;
+ selp.b32 %r6937, -1, 0, %p1361;
+ xor.b32 %r6938, %r6930, %r6937;
+ shl.b32 %r6939, %r8397, 2;
+ xor.b32 %r6940, %r6939, %r6937;
+ cvt.u64.u32 %rd2260, %r6938;
+ cvt.u64.u32 %rd2261, %r6940;
+ bfi.b64 %rd2262, %rd2260, %rd2261, 32, 32;
+ cvt.rn.f64.s64 %fd215, %rd2262;
+ mul.f64 %fd216, %fd215, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4628, %fd216;
+ setp.eq.s32 %p1362, %r6936, 0;
+ neg.f32 %f4629, %f4628;
+ selp.f32 %f5912, %f4628, %f4629, %p1362;
+
+$L__BB0_1608:
+ add.s32 %r2119, %r8398, 1;
+ and.b32 %r2120, %r2119, 1;
+ setp.eq.s32 %p1363, %r2120, 0;
+ selp.f32 %f1878, %f5912, 0f3F800000, %p1363;
+ mul.rn.f32 %f1879, %f5912, %f5912;
+ mov.f32 %f5913, 0fB94D4153;
+ @%p1363 bra $L__BB0_1610;
+
+ mov.f32 %f4632, 0fBAB607ED;
+ mov.f32 %f4633, 0f37CBAC00;
+ fma.rn.f32 %f5913, %f4633, %f1879, %f4632;
+
+$L__BB0_1610:
+ selp.f32 %f4634, 0f3C0885E4, 0f3D2AAABB, %p1363;
+ fma.rn.f32 %f4635, %f5913, %f1879, %f4634;
+ selp.f32 %f4636, 0fBE2AAAA8, 0fBEFFFFFF, %p1363;
+ fma.rn.f32 %f4637, %f4635, %f1879, %f4636;
+ mov.f32 %f4638, 0f00000000;
+ fma.rn.f32 %f4639, %f1879, %f1878, %f4638;
+ fma.rn.f32 %f5283, %f4637, %f4639, %f1878;
+ and.b32 %r6942, %r2119, 2;
+ setp.eq.s32 %p1365, %r6942, 0;
+ @%p1365 bra $L__BB0_1612;
+
+ mov.f32 %f4641, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f4641, %f4638;
+
+$L__BB0_1612:
+ selp.f32 %f1886, %f5283, %f5284, %p29;
+ selp.f32 %f1887, %f5281, %f5282, %p29;
+ @%p1344 bra $L__BB0_1614;
+
+ add.f32 %f5992, %f1887, %f1886;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1614:
+ @%p1203 bra $L__BB0_1643;
+
+ shl.b32 %r6943, %r12, 5;
+ neg.s32 %r2121, %r6943;
+ setp.ge.s32 %p1369, %r11, %r2121;
+ @%p1369 bra $L__BB0_1628;
+
+ mul.f32 %f4644, %f5410, 0f3F22F983;
+ cvt.rni.s32.f32 %r8402, %f4644;
+ cvt.rn.f32.s32 %f4645, %r8402;
+ mov.f32 %f4646, 0fBFC90FDA;
+ fma.rn.f32 %f4647, %f4645, %f4646, %f5410;
+ mov.f32 %f4648, 0fB3A22168;
+ fma.rn.f32 %f4649, %f4645, %f4648, %f4647;
+ mov.f32 %f4650, 0fA7C234C5;
+ fma.rn.f32 %f5921, %f4645, %f4650, %f4649;
+ abs.f32 %f1895, %f5410;
+ setp.ltu.f32 %p1370, %f1895, 0f47CE4780;
+ @%p1370 bra $L__BB0_1624;
+
+ setp.eq.f32 %p1371, %f1895, 0f7F800000;
+ @%p1371 bra $L__BB0_1623;
+ bra.uni $L__BB0_1618;
+
+$L__BB0_1623:
+ mov.f32 %f4653, 0f00000000;
+ mul.rn.f32 %f5921, %f5410, %f4653;
+ mov.u32 %r8402, 0;
+ bra.uni $L__BB0_1624;
+
+$L__BB0_1618:
+ mov.b32 %r2123, %f5410;
+ shr.u32 %r6945, %r2123, 23;
+ and.b32 %r6946, %r6945, 255;
+ add.s32 %r2124, %r6946, -128;
+ shl.b32 %r6947, %r2123, 8;
+ or.b32 %r2125, %r6947, -2147483648;
+ shr.u32 %r2126, %r2124, 5;
mov.u64 %rd2735, 0;
- mov.u32 %r8654, 0;
+ mov.u32 %r8399, 0;
mov.u64 %rd2733, __cudart_i2opi_f;
mov.u64 %rd2734, %rd1;
-$L__BB0_1639:
+$L__BB0_1619:
.pragma "nounroll";
- ld.global.nc.u32 %r7084, [%rd2733];
- mad.wide.u32 %rd2239, %r7084, %r2153, %rd2735;
- shr.u64 %rd2735, %rd2239, 32;
- st.local.u32 [%rd2734], %rd2239;
+ ld.global.nc.u32 %r6948, [%rd2733];
+ mad.wide.u32 %rd2265, %r6948, %r2125, %rd2735;
+ shr.u64 %rd2735, %rd2265, 32;
+ st.local.u32 [%rd2734], %rd2265;
add.s64 %rd2734, %rd2734, 4;
add.s64 %rd2733, %rd2733, 4;
- add.s32 %r8654, %r8654, 1;
- setp.ne.s32 %p1386, %r8654, 6;
- @%p1386 bra $L__BB0_1639;
-
- st.local.u32 [%rd5], %rd2735;
- mov.u32 %r7085, 4;
- sub.s32 %r2157, %r7085, %r2154;
- mov.u32 %r7086, 6;
- sub.s32 %r7087, %r7086, %r2154;
- mul.wide.s32 %rd2240, %r7087, 4;
- add.s64 %rd2241, %rd1, %rd2240;
- ld.local.u32 %r8655, [%rd2241];
- ld.local.u32 %r8656, [%rd2241+-4];
- and.b32 %r2160, %r2152, 31;
- setp.eq.s32 %p1387, %r2160, 0;
- @%p1387 bra $L__BB0_1642;
-
- mov.u32 %r7088, 32;
- sub.s32 %r7089, %r7088, %r2160;
- shr.u32 %r7090, %r8656, %r7089;
- shl.b32 %r7091, %r8655, %r2160;
- add.s32 %r8655, %r7090, %r7091;
- mul.wide.s32 %rd2242, %r2157, 4;
- add.s64 %rd2243, %rd1, %rd2242;
- ld.local.u32 %r7092, [%rd2243];
- shr.u32 %r7093, %r7092, %r7089;
- shl.b32 %r7094, %r8656, %r2160;
- add.s32 %r8656, %r7093, %r7094;
-
-$L__BB0_1642:
- and.b32 %r7095, %r2151, -2147483648;
- shr.u32 %r7096, %r8656, 30;
- shl.b32 %r7097, %r8655, 2;
- or.b32 %r7098, %r7096, %r7097;
- shr.u32 %r7099, %r7098, 31;
- shr.u32 %r7100, %r8655, 30;
- add.s32 %r7101, %r7099, %r7100;
- neg.s32 %r7102, %r7101;
- setp.eq.s32 %p1388, %r7095, 0;
- selp.b32 %r8657, %r7101, %r7102, %p1388;
- setp.ne.s32 %p1389, %r7099, 0;
- xor.b32 %r7103, %r7095, -2147483648;
- selp.b32 %r7104, %r7103, %r7095, %p1389;
- selp.b32 %r7105, -1, 0, %p1389;
- xor.b32 %r7106, %r7098, %r7105;
- shl.b32 %r7107, %r8656, 2;
- xor.b32 %r7108, %r7107, %r7105;
- cvt.u64.u32 %rd2244, %r7106;
- cvt.u64.u32 %rd2245, %r7108;
- bfi.b64 %rd2246, %rd2244, %rd2245, 32, 32;
- cvt.rn.f64.s64 %fd215, %rd2246;
- mul.f64 %fd216, %fd215, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4572, %fd216;
- setp.eq.s32 %p1390, %r7104, 0;
- neg.f32 %f4573, %f4572;
- selp.f32 %f5820, %f4572, %f4573, %p1390;
-
-$L__BB0_1644:
- add.s32 %r2167, %r8657, 1;
- and.b32 %r2168, %r2167, 1;
- setp.eq.s32 %p1391, %r2168, 0;
- selp.f32 %f1846, %f5820, 0f3F800000, %p1391;
- mul.rn.f32 %f1847, %f5820, %f5820;
- mov.f32 %f5821, 0fB94D4153;
- @%p1391 bra $L__BB0_1646;
-
- mov.f32 %f4576, 0fBAB607ED;
- mov.f32 %f4577, 0f37CBAC00;
- fma.rn.f32 %f5821, %f4577, %f1847, %f4576;
-
-$L__BB0_1646:
- selp.f32 %f4578, 0f3C0885E4, 0f3D2AAABB, %p1391;
- fma.rn.f32 %f4579, %f5821, %f1847, %f4578;
- selp.f32 %f4580, 0fBE2AAAA8, 0fBEFFFFFF, %p1391;
- fma.rn.f32 %f4581, %f4579, %f1847, %f4580;
- mov.f32 %f4582, 0f00000000;
- fma.rn.f32 %f4583, %f1847, %f1846, %f4582;
- fma.rn.f32 %f5215, %f4581, %f4583, %f1846;
- and.b32 %r7110, %r2167, 2;
- setp.eq.s32 %p1393, %r7110, 0;
- @%p1393 bra $L__BB0_1648;
-
- mov.f32 %f4585, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f4585, %f4582;
-
-$L__BB0_1648:
- selp.f32 %f1854, %f5215, %f5216, %p29;
- selp.f32 %f1855, %f5213, %f5214, %p29;
- @%p1372 bra $L__BB0_1650;
-
- add.f32 %f5900, %f1855, %f1854;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1650:
- @%p1226 bra $L__BB0_1679;
-
- shl.b32 %r7112, %r12, 5;
- neg.s32 %r2169, %r7112;
- setp.ge.s32 %p1397, %r14, %r2169;
- @%p1397 bra $L__BB0_1664;
-
- mul.f32 %f4588, %f5342, 0f3F22F983;
- cvt.rni.s32.f32 %r8661, %f4588;
- cvt.rn.f32.s32 %f4589, %r8661;
- mov.f32 %f4590, 0fBFC90FDA;
- fma.rn.f32 %f4591, %f4589, %f4590, %f5342;
- mov.f32 %f4592, 0fB3A22168;
- fma.rn.f32 %f4593, %f4589, %f4592, %f4591;
- mov.f32 %f4594, 0fA7C234C5;
- fma.rn.f32 %f5829, %f4589, %f4594, %f4593;
- abs.f32 %f1863, %f5342;
- setp.ltu.f32 %p1398, %f1863, 0f47CE4780;
- @%p1398 bra $L__BB0_1660;
-
- setp.eq.f32 %p1399, %f1863, 0f7F800000;
- @%p1399 bra $L__BB0_1659;
- bra.uni $L__BB0_1654;
-
-$L__BB0_1659:
- mov.f32 %f4597, 0f00000000;
- mul.rn.f32 %f5829, %f5342, %f4597;
- mov.u32 %r8661, 0;
- bra.uni $L__BB0_1660;
-
-$L__BB0_1654:
- mov.b32 %r2171, %f5342;
- shr.u32 %r7114, %r2171, 23;
- and.b32 %r7115, %r7114, 255;
- add.s32 %r2172, %r7115, -128;
- shl.b32 %r7116, %r2171, 8;
- or.b32 %r2173, %r7116, -2147483648;
- shr.u32 %r2174, %r2172, 5;
+ add.s32 %r8399, %r8399, 1;
+ setp.ne.s32 %p1372, %r8399, 6;
+ @%p1372 bra $L__BB0_1619;
+
+ st.local.u32 [%rd4], %rd2735;
+ mov.u32 %r6949, 4;
+ sub.s32 %r2129, %r6949, %r2126;
+ mov.u32 %r6950, 6;
+ sub.s32 %r6951, %r6950, %r2126;
+ mul.wide.s32 %rd2266, %r6951, 4;
+ add.s64 %rd2267, %rd1, %rd2266;
+ ld.local.u32 %r8400, [%rd2267];
+ ld.local.u32 %r8401, [%rd2267+-4];
+ and.b32 %r2132, %r2124, 31;
+ setp.eq.s32 %p1373, %r2132, 0;
+ @%p1373 bra $L__BB0_1622;
+
+ mov.u32 %r6952, 32;
+ sub.s32 %r6953, %r6952, %r2132;
+ shr.u32 %r6954, %r8401, %r6953;
+ shl.b32 %r6955, %r8400, %r2132;
+ add.s32 %r8400, %r6954, %r6955;
+ mul.wide.s32 %rd2268, %r2129, 4;
+ add.s64 %rd2269, %rd1, %rd2268;
+ ld.local.u32 %r6956, [%rd2269];
+ shr.u32 %r6957, %r6956, %r6953;
+ shl.b32 %r6958, %r8401, %r2132;
+ add.s32 %r8401, %r6957, %r6958;
+
+$L__BB0_1622:
+ and.b32 %r6959, %r2123, -2147483648;
+ shr.u32 %r6960, %r8401, 30;
+ shl.b32 %r6961, %r8400, 2;
+ or.b32 %r6962, %r6960, %r6961;
+ shr.u32 %r6963, %r6962, 31;
+ shr.u32 %r6964, %r8400, 30;
+ add.s32 %r6965, %r6963, %r6964;
+ neg.s32 %r6966, %r6965;
+ setp.eq.s32 %p1374, %r6959, 0;
+ selp.b32 %r8402, %r6965, %r6966, %p1374;
+ setp.ne.s32 %p1375, %r6963, 0;
+ xor.b32 %r6967, %r6959, -2147483648;
+ selp.b32 %r6968, %r6967, %r6959, %p1375;
+ selp.b32 %r6969, -1, 0, %p1375;
+ xor.b32 %r6970, %r6962, %r6969;
+ shl.b32 %r6971, %r8401, 2;
+ xor.b32 %r6972, %r6971, %r6969;
+ cvt.u64.u32 %rd2270, %r6970;
+ cvt.u64.u32 %rd2271, %r6972;
+ bfi.b64 %rd2272, %rd2270, %rd2271, 32, 32;
+ cvt.rn.f64.s64 %fd217, %rd2272;
+ mul.f64 %fd218, %fd217, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4651, %fd218;
+ setp.eq.s32 %p1376, %r6968, 0;
+ neg.f32 %f4652, %f4651;
+ selp.f32 %f5921, %f4651, %f4652, %p1376;
+
+$L__BB0_1624:
+ and.b32 %r2139, %r8402, 1;
+ setp.eq.s32 %p1377, %r2139, 0;
+ selp.f32 %f1899, %f5921, 0f3F800000, %p1377;
+ mul.rn.f32 %f1900, %f5921, %f5921;
+ mov.f32 %f5922, 0fB94D4153;
+ @%p1377 bra $L__BB0_1626;
+
+ mov.f32 %f4655, 0fBAB607ED;
+ mov.f32 %f4656, 0f37CBAC00;
+ fma.rn.f32 %f5922, %f4656, %f1900, %f4655;
+
+$L__BB0_1626:
+ selp.f32 %f4657, 0f3C0885E4, 0f3D2AAABB, %p1377;
+ fma.rn.f32 %f4658, %f5922, %f1900, %f4657;
+ selp.f32 %f4659, 0fBE2AAAA8, 0fBEFFFFFF, %p1377;
+ fma.rn.f32 %f4660, %f4658, %f1900, %f4659;
+ mov.f32 %f4661, 0f00000000;
+ fma.rn.f32 %f4662, %f1900, %f1899, %f4661;
+ fma.rn.f32 %f5281, %f4660, %f4662, %f1899;
+ and.b32 %r6974, %r8402, 2;
+ setp.eq.s32 %p1379, %r6974, 0;
+ @%p1379 bra $L__BB0_1628;
+
+ mov.f32 %f4664, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f4664, %f4661;
+
+$L__BB0_1628:
+ setp.lt.s32 %p30, %r11, %r2121;
+ @%p1369 bra $L__BB0_1641;
+
+ mul.f32 %f4665, %f5402, 0f3F22F983;
+ cvt.rni.s32.f32 %r8406, %f4665;
+ cvt.rn.f32.s32 %f4666, %r8406;
+ mov.f32 %f4667, 0fBFC90FDA;
+ fma.rn.f32 %f4668, %f4666, %f4667, %f5402;
+ mov.f32 %f4669, 0fB3A22168;
+ fma.rn.f32 %f4670, %f4666, %f4669, %f4668;
+ mov.f32 %f4671, 0fA7C234C5;
+ fma.rn.f32 %f5925, %f4666, %f4671, %f4670;
+ abs.f32 %f1908, %f5402;
+ setp.ltu.f32 %p1381, %f1908, 0f47CE4780;
+ @%p1381 bra $L__BB0_1637;
+
+ setp.eq.f32 %p1382, %f1908, 0f7F800000;
+ @%p1382 bra $L__BB0_1636;
+ bra.uni $L__BB0_1631;
+
+$L__BB0_1636:
+ mov.f32 %f4674, 0f00000000;
+ mul.rn.f32 %f5925, %f5402, %f4674;
+ mov.u32 %r8406, 0;
+ bra.uni $L__BB0_1637;
+
+$L__BB0_1631:
+ mov.b32 %r2141, %f5402;
+ shr.u32 %r6976, %r2141, 23;
+ and.b32 %r6977, %r6976, 255;
+ add.s32 %r2142, %r6977, -128;
+ shl.b32 %r6978, %r2141, 8;
+ or.b32 %r2143, %r6978, -2147483648;
+ shr.u32 %r2144, %r2142, 5;
mov.u64 %rd2738, 0;
- mov.u32 %r8658, 0;
+ mov.u32 %r8403, 0;
mov.u64 %rd2736, __cudart_i2opi_f;
mov.u64 %rd2737, %rd1;
-$L__BB0_1655:
+$L__BB0_1632:
.pragma "nounroll";
- ld.global.nc.u32 %r7117, [%rd2736];
- mad.wide.u32 %rd2249, %r7117, %r2173, %rd2738;
- shr.u64 %rd2738, %rd2249, 32;
- st.local.u32 [%rd2737], %rd2249;
+ ld.global.nc.u32 %r6979, [%rd2736];
+ mad.wide.u32 %rd2275, %r6979, %r2143, %rd2738;
+ shr.u64 %rd2738, %rd2275, 32;
+ st.local.u32 [%rd2737], %rd2275;
add.s64 %rd2737, %rd2737, 4;
add.s64 %rd2736, %rd2736, 4;
- add.s32 %r8658, %r8658, 1;
- setp.ne.s32 %p1400, %r8658, 6;
- @%p1400 bra $L__BB0_1655;
-
- st.local.u32 [%rd5], %rd2738;
- mov.u32 %r7118, 4;
- sub.s32 %r2177, %r7118, %r2174;
- mov.u32 %r7119, 6;
- sub.s32 %r7120, %r7119, %r2174;
- mul.wide.s32 %rd2250, %r7120, 4;
- add.s64 %rd2251, %rd1, %rd2250;
- ld.local.u32 %r8659, [%rd2251];
- ld.local.u32 %r8660, [%rd2251+-4];
- and.b32 %r2180, %r2172, 31;
- setp.eq.s32 %p1401, %r2180, 0;
- @%p1401 bra $L__BB0_1658;
-
- mov.u32 %r7121, 32;
- sub.s32 %r7122, %r7121, %r2180;
- shr.u32 %r7123, %r8660, %r7122;
- shl.b32 %r7124, %r8659, %r2180;
- add.s32 %r8659, %r7123, %r7124;
- mul.wide.s32 %rd2252, %r2177, 4;
- add.s64 %rd2253, %rd1, %rd2252;
- ld.local.u32 %r7125, [%rd2253];
- shr.u32 %r7126, %r7125, %r7122;
- shl.b32 %r7127, %r8660, %r2180;
- add.s32 %r8660, %r7126, %r7127;
-
-$L__BB0_1658:
- and.b32 %r7128, %r2171, -2147483648;
- shr.u32 %r7129, %r8660, 30;
- shl.b32 %r7130, %r8659, 2;
- or.b32 %r7131, %r7129, %r7130;
- shr.u32 %r7132, %r7131, 31;
- shr.u32 %r7133, %r8659, 30;
- add.s32 %r7134, %r7132, %r7133;
- neg.s32 %r7135, %r7134;
- setp.eq.s32 %p1402, %r7128, 0;
- selp.b32 %r8661, %r7134, %r7135, %p1402;
- setp.ne.s32 %p1403, %r7132, 0;
- xor.b32 %r7136, %r7128, -2147483648;
- selp.b32 %r7137, %r7136, %r7128, %p1403;
- selp.b32 %r7138, -1, 0, %p1403;
- xor.b32 %r7139, %r7131, %r7138;
- shl.b32 %r7140, %r8660, 2;
- xor.b32 %r7141, %r7140, %r7138;
- cvt.u64.u32 %rd2254, %r7139;
- cvt.u64.u32 %rd2255, %r7141;
- bfi.b64 %rd2256, %rd2254, %rd2255, 32, 32;
- cvt.rn.f64.s64 %fd217, %rd2256;
- mul.f64 %fd218, %fd217, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4595, %fd218;
- setp.eq.s32 %p1404, %r7137, 0;
- neg.f32 %f4596, %f4595;
- selp.f32 %f5829, %f4595, %f4596, %p1404;
-
-$L__BB0_1660:
- and.b32 %r2187, %r8661, 1;
- setp.eq.s32 %p1405, %r2187, 0;
- selp.f32 %f1867, %f5829, 0f3F800000, %p1405;
- mul.rn.f32 %f1868, %f5829, %f5829;
- mov.f32 %f5830, 0fB94D4153;
- @%p1405 bra $L__BB0_1662;
-
- mov.f32 %f4599, 0fBAB607ED;
- mov.f32 %f4600, 0f37CBAC00;
- fma.rn.f32 %f5830, %f4600, %f1868, %f4599;
-
-$L__BB0_1662:
- selp.f32 %f4601, 0f3C0885E4, 0f3D2AAABB, %p1405;
- fma.rn.f32 %f4602, %f5830, %f1868, %f4601;
- selp.f32 %f4603, 0fBE2AAAA8, 0fBEFFFFFF, %p1405;
- fma.rn.f32 %f4604, %f4602, %f1868, %f4603;
- mov.f32 %f4605, 0f00000000;
- fma.rn.f32 %f4606, %f1868, %f1867, %f4605;
- fma.rn.f32 %f5213, %f4604, %f4606, %f1867;
- and.b32 %r7143, %r8661, 2;
- setp.eq.s32 %p1407, %r7143, 0;
- @%p1407 bra $L__BB0_1664;
-
- mov.f32 %f4608, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f4608, %f4605;
-
-$L__BB0_1664:
- setp.lt.s32 %p30, %r14, %r2169;
- @%p1397 bra $L__BB0_1677;
-
- mul.f32 %f4609, %f5334, 0f3F22F983;
- cvt.rni.s32.f32 %r8665, %f4609;
- cvt.rn.f32.s32 %f4610, %r8665;
- mov.f32 %f4611, 0fBFC90FDA;
- fma.rn.f32 %f4612, %f4610, %f4611, %f5334;
- mov.f32 %f4613, 0fB3A22168;
- fma.rn.f32 %f4614, %f4610, %f4613, %f4612;
- mov.f32 %f4615, 0fA7C234C5;
- fma.rn.f32 %f5833, %f4610, %f4615, %f4614;
- abs.f32 %f1876, %f5334;
- setp.ltu.f32 %p1409, %f1876, 0f47CE4780;
- @%p1409 bra $L__BB0_1673;
-
- setp.eq.f32 %p1410, %f1876, 0f7F800000;
- @%p1410 bra $L__BB0_1672;
- bra.uni $L__BB0_1667;
-
-$L__BB0_1672:
- mov.f32 %f4618, 0f00000000;
- mul.rn.f32 %f5833, %f5334, %f4618;
- mov.u32 %r8665, 0;
- bra.uni $L__BB0_1673;
-
-$L__BB0_1667:
- mov.b32 %r2189, %f5334;
- shr.u32 %r7145, %r2189, 23;
- and.b32 %r7146, %r7145, 255;
- add.s32 %r2190, %r7146, -128;
- shl.b32 %r7147, %r2189, 8;
- or.b32 %r2191, %r7147, -2147483648;
- shr.u32 %r2192, %r2190, 5;
+ add.s32 %r8403, %r8403, 1;
+ setp.ne.s32 %p1383, %r8403, 6;
+ @%p1383 bra $L__BB0_1632;
+
+ st.local.u32 [%rd4], %rd2738;
+ mov.u32 %r6980, 4;
+ sub.s32 %r2147, %r6980, %r2144;
+ mov.u32 %r6981, 6;
+ sub.s32 %r6982, %r6981, %r2144;
+ mul.wide.s32 %rd2276, %r6982, 4;
+ add.s64 %rd2277, %rd1, %rd2276;
+ ld.local.u32 %r8404, [%rd2277];
+ ld.local.u32 %r8405, [%rd2277+-4];
+ and.b32 %r2150, %r2142, 31;
+ setp.eq.s32 %p1384, %r2150, 0;
+ @%p1384 bra $L__BB0_1635;
+
+ mov.u32 %r6983, 32;
+ sub.s32 %r6984, %r6983, %r2150;
+ shr.u32 %r6985, %r8405, %r6984;
+ shl.b32 %r6986, %r8404, %r2150;
+ add.s32 %r8404, %r6985, %r6986;
+ mul.wide.s32 %rd2278, %r2147, 4;
+ add.s64 %rd2279, %rd1, %rd2278;
+ ld.local.u32 %r6987, [%rd2279];
+ shr.u32 %r6988, %r6987, %r6984;
+ shl.b32 %r6989, %r8405, %r2150;
+ add.s32 %r8405, %r6988, %r6989;
+
+$L__BB0_1635:
+ and.b32 %r6990, %r2141, -2147483648;
+ shr.u32 %r6991, %r8405, 30;
+ shl.b32 %r6992, %r8404, 2;
+ or.b32 %r6993, %r6991, %r6992;
+ shr.u32 %r6994, %r6993, 31;
+ shr.u32 %r6995, %r8404, 30;
+ add.s32 %r6996, %r6994, %r6995;
+ neg.s32 %r6997, %r6996;
+ setp.eq.s32 %p1385, %r6990, 0;
+ selp.b32 %r8406, %r6996, %r6997, %p1385;
+ setp.ne.s32 %p1386, %r6994, 0;
+ xor.b32 %r6998, %r6990, -2147483648;
+ selp.b32 %r6999, %r6998, %r6990, %p1386;
+ selp.b32 %r7000, -1, 0, %p1386;
+ xor.b32 %r7001, %r6993, %r7000;
+ shl.b32 %r7002, %r8405, 2;
+ xor.b32 %r7003, %r7002, %r7000;
+ cvt.u64.u32 %rd2280, %r7001;
+ cvt.u64.u32 %rd2281, %r7003;
+ bfi.b64 %rd2282, %rd2280, %rd2281, 32, 32;
+ cvt.rn.f64.s64 %fd219, %rd2282;
+ mul.f64 %fd220, %fd219, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4672, %fd220;
+ setp.eq.s32 %p1387, %r6999, 0;
+ neg.f32 %f4673, %f4672;
+ selp.f32 %f5925, %f4672, %f4673, %p1387;
+
+$L__BB0_1637:
+ add.s32 %r2157, %r8406, 1;
+ and.b32 %r2158, %r2157, 1;
+ setp.eq.s32 %p1388, %r2158, 0;
+ selp.f32 %f1912, %f5925, 0f3F800000, %p1388;
+ mul.rn.f32 %f1913, %f5925, %f5925;
+ mov.f32 %f5926, 0fB94D4153;
+ @%p1388 bra $L__BB0_1639;
+
+ mov.f32 %f4676, 0fBAB607ED;
+ mov.f32 %f4677, 0f37CBAC00;
+ fma.rn.f32 %f5926, %f4677, %f1913, %f4676;
+
+$L__BB0_1639:
+ selp.f32 %f4678, 0f3C0885E4, 0f3D2AAABB, %p1388;
+ fma.rn.f32 %f4679, %f5926, %f1913, %f4678;
+ selp.f32 %f4680, 0fBE2AAAA8, 0fBEFFFFFF, %p1388;
+ fma.rn.f32 %f4681, %f4679, %f1913, %f4680;
+ mov.f32 %f4682, 0f00000000;
+ fma.rn.f32 %f4683, %f1913, %f1912, %f4682;
+ fma.rn.f32 %f5283, %f4681, %f4683, %f1912;
+ and.b32 %r7005, %r2157, 2;
+ setp.eq.s32 %p1390, %r7005, 0;
+ @%p1390 bra $L__BB0_1641;
+
+ mov.f32 %f4685, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f4685, %f4682;
+
+$L__BB0_1641:
+ selp.f32 %f1920, %f5283, %f5284, %p30;
+ selp.f32 %f1921, %f5281, %f5282, %p30;
+ @%p1369 bra $L__BB0_1643;
+
+ add.f32 %f5991, %f1921, %f1920;
+ mov.f32 %f5282, %f5281;
+ mov.f32 %f5284, %f5283;
+
+$L__BB0_1643:
+ @%p1203 bra $L__BB0_1865;
+
+ shl.b32 %r7006, %r12, 5;
+ mov.u32 %r7007, -32;
+ sub.s32 %r2159, %r7007, %r7006;
+ setp.ge.s32 %p1394, %r11, %r2159;
+ @%p1394 bra $L__BB0_1657;
+
+ mul.f32 %f4688, %f5409, 0f3F22F983;
+ cvt.rni.s32.f32 %r8410, %f4688;
+ cvt.rn.f32.s32 %f4689, %r8410;
+ mov.f32 %f4690, 0fBFC90FDA;
+ fma.rn.f32 %f4691, %f4689, %f4690, %f5409;
+ mov.f32 %f4692, 0fB3A22168;
+ fma.rn.f32 %f4693, %f4689, %f4692, %f4691;
+ mov.f32 %f4694, 0fA7C234C5;
+ fma.rn.f32 %f5934, %f4689, %f4694, %f4693;
+ abs.f32 %f1929, %f5409;
+ setp.ltu.f32 %p1395, %f1929, 0f47CE4780;
+ @%p1395 bra $L__BB0_1653;
+
+ setp.eq.f32 %p1396, %f1929, 0f7F800000;
+ @%p1396 bra $L__BB0_1652;
+ bra.uni $L__BB0_1647;
+
+$L__BB0_1652:
+ mov.f32 %f4697, 0f00000000;
+ mul.rn.f32 %f5934, %f5409, %f4697;
+ mov.u32 %r8410, 0;
+ bra.uni $L__BB0_1653;
+
+$L__BB0_1647:
+ mov.b32 %r2161, %f5409;
+ shr.u32 %r7009, %r2161, 23;
+ and.b32 %r7010, %r7009, 255;
+ add.s32 %r2162, %r7010, -128;
+ shl.b32 %r7011, %r2161, 8;
+ or.b32 %r2163, %r7011, -2147483648;
+ shr.u32 %r2164, %r2162, 5;
mov.u64 %rd2741, 0;
- mov.u32 %r8662, 0;
+ mov.u32 %r8407, 0;
mov.u64 %rd2739, __cudart_i2opi_f;
mov.u64 %rd2740, %rd1;
-$L__BB0_1668:
+$L__BB0_1648:
.pragma "nounroll";
- ld.global.nc.u32 %r7148, [%rd2739];
- mad.wide.u32 %rd2259, %r7148, %r2191, %rd2741;
- shr.u64 %rd2741, %rd2259, 32;
- st.local.u32 [%rd2740], %rd2259;
+ ld.global.nc.u32 %r7012, [%rd2739];
+ mad.wide.u32 %rd2285, %r7012, %r2163, %rd2741;
+ shr.u64 %rd2741, %rd2285, 32;
+ st.local.u32 [%rd2740], %rd2285;
add.s64 %rd2740, %rd2740, 4;
add.s64 %rd2739, %rd2739, 4;
- add.s32 %r8662, %r8662, 1;
- setp.ne.s32 %p1411, %r8662, 6;
- @%p1411 bra $L__BB0_1668;
-
- st.local.u32 [%rd5], %rd2741;
- mov.u32 %r7149, 4;
- sub.s32 %r2195, %r7149, %r2192;
- mov.u32 %r7150, 6;
- sub.s32 %r7151, %r7150, %r2192;
- mul.wide.s32 %rd2260, %r7151, 4;
- add.s64 %rd2261, %rd1, %rd2260;
- ld.local.u32 %r8663, [%rd2261];
- ld.local.u32 %r8664, [%rd2261+-4];
- and.b32 %r2198, %r2190, 31;
- setp.eq.s32 %p1412, %r2198, 0;
- @%p1412 bra $L__BB0_1671;
-
- mov.u32 %r7152, 32;
- sub.s32 %r7153, %r7152, %r2198;
- shr.u32 %r7154, %r8664, %r7153;
- shl.b32 %r7155, %r8663, %r2198;
- add.s32 %r8663, %r7154, %r7155;
- mul.wide.s32 %rd2262, %r2195, 4;
- add.s64 %rd2263, %rd1, %rd2262;
- ld.local.u32 %r7156, [%rd2263];
- shr.u32 %r7157, %r7156, %r7153;
- shl.b32 %r7158, %r8664, %r2198;
- add.s32 %r8664, %r7157, %r7158;
-
-$L__BB0_1671:
- and.b32 %r7159, %r2189, -2147483648;
- shr.u32 %r7160, %r8664, 30;
- shl.b32 %r7161, %r8663, 2;
- or.b32 %r7162, %r7160, %r7161;
- shr.u32 %r7163, %r7162, 31;
- shr.u32 %r7164, %r8663, 30;
- add.s32 %r7165, %r7163, %r7164;
- neg.s32 %r7166, %r7165;
- setp.eq.s32 %p1413, %r7159, 0;
- selp.b32 %r8665, %r7165, %r7166, %p1413;
- setp.ne.s32 %p1414, %r7163, 0;
- xor.b32 %r7167, %r7159, -2147483648;
- selp.b32 %r7168, %r7167, %r7159, %p1414;
- selp.b32 %r7169, -1, 0, %p1414;
- xor.b32 %r7170, %r7162, %r7169;
- shl.b32 %r7171, %r8664, 2;
- xor.b32 %r7172, %r7171, %r7169;
- cvt.u64.u32 %rd2264, %r7170;
- cvt.u64.u32 %rd2265, %r7172;
- bfi.b64 %rd2266, %rd2264, %rd2265, 32, 32;
- cvt.rn.f64.s64 %fd219, %rd2266;
- mul.f64 %fd220, %fd219, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4616, %fd220;
- setp.eq.s32 %p1415, %r7168, 0;
- neg.f32 %f4617, %f4616;
- selp.f32 %f5833, %f4616, %f4617, %p1415;
-
-$L__BB0_1673:
- add.s32 %r2205, %r8665, 1;
- and.b32 %r2206, %r2205, 1;
- setp.eq.s32 %p1416, %r2206, 0;
- selp.f32 %f1880, %f5833, 0f3F800000, %p1416;
- mul.rn.f32 %f1881, %f5833, %f5833;
- mov.f32 %f5834, 0fB94D4153;
- @%p1416 bra $L__BB0_1675;
-
- mov.f32 %f4620, 0fBAB607ED;
- mov.f32 %f4621, 0f37CBAC00;
- fma.rn.f32 %f5834, %f4621, %f1881, %f4620;
-
-$L__BB0_1675:
- selp.f32 %f4622, 0f3C0885E4, 0f3D2AAABB, %p1416;
- fma.rn.f32 %f4623, %f5834, %f1881, %f4622;
- selp.f32 %f4624, 0fBE2AAAA8, 0fBEFFFFFF, %p1416;
- fma.rn.f32 %f4625, %f4623, %f1881, %f4624;
- mov.f32 %f4626, 0f00000000;
- fma.rn.f32 %f4627, %f1881, %f1880, %f4626;
- fma.rn.f32 %f5215, %f4625, %f4627, %f1880;
- and.b32 %r7174, %r2205, 2;
- setp.eq.s32 %p1418, %r7174, 0;
- @%p1418 bra $L__BB0_1677;
-
- mov.f32 %f4629, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f4629, %f4626;
-
-$L__BB0_1677:
- selp.f32 %f1888, %f5215, %f5216, %p30;
- selp.f32 %f1889, %f5213, %f5214, %p30;
- @%p1397 bra $L__BB0_1679;
-
- add.f32 %f5899, %f1889, %f1888;
- mov.f32 %f5214, %f5213;
- mov.f32 %f5216, %f5215;
-
-$L__BB0_1679:
- @%p1226 bra $L__BB0_1901;
-
- shl.b32 %r7176, %r12, 5;
- mov.u32 %r7177, -32;
- sub.s32 %r2207, %r7177, %r7176;
- setp.ge.s32 %p1422, %r14, %r2207;
- @%p1422 bra $L__BB0_1693;
-
- mul.f32 %f4632, %f5341, 0f3F22F983;
- cvt.rni.s32.f32 %r8669, %f4632;
- cvt.rn.f32.s32 %f4633, %r8669;
- mov.f32 %f4634, 0fBFC90FDA;
- fma.rn.f32 %f4635, %f4633, %f4634, %f5341;
- mov.f32 %f4636, 0fB3A22168;
- fma.rn.f32 %f4637, %f4633, %f4636, %f4635;
- mov.f32 %f4638, 0fA7C234C5;
- fma.rn.f32 %f5842, %f4633, %f4638, %f4637;
- abs.f32 %f1897, %f5341;
- setp.ltu.f32 %p1423, %f1897, 0f47CE4780;
- @%p1423 bra $L__BB0_1689;
-
- setp.eq.f32 %p1424, %f1897, 0f7F800000;
- @%p1424 bra $L__BB0_1688;
- bra.uni $L__BB0_1683;
-
-$L__BB0_1688:
- mov.f32 %f4641, 0f00000000;
- mul.rn.f32 %f5842, %f5341, %f4641;
- mov.u32 %r8669, 0;
- bra.uni $L__BB0_1689;
-
-$L__BB0_1683:
- mov.b32 %r2209, %f5341;
- shr.u32 %r7179, %r2209, 23;
- and.b32 %r7180, %r7179, 255;
- add.s32 %r2210, %r7180, -128;
- shl.b32 %r7181, %r2209, 8;
- or.b32 %r2211, %r7181, -2147483648;
- shr.u32 %r2212, %r2210, 5;
+ add.s32 %r8407, %r8407, 1;
+ setp.ne.s32 %p1397, %r8407, 6;
+ @%p1397 bra $L__BB0_1648;
+
+ st.local.u32 [%rd4], %rd2741;
+ mov.u32 %r7013, 4;
+ sub.s32 %r2167, %r7013, %r2164;
+ mov.u32 %r7014, 6;
+ sub.s32 %r7015, %r7014, %r2164;
+ mul.wide.s32 %rd2286, %r7015, 4;
+ add.s64 %rd2287, %rd1, %rd2286;
+ ld.local.u32 %r8408, [%rd2287];
+ ld.local.u32 %r8409, [%rd2287+-4];
+ and.b32 %r2170, %r2162, 31;
+ setp.eq.s32 %p1398, %r2170, 0;
+ @%p1398 bra $L__BB0_1651;
+
+ mov.u32 %r7016, 32;
+ sub.s32 %r7017, %r7016, %r2170;
+ shr.u32 %r7018, %r8409, %r7017;
+ shl.b32 %r7019, %r8408, %r2170;
+ add.s32 %r8408, %r7018, %r7019;
+ mul.wide.s32 %rd2288, %r2167, 4;
+ add.s64 %rd2289, %rd1, %rd2288;
+ ld.local.u32 %r7020, [%rd2289];
+ shr.u32 %r7021, %r7020, %r7017;
+ shl.b32 %r7022, %r8409, %r2170;
+ add.s32 %r8409, %r7021, %r7022;
+
+$L__BB0_1651:
+ and.b32 %r7023, %r2161, -2147483648;
+ shr.u32 %r7024, %r8409, 30;
+ shl.b32 %r7025, %r8408, 2;
+ or.b32 %r7026, %r7024, %r7025;
+ shr.u32 %r7027, %r7026, 31;
+ shr.u32 %r7028, %r8408, 30;
+ add.s32 %r7029, %r7027, %r7028;
+ neg.s32 %r7030, %r7029;
+ setp.eq.s32 %p1399, %r7023, 0;
+ selp.b32 %r8410, %r7029, %r7030, %p1399;
+ setp.ne.s32 %p1400, %r7027, 0;
+ xor.b32 %r7031, %r7023, -2147483648;
+ selp.b32 %r7032, %r7031, %r7023, %p1400;
+ selp.b32 %r7033, -1, 0, %p1400;
+ xor.b32 %r7034, %r7026, %r7033;
+ shl.b32 %r7035, %r8409, 2;
+ xor.b32 %r7036, %r7035, %r7033;
+ cvt.u64.u32 %rd2290, %r7034;
+ cvt.u64.u32 %rd2291, %r7036;
+ bfi.b64 %rd2292, %rd2290, %rd2291, 32, 32;
+ cvt.rn.f64.s64 %fd221, %rd2292;
+ mul.f64 %fd222, %fd221, 0d3BF921FB54442D19;
+ cvt.rn.f32.f64 %f4695, %fd222;
+ setp.eq.s32 %p1401, %r7032, 0;
+ neg.f32 %f4696, %f4695;
+ selp.f32 %f5934, %f4695, %f4696, %p1401;
+
+$L__BB0_1653:
+ and.b32 %r2177, %r8410, 1;
+ setp.eq.s32 %p1402, %r2177, 0;
+ selp.f32 %f1933, %f5934, 0f3F800000, %p1402;
+ mul.rn.f32 %f1934, %f5934, %f5934;
+ mov.f32 %f5935, 0fB94D4153;
+ @%p1402 bra $L__BB0_1655;
+
+ mov.f32 %f4699, 0fBAB607ED;
+ mov.f32 %f4700, 0f37CBAC00;
+ fma.rn.f32 %f5935, %f4700, %f1934, %f4699;
+
+$L__BB0_1655:
+ selp.f32 %f4701, 0f3C0885E4, 0f3D2AAABB, %p1402;
+ fma.rn.f32 %f4702, %f5935, %f1934, %f4701;
+ selp.f32 %f4703, 0fBE2AAAA8, 0fBEFFFFFF, %p1402;
+ fma.rn.f32 %f4704, %f4702, %f1934, %f4703;
+ mov.f32 %f4705, 0f00000000;
+ fma.rn.f32 %f4706, %f1934, %f1933, %f4705;
+ fma.rn.f32 %f5281, %f4704, %f4706, %f1933;
+ and.b32 %r7038, %r8410, 2;
+ setp.eq.s32 %p1404, %r7038, 0;
+ @%p1404 bra $L__BB0_1657;
+
+ mov.f32 %f4708, 0fBF800000;
+ fma.rn.f32 %f5281, %f5281, %f4708, %f4705;
+
+$L__BB0_1657:
+ setp.lt.s32 %p1406, %r11, %r2159;
+ selp.f32 %f1941, %f5281, %f5282, %p1406;
+ @%p1394 bra $L__BB0_1670;
+
+ mul.f32 %f4709, %f5401, 0f3F22F983;
+ cvt.rni.s32.f32 %r8414, %f4709;
+ cvt.rn.f32.s32 %f4710, %r8414;
+ mov.f32 %f4711, 0fBFC90FDA;
+ fma.rn.f32 %f4712, %f4710, %f4711, %f5401;
+ mov.f32 %f4713, 0fB3A22168;
+ fma.rn.f32 %f4714, %f4710, %f4713, %f4712;
+ mov.f32 %f4715, 0fA7C234C5;
+ fma.rn.f32 %f5938, %f4710, %f4715, %f4714;
+ abs.f32 %f1943, %f5401;
+ setp.ltu.f32 %p1407, %f1943, 0f47CE4780;
+ @%p1407 bra $L__BB0_1666;
+
+ setp.eq.f32 %p1408, %f1943, 0f7F800000;
+ @%p1408 bra $L__BB0_1665;
+ bra.uni $L__BB0_1660;
+
+$L__BB0_1665:
+ mov.f32 %f4718, 0f00000000;
+ mul.rn.f32 %f5938, %f5401, %f4718;
+ mov.u32 %r8414, 0;
+ bra.uni $L__BB0_1666;
+
+$L__BB0_1660:
+ mov.b32 %r2179, %f5401;
+ shr.u32 %r7040, %r2179, 23;
+ and.b32 %r7041, %r7040, 255;
+ add.s32 %r2180, %r7041, -128;
+ shl.b32 %r7042, %r2179, 8;
+ or.b32 %r2181, %r7042, -2147483648;
+ shr.u32 %r2182, %r2180, 5;
mov.u64 %rd2744, 0;
- mov.u32 %r8666, 0;
+ mov.u32 %r8411, 0;
mov.u64 %rd2742, __cudart_i2opi_f;
mov.u64 %rd2743, %rd1;
-$L__BB0_1684:
+$L__BB0_1661:
.pragma "nounroll";
- ld.global.nc.u32 %r7182, [%rd2742];
- mad.wide.u32 %rd2269, %r7182, %r2211, %rd2744;
- shr.u64 %rd2744, %rd2269, 32;
- st.local.u32 [%rd2743], %rd2269;
+ ld.global.nc.u32 %r7043, [%rd2742];
+ mad.wide.u32 %rd2295, %r7043, %r2181, %rd2744;
+ shr.u64 %rd2744, %rd2295, 32;
+ st.local.u32 [%rd2743], %rd2295;
add.s64 %rd2743, %rd2743, 4;
add.s64 %rd2742, %rd2742, 4;
- add.s32 %r8666, %r8666, 1;
- setp.ne.s32 %p1425, %r8666, 6;
- @%p1425 bra $L__BB0_1684;
-
- st.local.u32 [%rd5], %rd2744;
- mov.u32 %r7183, 4;
- sub.s32 %r2215, %r7183, %r2212;
- mov.u32 %r7184, 6;
- sub.s32 %r7185, %r7184, %r2212;
- mul.wide.s32 %rd2270, %r7185, 4;
- add.s64 %rd2271, %rd1, %rd2270;
- ld.local.u32 %r8667, [%rd2271];
- ld.local.u32 %r8668, [%rd2271+-4];
- and.b32 %r2218, %r2210, 31;
- setp.eq.s32 %p1426, %r2218, 0;
- @%p1426 bra $L__BB0_1687;
-
- mov.u32 %r7186, 32;
- sub.s32 %r7187, %r7186, %r2218;
- shr.u32 %r7188, %r8668, %r7187;
- shl.b32 %r7189, %r8667, %r2218;
- add.s32 %r8667, %r7188, %r7189;
- mul.wide.s32 %rd2272, %r2215, 4;
- add.s64 %rd2273, %rd1, %rd2272;
- ld.local.u32 %r7190, [%rd2273];
- shr.u32 %r7191, %r7190, %r7187;
- shl.b32 %r7192, %r8668, %r2218;
- add.s32 %r8668, %r7191, %r7192;
-
-$L__BB0_1687:
- and.b32 %r7193, %r2209, -2147483648;
- shr.u32 %r7194, %r8668, 30;
- shl.b32 %r7195, %r8667, 2;
- or.b32 %r7196, %r7194, %r7195;
- shr.u32 %r7197, %r7196, 31;
- shr.u32 %r7198, %r8667, 30;
- add.s32 %r7199, %r7197, %r7198;
- neg.s32 %r7200, %r7199;
- setp.eq.s32 %p1427, %r7193, 0;
- selp.b32 %r8669, %r7199, %r7200, %p1427;
- setp.ne.s32 %p1428, %r7197, 0;
- xor.b32 %r7201, %r7193, -2147483648;
- selp.b32 %r7202, %r7201, %r7193, %p1428;
- selp.b32 %r7203, -1, 0, %p1428;
- xor.b32 %r7204, %r7196, %r7203;
- shl.b32 %r7205, %r8668, 2;
- xor.b32 %r7206, %r7205, %r7203;
- cvt.u64.u32 %rd2274, %r7204;
- cvt.u64.u32 %rd2275, %r7206;
- bfi.b64 %rd2276, %rd2274, %rd2275, 32, 32;
- cvt.rn.f64.s64 %fd221, %rd2276;
- mul.f64 %fd222, %fd221, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4639, %fd222;
- setp.eq.s32 %p1429, %r7202, 0;
- neg.f32 %f4640, %f4639;
- selp.f32 %f5842, %f4639, %f4640, %p1429;
-
-$L__BB0_1689:
- and.b32 %r2225, %r8669, 1;
- setp.eq.s32 %p1430, %r2225, 0;
- selp.f32 %f1901, %f5842, 0f3F800000, %p1430;
- mul.rn.f32 %f1902, %f5842, %f5842;
- mov.f32 %f5843, 0fB94D4153;
- @%p1430 bra $L__BB0_1691;
-
- mov.f32 %f4643, 0fBAB607ED;
- mov.f32 %f4644, 0f37CBAC00;
- fma.rn.f32 %f5843, %f4644, %f1902, %f4643;
-
-$L__BB0_1691:
- selp.f32 %f4645, 0f3C0885E4, 0f3D2AAABB, %p1430;
- fma.rn.f32 %f4646, %f5843, %f1902, %f4645;
- selp.f32 %f4647, 0fBE2AAAA8, 0fBEFFFFFF, %p1430;
- fma.rn.f32 %f4648, %f4646, %f1902, %f4647;
- mov.f32 %f4649, 0f00000000;
- fma.rn.f32 %f4650, %f1902, %f1901, %f4649;
- fma.rn.f32 %f5213, %f4648, %f4650, %f1901;
- and.b32 %r7208, %r8669, 2;
- setp.eq.s32 %p1432, %r7208, 0;
- @%p1432 bra $L__BB0_1693;
-
- mov.f32 %f4652, 0fBF800000;
- fma.rn.f32 %f5213, %f5213, %f4652, %f4649;
-
-$L__BB0_1693:
- setp.lt.s32 %p1434, %r14, %r2207;
- selp.f32 %f1909, %f5213, %f5214, %p1434;
- @%p1422 bra $L__BB0_1706;
-
- mul.f32 %f4653, %f5333, 0f3F22F983;
- cvt.rni.s32.f32 %r8673, %f4653;
- cvt.rn.f32.s32 %f4654, %r8673;
- mov.f32 %f4655, 0fBFC90FDA;
- fma.rn.f32 %f4656, %f4654, %f4655, %f5333;
- mov.f32 %f4657, 0fB3A22168;
- fma.rn.f32 %f4658, %f4654, %f4657, %f4656;
- mov.f32 %f4659, 0fA7C234C5;
- fma.rn.f32 %f5846, %f4654, %f4659, %f4658;
- abs.f32 %f1911, %f5333;
- setp.ltu.f32 %p1435, %f1911, 0f47CE4780;
- @%p1435 bra $L__BB0_1702;
-
- setp.eq.f32 %p1436, %f1911, 0f7F800000;
- @%p1436 bra $L__BB0_1701;
- bra.uni $L__BB0_1696;
-
-$L__BB0_1701:
- mov.f32 %f4662, 0f00000000;
- mul.rn.f32 %f5846, %f5333, %f4662;
- mov.u32 %r8673, 0;
- bra.uni $L__BB0_1702;
-
-$L__BB0_1696:
- mov.b32 %r2227, %f5333;
- shr.u32 %r7210, %r2227, 23;
- and.b32 %r7211, %r7210, 255;
- add.s32 %r2228, %r7211, -128;
- shl.b32 %r7212, %r2227, 8;
- or.b32 %r2229, %r7212, -2147483648;
- shr.u32 %r2230, %r2228, 5;
- mov.u64 %rd2747, 0;
- mov.u32 %r8670, 0;
- mov.u64 %rd2745, __cudart_i2opi_f;
- mov.u64 %rd2746, %rd1;
-
-$L__BB0_1697:
- .pragma "nounroll";
- ld.global.nc.u32 %r7213, [%rd2745];
- mad.wide.u32 %rd2279, %r7213, %r2229, %rd2747;
- shr.u64 %rd2747, %rd2279, 32;
- st.local.u32 [%rd2746], %rd2279;
- add.s64 %rd2746, %rd2746, 4;
- add.s64 %rd2745, %rd2745, 4;
- add.s32 %r8670, %r8670, 1;
- setp.ne.s32 %p1437, %r8670, 6;
- @%p1437 bra $L__BB0_1697;
-
- st.local.u32 [%rd5], %rd2747;
- mov.u32 %r7214, 4;
- sub.s32 %r2233, %r7214, %r2230;
- mov.u32 %r7215, 6;
- sub.s32 %r7216, %r7215, %r2230;
- mul.wide.s32 %rd2280, %r7216, 4;
- add.s64 %rd2281, %rd1, %rd2280;
- ld.local.u32 %r8671, [%rd2281];
- ld.local.u32 %r8672, [%rd2281+-4];
- and.b32 %r2236, %r2228, 31;
- setp.eq.s32 %p1438, %r2236, 0;
- @%p1438 bra $L__BB0_1700;
-
- mov.u32 %r7217, 32;
- sub.s32 %r7218, %r7217, %r2236;
- shr.u32 %r7219, %r8672, %r7218;
- shl.b32 %r7220, %r8671, %r2236;
- add.s32 %r8671, %r7219, %r7220;
- mul.wide.s32 %rd2282, %r2233, 4;
- add.s64 %rd2283, %rd1, %rd2282;
- ld.local.u32 %r7221, [%rd2283];
- shr.u32 %r7222, %r7221, %r7218;
- shl.b32 %r7223, %r8672, %r2236;
- add.s32 %r8672, %r7222, %r7223;
-
-$L__BB0_1700:
- and.b32 %r7224, %r2227, -2147483648;
- shr.u32 %r7225, %r8672, 30;
- shl.b32 %r7226, %r8671, 2;
- or.b32 %r7227, %r7225, %r7226;
- shr.u32 %r7228, %r7227, 31;
- shr.u32 %r7229, %r8671, 30;
- add.s32 %r7230, %r7228, %r7229;
- neg.s32 %r7231, %r7230;
- setp.eq.s32 %p1439, %r7224, 0;
- selp.b32 %r8673, %r7230, %r7231, %p1439;
- setp.ne.s32 %p1440, %r7228, 0;
- xor.b32 %r7232, %r7224, -2147483648;
- selp.b32 %r7233, %r7232, %r7224, %p1440;
- selp.b32 %r7234, -1, 0, %p1440;
- xor.b32 %r7235, %r7227, %r7234;
- shl.b32 %r7236, %r8672, 2;
- xor.b32 %r7237, %r7236, %r7234;
- cvt.u64.u32 %rd2284, %r7235;
- cvt.u64.u32 %rd2285, %r7237;
- bfi.b64 %rd2286, %rd2284, %rd2285, 32, 32;
- cvt.rn.f64.s64 %fd223, %rd2286;
+ add.s32 %r8411, %r8411, 1;
+ setp.ne.s32 %p1409, %r8411, 6;
+ @%p1409 bra $L__BB0_1661;
+
+ st.local.u32 [%rd4], %rd2744;
+ mov.u32 %r7044, 4;
+ sub.s32 %r2185, %r7044, %r2182;
+ mov.u32 %r7045, 6;
+ sub.s32 %r7046, %r7045, %r2182;
+ mul.wide.s32 %rd2296, %r7046, 4;
+ add.s64 %rd2297, %rd1, %rd2296;
+ ld.local.u32 %r8412, [%rd2297];
+ ld.local.u32 %r8413, [%rd2297+-4];
+ and.b32 %r2188, %r2180, 31;
+ setp.eq.s32 %p1410, %r2188, 0;
+ @%p1410 bra $L__BB0_1664;
+
+ mov.u32 %r7047, 32;
+ sub.s32 %r7048, %r7047, %r2188;
+ shr.u32 %r7049, %r8413, %r7048;
+ shl.b32 %r7050, %r8412, %r2188;
+ add.s32 %r8412, %r7049, %r7050;
+ mul.wide.s32 %rd2298, %r2185, 4;
+ add.s64 %rd2299, %rd1, %rd2298;
+ ld.local.u32 %r7051, [%rd2299];
+ shr.u32 %r7052, %r7051, %r7048;
+ shl.b32 %r7053, %r8413, %r2188;
+ add.s32 %r8413, %r7052, %r7053;
+
+$L__BB0_1664:
+ and.b32 %r7054, %r2179, -2147483648;
+ shr.u32 %r7055, %r8413, 30;
+ shl.b32 %r7056, %r8412, 2;
+ or.b32 %r7057, %r7055, %r7056;
+ shr.u32 %r7058, %r7057, 31;
+ shr.u32 %r7059, %r8412, 30;
+ add.s32 %r7060, %r7058, %r7059;
+ neg.s32 %r7061, %r7060;
+ setp.eq.s32 %p1411, %r7054, 0;
+ selp.b32 %r8414, %r7060, %r7061, %p1411;
+ setp.ne.s32 %p1412, %r7058, 0;
+ xor.b32 %r7062, %r7054, -2147483648;
+ selp.b32 %r7063, %r7062, %r7054, %p1412;
+ selp.b32 %r7064, -1, 0, %p1412;
+ xor.b32 %r7065, %r7057, %r7064;
+ shl.b32 %r7066, %r8413, 2;
+ xor.b32 %r7067, %r7066, %r7064;
+ cvt.u64.u32 %rd2300, %r7065;
+ cvt.u64.u32 %rd2301, %r7067;
+ bfi.b64 %rd2302, %rd2300, %rd2301, 32, 32;
+ cvt.rn.f64.s64 %fd223, %rd2302;
mul.f64 %fd224, %fd223, 0d3BF921FB54442D19;
- cvt.rn.f32.f64 %f4660, %fd224;
- setp.eq.s32 %p1441, %r7233, 0;
- neg.f32 %f4661, %f4660;
- selp.f32 %f5846, %f4660, %f4661, %p1441;
-
-$L__BB0_1702:
- add.s32 %r2243, %r8673, 1;
- and.b32 %r2244, %r2243, 1;
- setp.eq.s32 %p1442, %r2244, 0;
- selp.f32 %f1915, %f5846, 0f3F800000, %p1442;
- mul.rn.f32 %f1916, %f5846, %f5846;
- mov.f32 %f5847, 0fB94D4153;
- @%p1442 bra $L__BB0_1704;
-
- mov.f32 %f4664, 0fBAB607ED;
- mov.f32 %f4665, 0f37CBAC00;
- fma.rn.f32 %f5847, %f4665, %f1916, %f4664;
-
-$L__BB0_1704:
- selp.f32 %f4666, 0f3C0885E4, 0f3D2AAABB, %p1442;
- fma.rn.f32 %f4667, %f5847, %f1916, %f4666;
- selp.f32 %f4668, 0fBE2AAAA8, 0fBEFFFFFF, %p1442;
- fma.rn.f32 %f4669, %f4667, %f1916, %f4668;
- mov.f32 %f4670, 0f00000000;
- fma.rn.f32 %f4671, %f1916, %f1915, %f4670;
- fma.rn.f32 %f5215, %f4669, %f4671, %f1915;
- and.b32 %r7239, %r2243, 2;
- setp.eq.s32 %p1444, %r7239, 0;
- @%p1444 bra $L__BB0_1706;
-
- mov.f32 %f4673, 0fBF800000;
- fma.rn.f32 %f5215, %f5215, %f4673, %f4670;
-
-$L__BB0_1706:
- selp.f32 %f1923, %f5215, %f5216, %p1434;
- @%p1422 bra $L__BB0_1901;
-
- add.f32 %f5898, %f1909, %f1923;
+ cvt.rn.f32.f64 %f4716, %fd224;
+ setp.eq.s32 %p1413, %r7063, 0;
+ neg.f32 %f4717, %f4716;
+ selp.f32 %f5938, %f4716, %f4717, %p1413;
+
+$L__BB0_1666:
+ add.s32 %r2195, %r8414, 1;
+ and.b32 %r2196, %r2195, 1;
+ setp.eq.s32 %p1414, %r2196, 0;
+ selp.f32 %f1947, %f5938, 0f3F800000, %p1414;
+ mul.rn.f32 %f1948, %f5938, %f5938;
+ mov.f32 %f5939, 0fB94D4153;
+ @%p1414 bra $L__BB0_1668;
+
+ mov.f32 %f4720, 0fBAB607ED;
+ mov.f32 %f4721, 0f37CBAC00;
+ fma.rn.f32 %f5939, %f4721, %f1948, %f4720;
+
+$L__BB0_1668:
+ selp.f32 %f4722, 0f3C0885E4, 0f3D2AAABB, %p1414;
+ fma.rn.f32 %f4723, %f5939, %f1948, %f4722;
+ selp.f32 %f4724, 0fBE2AAAA8, 0fBEFFFFFF, %p1414;
+ fma.rn.f32 %f4725, %f4723, %f1948, %f4724;
+ mov.f32 %f4726, 0f00000000;
+ fma.rn.f32 %f4727, %f1948, %f1947, %f4726;
+ fma.rn.f32 %f5283, %f4725, %f4727, %f1947;
+ and.b32 %r7069, %r2195, 2;
+ setp.eq.s32 %p1416, %r7069, 0;
+ @%p1416 bra $L__BB0_1670;
+
+ mov.f32 %f4729, 0fBF800000;
+ fma.rn.f32 %f5283, %f5283, %f4729, %f4726;
+
+$L__BB0_1670:
+ selp.f32 %f1955, %f5283, %f5284, %p1406;
+ @%p1394 bra $L__BB0_1865;
+
+ add.f32 %f5990, %f1941, %f1955;
+
+$L__BB0_1865:
+ shl.b32 %r2493, %r12, 1;
+ setp.lt.s32 %p1579, %r2493, 3;
+ and.pred %p1581, %p33, %p1579;
+ @%p1581 bra $L__BB0_1868;
+ bra.uni $L__BB0_1866;
+
+$L__BB0_1868:
+ add.f32 %f5081, %f5400, 0f00000000;
+ add.f32 %f5082, %f5081, %f5399;
+ add.f32 %f5083, %f5082, %f5398;
+ add.f32 %f5084, %f5083, %f5397;
+ add.f32 %f5085, %f5084, %f5396;
+ add.f32 %f5086, %f5085, %f5395;
+ add.f32 %f5087, %f5086, %f5394;
+ add.f32 %f5998, %f5087, %f5393;
+ bra.uni $L__BB0_1869;
+
+$L__BB0_1866:
+ setp.lt.s32 %p1582, %r2493, 15;
+ shl.b32 %r7628, %r12, 6;
+ neg.s32 %r7629, %r7628;
+ setp.lt.s32 %p1583, %r11, %r7629;
+ and.pred %p1584, %p1582, %p1583;
+ add.f32 %f5067, %f5400, 0f00000000;
+ selp.f32 %f5068, %f5067, 0f00000000, %p1584;
+ mov.u32 %r7630, -32;
+ sub.s32 %r2494, %r7630, %r7628;
+ setp.lt.s32 %p1585, %r11, %r2494;
+ add.f32 %f5069, %f5068, %f5399;
+ and.pred %p1586, %p1582, %p1585;
+ selp.f32 %f5070, %f5069, %f5068, %p1586;
+ add.f32 %f5071, %f5070, %f5398;
+ setp.lt.s32 %p1587, %r2493, 14;
+ and.pred %p1588, %p1587, %p1583;
+ selp.f32 %f5072, %f5071, %f5070, %p1588;
+ add.f32 %f5073, %f5072, %f5397;
+ and.pred %p1589, %p1587, %p1585;
+ selp.f32 %f5074, %f5073, %f5072, %p1589;
+ add.s32 %r7631, %r2493, 2;
+ setp.lt.s32 %p1590, %r7631, 15;
+ add.f32 %f5075, %f5074, %f5396;
+ and.pred %p1591, %p1590, %p1583;
+ selp.f32 %f5076, %f5075, %f5074, %p1591;
+ add.f32 %f5077, %f5076, %f5395;
+ and.pred %p1592, %p1590, %p1585;
+ selp.f32 %f5078, %f5077, %f5076, %p1592;
+ add.s32 %r7632, %r2493, 3;
+ setp.gt.s32 %p1593, %r7632, 14;
+ setp.lt.s32 %p1594, %r7632, 15;
+ add.f32 %f5079, %f5078, %f5394;
+ and.pred %p1595, %p1594, %p1583;
+ selp.f32 %f5998, %f5079, %f5078, %p1595;
+ @%p1593 bra $L__BB0_1869;
+
+ add.f32 %f5080, %f5998, %f5393;
+ selp.f32 %f5998, %f5080, %f5998, %p1585;
+
+$L__BB0_1869:
+ setp.lt.s32 %p1597, %r2493, 2;
+ and.pred %p1599, %p33, %p1597;
+ @%p1599 bra $L__BB0_1872;
+ bra.uni $L__BB0_1870;
+
+$L__BB0_1872:
+ add.f32 %f5102, %f5998, %f5599;
+ add.f32 %f5103, %f5102, %f5598;
+ add.f32 %f5104, %f5103, %f5597;
+ add.f32 %f5105, %f5104, %f5596;
+ add.f32 %f5106, %f5105, %f5595;
+ add.f32 %f5107, %f5106, %f5594;
+ add.f32 %f5108, %f5107, %f5593;
+ add.f32 %f5999, %f5108, %f5592;
+ bra.uni $L__BB0_1873;
+
+$L__BB0_1870:
+ add.s32 %r7633, %r2493, 4;
+ setp.lt.s32 %p1600, %r7633, 15;
+ shl.b32 %r7634, %r12, 6;
+ neg.s32 %r7635, %r7634;
+ setp.lt.s32 %p1601, %r11, %r7635;
+ and.pred %p1602, %p1600, %p1601;
+ add.f32 %f5088, %f5998, %f5599;
+ selp.f32 %f5089, %f5088, %f5998, %p1602;
+ mov.u32 %r7636, -32;
+ sub.s32 %r2495, %r7636, %r7634;
+ setp.lt.s32 %p1603, %r11, %r2495;
+ add.f32 %f5090, %f5089, %f5598;
+ and.pred %p1604, %p1600, %p1603;
+ selp.f32 %f5091, %f5090, %f5089, %p1604;
+ add.s32 %r7637, %r2493, 5;
+ setp.lt.s32 %p1605, %r7637, 15;
+ add.f32 %f5092, %f5091, %f5597;
+ and.pred %p1606, %p1605, %p1601;
+ selp.f32 %f5093, %f5092, %f5091, %p1606;
+ add.f32 %f5094, %f5093, %f5596;
+ and.pred %p1607, %p1605, %p1603;
+ selp.f32 %f5095, %f5094, %f5093, %p1607;
+ add.s32 %r7638, %r2493, 6;
+ setp.lt.s32 %p1608, %r7638, 15;
+ add.f32 %f5096, %f5095, %f5595;
+ and.pred %p1609, %p1608, %p1601;
+ selp.f32 %f5097, %f5096, %f5095, %p1609;
+ add.f32 %f5098, %f5097, %f5594;
+ and.pred %p1610, %p1608, %p1603;
+ selp.f32 %f5099, %f5098, %f5097, %p1610;
+ add.s32 %r7639, %r2493, 7;
+ setp.gt.s32 %p1611, %r7639, 14;
+ setp.lt.s32 %p1612, %r7639, 15;
+ add.f32 %f5100, %f5099, %f5593;
+ and.pred %p1613, %p1612, %p1601;
+ selp.f32 %f5999, %f5100, %f5099, %p1613;
+ @%p1611 bra $L__BB0_1873;
+
+ add.f32 %f5101, %f5999, %f5592;
+ selp.f32 %f5999, %f5101, %f5999, %p1603;
+
+$L__BB0_1873:
+ setp.lt.s32 %p1615, %r2493, 1;
+ and.pred %p1617, %p33, %p1615;
+ @%p1617 bra $L__BB0_1876;
+ bra.uni $L__BB0_1874;
+
+$L__BB0_1876:
+ add.f32 %f5123, %f5999, %f5798;
+ add.f32 %f5124, %f5123, %f5797;
+ add.f32 %f5125, %f5124, %f5796;
+ add.f32 %f5126, %f5125, %f5795;
+ add.f32 %f5127, %f5126, %f5794;
+ add.f32 %f5128, %f5127, %f5793;
+ add.f32 %f5129, %f5128, %f5792;
+ add.f32 %f6000, %f5129, %f5791;
+ bra.uni $L__BB0_1877;
+
+$L__BB0_1874:
+ add.s32 %r7640, %r2493, 8;
+ setp.lt.s32 %p1618, %r7640, 15;
+ shl.b32 %r7641, %r12, 6;
+ neg.s32 %r7642, %r7641;
+ setp.lt.s32 %p1619, %r11, %r7642;
+ and.pred %p1620, %p1618, %p1619;
+ add.f32 %f5109, %f5999, %f5798;
+ selp.f32 %f5110, %f5109, %f5999, %p1620;
+ mov.u32 %r7643, -32;
+ sub.s32 %r2496, %r7643, %r7641;
+ setp.lt.s32 %p1621, %r11, %r2496;
+ add.f32 %f5111, %f5110, %f5797;
+ and.pred %p1622, %p1618, %p1621;
+ selp.f32 %f5112, %f5111, %f5110, %p1622;
+ add.s32 %r7644, %r2493, 9;
+ setp.lt.s32 %p1623, %r7644, 15;
+ add.f32 %f5113, %f5112, %f5796;
+ and.pred %p1624, %p1623, %p1619;
+ selp.f32 %f5114, %f5113, %f5112, %p1624;
+ add.f32 %f5115, %f5114, %f5795;
+ and.pred %p1625, %p1623, %p1621;
+ selp.f32 %f5116, %f5115, %f5114, %p1625;
+ add.s32 %r7645, %r2493, 10;
+ setp.lt.s32 %p1626, %r7645, 15;
+ add.f32 %f5117, %f5116, %f5794;
+ and.pred %p1627, %p1626, %p1619;
+ selp.f32 %f5118, %f5117, %f5116, %p1627;
+ add.f32 %f5119, %f5118, %f5793;
+ and.pred %p1628, %p1626, %p1621;
+ selp.f32 %f5120, %f5119, %f5118, %p1628;
+ add.s32 %r7646, %r2493, 11;
+ setp.gt.s32 %p1629, %r7646, 14;
+ setp.lt.s32 %p1630, %r7646, 15;
+ add.f32 %f5121, %f5120, %f5792;
+ and.pred %p1631, %p1630, %p1619;
+ selp.f32 %f6000, %f5121, %f5120, %p1631;
+ @%p1629 bra $L__BB0_1877;
+
+ add.f32 %f5122, %f6000, %f5791;
+ selp.f32 %f6000, %f5122, %f6000, %p1621;
+
+$L__BB0_1877:
+ setp.lt.s32 %p1633, %r2493, 0;
+ and.pred %p1635, %p33, %p1633;
+ @%p1635 bra $L__BB0_1880;
+ bra.uni $L__BB0_1878;
+
+$L__BB0_1880:
+ add.f32 %f5144, %f6000, %f5997;
+ add.f32 %f5145, %f5144, %f5996;
+ add.f32 %f5146, %f5145, %f5995;
+ add.f32 %f5147, %f5146, %f5994;
+ add.f32 %f5148, %f5147, %f5993;
+ add.f32 %f5149, %f5148, %f5992;
+ add.f32 %f5150, %f5149, %f5991;
+ add.f32 %f6001, %f5150, %f5990;
+ bra.uni $L__BB0_1881;
+
+$L__BB0_1878:
+ add.s32 %r7647, %r2493, 12;
+ setp.lt.s32 %p1636, %r7647, 15;
+ shl.b32 %r7648, %r12, 6;
+ neg.s32 %r7649, %r7648;
+ setp.lt.s32 %p1637, %r11, %r7649;
+ and.pred %p1638, %p1636, %p1637;
+ add.f32 %f5130, %f6000, %f5997;
+ selp.f32 %f5131, %f5130, %f6000, %p1638;
+ mov.u32 %r7650, -32;
+ sub.s32 %r2497, %r7650, %r7648;
+ setp.lt.s32 %p1639, %r11, %r2497;
+ add.f32 %f5132, %f5131, %f5996;
+ and.pred %p1640, %p1636, %p1639;
+ selp.f32 %f5133, %f5132, %f5131, %p1640;
+ add.s32 %r7651, %r2493, 13;
+ setp.lt.s32 %p1641, %r7651, 15;
+ add.f32 %f5134, %f5133, %f5995;
+ and.pred %p1642, %p1641, %p1637;
+ selp.f32 %f5135, %f5134, %f5133, %p1642;
+ add.f32 %f5136, %f5135, %f5994;
+ and.pred %p1643, %p1641, %p1639;
+ selp.f32 %f5137, %f5136, %f5135, %p1643;
+ add.s32 %r7652, %r2493, 14;
+ setp.lt.s32 %p1644, %r7652, 15;
+ add.f32 %f5138, %f5137, %f5993;
+ and.pred %p1645, %p1644, %p1637;
+ selp.f32 %f5139, %f5138, %f5137, %p1645;
+ add.f32 %f5140, %f5139, %f5992;
+ and.pred %p1646, %p1644, %p1639;
+ selp.f32 %f5141, %f5140, %f5139, %p1646;
+ add.s32 %r7653, %r2493, 15;
+ setp.gt.s32 %p1647, %r7653, 14;
+ setp.lt.s32 %p1648, %r7653, 15;
+ add.f32 %f5142, %f5141, %f5991;
+ and.pred %p1649, %p1648, %p1637;
+ selp.f32 %f6001, %f5142, %f5141, %p1649;
+ @%p1647 bra $L__BB0_1881;
+
+ add.f32 %f5143, %f6001, %f5990;
+ selp.f32 %f6001, %f5143, %f6001, %p1639;
+
+$L__BB0_1881:
+ shl.b32 %r2498, %r12, 2;
+ mov.u32 %r7654, %tid.z;
+ or.b32 %r2499, %r1, %r7654;
+ mov.u32 %r7655, %ntid.z;
+ mov.u32 %r2500, %ntid.x;
+ mul.lo.s32 %r2501, %r2500, %r7655;
+ mul.lo.s32 %r7656, %r7654, %r2500;
+ add.s32 %r2502, %r7656, %r1;
+ mov.u32 %r2503, %tid.y;
+ mul.lo.s32 %r2504, %r2503, %r2500;
+ add.s32 %r7657, %r2504, %r1;
+ mov.u32 %r2505, %ntid.y;
+ mad.lo.s32 %r2506, %r7656, %r2505, %r7657;
+ mul.wide.u32 %rd2489, %r2506, 4;
+ mov.u64 %rd2490, _ZN11kernelscope6kernelE;
+ add.s64 %rd581, %rd2490, %rd2489;
+ st.shared.f32 [%rd581], %f6001;
+ bar.sync 0;
+ clz.b32 %r7658, %r2501;
+ mov.u32 %r7659, 31;
+ sub.s32 %r7660, %r7659, %r7658;
+ mov.u32 %r7661, 1;
+ shl.b32 %r8480, %r7661, %r7660;
+ setp.ge.u32 %p1651, %r2502, %r8480;
+ add.s32 %r2508, %r8480, %r2502;
+ setp.ge.u32 %p1652, %r2508, %r2501;
+ or.pred %p1653, %p1651, %p1652;
+ @%p1653 bra $L__BB0_1885;
+
+ add.s32 %r8479, %r8480, %r2506;
+ setp.lt.u32 %p1654, %r2505, 2;
+ @%p1654 bra $L__BB0_1884;
+
+ rem.u32 %r7662, %r2508, %r2500;
+ add.s32 %r7663, %r7662, %r2504;
+ sub.s32 %r7664, %r2508, %r7662;
+ mad.lo.s32 %r8479, %r7664, %r2505, %r7663;
+
+$L__BB0_1884:
+ mul.wide.s32 %rd2491, %r8479, 4;
+ add.s64 %rd2493, %rd2490, %rd2491;
+ ld.shared.f32 %f5151, [%rd581];
+ ld.shared.f32 %f5152, [%rd2493];
+ add.f32 %f5153, %f5152, %f5151;
+ st.shared.f32 [%rd581], %f5153;
+
+$L__BB0_1885:
+ bar.sync 0;
+ setp.lt.s32 %p1655, %r8480, 4;
+ @%p1655 bra $L__BB0_1891;
+
+$L__BB0_1886:
+ shr.u32 %r2513, %r8480, 1;
+ setp.ge.u32 %p1656, %r2502, %r2513;
+ @%p1656 bra $L__BB0_1890;
+
+ setp.lt.u32 %p1657, %r2505, 2;
+ add.s32 %r8481, %r2513, %r2506;
+ @%p1657 bra $L__BB0_1889;
+
+ add.s32 %r7665, %r2513, %r2502;
+ rem.u32 %r7666, %r7665, %r2500;
+ add.s32 %r7667, %r7666, %r2504;
+ sub.s32 %r7668, %r7665, %r7666;
+ mad.lo.s32 %r8481, %r7668, %r2505, %r7667;
+
+$L__BB0_1889:
+ mul.wide.s32 %rd2494, %r8481, 4;
+ add.s64 %rd2496, %rd2490, %rd2494;
+ ld.shared.f32 %f5154, [%rd581];
+ ld.shared.f32 %f5155, [%rd2496];
+ add.f32 %f5156, %f5155, %f5154;
+ st.shared.f32 [%rd581], %f5156;
+
+$L__BB0_1890:
+ bar.sync 0;
+ setp.gt.u32 %p1658, %r8480, 7;
+ mov.u32 %r8480, %r2513;
+ @%p1658 bra $L__BB0_1886;
+
+$L__BB0_1891:
+ setp.ne.s32 %p1659, %r2499, 0;
+ mov.f32 %f6002, 0f00000000;
+ @%p1659 bra $L__BB0_1894;
+
+ ld.shared.f32 %f5158, [%rd581];
+ add.f32 %f6002, %f5158, 0f00000000;
+ setp.lt.u32 %p1660, %r2501, 2;
+ @%p1660 bra $L__BB0_1894;
+
+ add.s32 %r7669, %r2506, 1;
+ mul.wide.u32 %rd2497, %r7669, 4;
+ add.s64 %rd2499, %rd2490, %rd2497;
+ ld.shared.f32 %f5159, [%rd2499];
+ add.f32 %f6002, %f6002, %f5159;
+
+$L__BB0_1894:
+ bar.sync 0;
+ mul.wide.s32 %rd2500, %r2503, 4;
+ add.s64 %rd582, %rd2490, %rd2500;
+ setp.eq.s32 %p1661, %r2499, 0;
+ @%p1661 bra $L__BB0_1895;
+ bra.uni $L__BB0_1896;
+
+$L__BB0_1895:
+ st.shared.f32 [%rd582], %f6002;
+
+$L__BB0_1896:
+ ld.param.u64 %rd2504, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEES1_NS0_IfLi3ELi3EEE_param_2];
+ mov.u32 %r7784, %ctaid.x;
+ bar.sync 0;
+ ld.shared.f32 %f2200, [%rd582];
+ bar.sync 0;
+ setp.lt.s32 %p1663, %r2498, 3;
+ and.pred %p1664, %p33, %p1663;
+ mad.lo.s32 %r7671, %r7784, 63, %r1;
+ shl.b32 %r7672, %r12, 7;
+ add.s32 %r7673, %r7671, %r7672;
+ cvta.to.global.u64 %rd2502, %rd2504;
+ mul.wide.s32 %rd2503, %r7673, 4;
+ add.s64 %rd583, %rd2502, %rd2503;
+ @%p1664 bra $L__BB0_1925;
+ bra.uni $L__BB0_1897;
+
+$L__BB0_1925:
+ add.f32 %f5165, %f2200, %f5400;
+ st.global.f32 [%rd583], %f5165;
+ add.f32 %f5166, %f2200, %f5399;
+ st.global.f32 [%rd583+128], %f5166;
+ add.f32 %f5167, %f2200, %f5398;
+ st.global.f32 [%rd583+10584], %f5167;
+ add.f32 %f5168, %f2200, %f5397;
+ st.global.f32 [%rd583+10712], %f5168;
+ add.f32 %f5169, %f2200, %f5396;
+ st.global.f32 [%rd583+21168], %f5169;
+ add.f32 %f5170, %f2200, %f5395;
+ st.global.f32 [%rd583+21296], %f5170;
+ add.f32 %f5171, %f2200, %f5394;
+ st.global.f32 [%rd583+31752], %f5171;
+ add.f32 %f5172, %f2200, %f5393;
+ st.global.f32 [%rd583+31880], %f5172;
+ bra.uni $L__BB0_1926;
+
+$L__BB0_1897:
+ setp.gt.s32 %p1665, %r2498, 14;
+ @%p1665 bra $L__BB0_1899;
+
+ add.f32 %f6034, %f2200, %f5400;
+ add.f32 %f6033, %f2200, %f5399;
+
+$L__BB0_1899:
+ setp.gt.s32 %p1666, %r2498, 13;
+ @%p1666 bra $L__BB0_1901;
+
+ add.f32 %f6016, %f2200, %f5398;
+ add.f32 %f6015, %f2200, %f5397;
$L__BB0_1901:
- shl.b32 %r2541, %r12, 1;
- @%p32 bra $L__BB0_1903;
-
- shl.b32 %r7804, %r12, 3;
- neg.s32 %r7805, %r7804;
- add.s32 %r7806, %r13, -12;
- setp.lt.s32 %p1608, %r7806, %r7805;
- @%p1608 bra $L__BB0_1905;
- bra.uni $L__BB0_1903;
+ setp.gt.s32 %p1667, %r2498, 12;
+ @%p1667 bra $L__BB0_1903;
+
+ add.f32 %f6014, %f2200, %f5396;
+ add.f32 %f6013, %f2200, %f5395;
+
+$L__BB0_1903:
+ setp.gt.s32 %p1668, %r2498, 11;
+ @%p1668 bra $L__BB0_1905;
+
+ add.f32 %f6012, %f2200, %f5394;
+ add.f32 %f6011, %f2200, %f5393;
$L__BB0_1905:
- add.f32 %f5025, %f5332, 0f00000000;
- add.f32 %f5026, %f5025, %f5331;
- add.f32 %f5027, %f5026, %f5330;
- add.f32 %f5028, %f5027, %f5329;
- add.f32 %f5029, %f5028, %f5328;
- add.f32 %f5030, %f5029, %f5327;
- add.f32 %f5031, %f5030, %f5326;
- add.f32 %f5906, %f5031, %f5325;
- bra.uni $L__BB0_1906;
-
-$L__BB0_1903:
- neg.s32 %r7807, %r2541;
- add.s32 %r7808, %r13, -15;
- setp.lt.s32 %p1609, %r7808, %r7807;
- shl.b32 %r7809, %r12, 6;
- neg.s32 %r7810, %r7809;
- setp.lt.s32 %p1610, %r14, %r7810;
- and.pred %p1611, %p1609, %p1610;
- add.f32 %f5011, %f5332, 0f00000000;
- selp.f32 %f5012, %f5011, 0f00000000, %p1611;
- mov.u32 %r7811, -32;
- sub.s32 %r2542, %r7811, %r7809;
- setp.lt.s32 %p1612, %r14, %r2542;
- add.f32 %f5013, %f5012, %f5331;
- and.pred %p1613, %p1609, %p1612;
- selp.f32 %f5014, %f5013, %f5012, %p1613;
- not.b32 %r7812, %r2541;
- setp.lt.s32 %p1614, %r7808, %r7812;
- add.f32 %f5015, %f5014, %f5330;
- and.pred %p1615, %p1614, %p1610;
- selp.f32 %f5016, %f5015, %f5014, %p1615;
- add.f32 %f5017, %f5016, %f5329;
- and.pred %p1616, %p1614, %p1612;
- selp.f32 %f5018, %f5017, %f5016, %p1616;
- mov.u32 %r7813, -2;
- sub.s32 %r7814, %r7813, %r2541;
- setp.lt.s32 %p1617, %r7808, %r7814;
- add.f32 %f5019, %f5018, %f5328;
- and.pred %p1618, %p1617, %p1610;
- selp.f32 %f5020, %f5019, %f5018, %p1618;
- add.f32 %f5021, %f5020, %f5327;
- and.pred %p1619, %p1617, %p1612;
- selp.f32 %f5022, %f5021, %f5020, %p1619;
- mov.u32 %r7815, -3;
- sub.s32 %r7816, %r7815, %r2541;
- setp.ge.s32 %p1620, %r7808, %r7816;
- setp.lt.s32 %p1621, %r7808, %r7816;
- add.f32 %f5023, %f5022, %f5326;
- and.pred %p1622, %p1621, %p1610;
- selp.f32 %f5906, %f5023, %f5022, %p1622;
- @%p1620 bra $L__BB0_1906;
-
- add.f32 %f5024, %f5906, %f5325;
- selp.f32 %f5906, %f5024, %f5906, %p1612;
-
-$L__BB0_1906:
- @%p32 bra $L__BB0_1908;
-
- shl.b32 %r7817, %r12, 3;
- mov.u32 %r7818, -4;
- sub.s32 %r7819, %r7818, %r7817;
- add.s32 %r7820, %r13, -12;
- setp.lt.s32 %p1625, %r7820, %r7819;
- @%p1625 bra $L__BB0_1910;
- bra.uni $L__BB0_1908;
+ @%p1665 bra $L__BB0_1910;
+
+ neg.s32 %r7674, %r7672;
+ setp.ge.s32 %p1670, %r11, %r7674;
+ @%p1670 bra $L__BB0_1908;
+
+ st.global.f32 [%rd583], %f6034;
+
+$L__BB0_1908:
+ mov.u32 %r7675, -32;
+ sub.s32 %r7676, %r7675, %r7672;
+ setp.ge.s32 %p1671, %r11, %r7676;
+ @%p1671 bra $L__BB0_1910;
+
+ st.global.f32 [%rd583+128], %f6033;
$L__BB0_1910:
- add.f32 %f5046, %f5906, %f5523;
- add.f32 %f5047, %f5046, %f5522;
- add.f32 %f5048, %f5047, %f5521;
- add.f32 %f5049, %f5048, %f5520;
- add.f32 %f5050, %f5049, %f5519;
- add.f32 %f5051, %f5050, %f5518;
- add.f32 %f5052, %f5051, %f5517;
- add.f32 %f5907, %f5052, %f5516;
- bra.uni $L__BB0_1911;
-
-$L__BB0_1908:
- mov.u32 %r7821, -4;
- sub.s32 %r7822, %r7821, %r2541;
- add.s32 %r7823, %r13, -15;
- setp.lt.s32 %p1626, %r7823, %r7822;
- shl.b32 %r7824, %r12, 6;
- neg.s32 %r7825, %r7824;
- setp.lt.s32 %p1627, %r14, %r7825;
- and.pred %p1628, %p1626, %p1627;
- add.f32 %f5032, %f5906, %f5523;
- selp.f32 %f5033, %f5032, %f5906, %p1628;
- mov.u32 %r7826, -32;
- sub.s32 %r2543, %r7826, %r7824;
- setp.lt.s32 %p1629, %r14, %r2543;
- add.f32 %f5034, %f5033, %f5522;
- and.pred %p1630, %p1626, %p1629;
- selp.f32 %f5035, %f5034, %f5033, %p1630;
- mov.u32 %r7827, -5;
- sub.s32 %r7828, %r7827, %r2541;
- setp.lt.s32 %p1631, %r7823, %r7828;
- add.f32 %f5036, %f5035, %f5521;
- and.pred %p1632, %p1631, %p1627;
- selp.f32 %f5037, %f5036, %f5035, %p1632;
- add.f32 %f5038, %f5037, %f5520;
- and.pred %p1633, %p1631, %p1629;
- selp.f32 %f5039, %f5038, %f5037, %p1633;
- mov.u32 %r7829, -6;
- sub.s32 %r7830, %r7829, %r2541;
- setp.lt.s32 %p1634, %r7823, %r7830;
- add.f32 %f5040, %f5039, %f5519;
- and.pred %p1635, %p1634, %p1627;
- selp.f32 %f5041, %f5040, %f5039, %p1635;
- add.f32 %f5042, %f5041, %f5518;
- and.pred %p1636, %p1634, %p1629;
- selp.f32 %f5043, %f5042, %f5041, %p1636;
- mov.u32 %r7831, -7;
- sub.s32 %r7832, %r7831, %r2541;
- setp.ge.s32 %p1637, %r7823, %r7832;
- setp.lt.s32 %p1638, %r7823, %r7832;
- add.f32 %f5044, %f5043, %f5517;
- and.pred %p1639, %p1638, %p1627;
- selp.f32 %f5907, %f5044, %f5043, %p1639;
- @%p1637 bra $L__BB0_1911;
-
- add.f32 %f5045, %f5907, %f5516;
- selp.f32 %f5907, %f5045, %f5907, %p1629;
-
-$L__BB0_1911:
- @%p32 bra $L__BB0_1913;
-
- shl.b32 %r7833, %r12, 3;
- mov.u32 %r7834, -8;
- sub.s32 %r7835, %r7834, %r7833;
- add.s32 %r7836, %r13, -12;
- setp.lt.s32 %p1642, %r7836, %r7835;
- @%p1642 bra $L__BB0_1915;
- bra.uni $L__BB0_1913;
+ @%p1666 bra $L__BB0_1915;
+
+ neg.s32 %r7677, %r7672;
+ setp.ge.s32 %p1673, %r11, %r7677;
+ @%p1673 bra $L__BB0_1913;
+
+ st.global.f32 [%rd583+10584], %f6016;
+
+$L__BB0_1913:
+ mov.u32 %r7678, -32;
+ sub.s32 %r7679, %r7678, %r7672;
+ setp.ge.s32 %p1674, %r11, %r7679;
+ @%p1674 bra $L__BB0_1915;
+
+ st.global.f32 [%rd583+10712], %f6015;
$L__BB0_1915:
- add.f32 %f5067, %f5907, %f5714;
- add.f32 %f5068, %f5067, %f5713;
- add.f32 %f5069, %f5068, %f5712;
- add.f32 %f5070, %f5069, %f5711;
- add.f32 %f5071, %f5070, %f5710;
- add.f32 %f5072, %f5071, %f5709;
- add.f32 %f5073, %f5072, %f5708;
- add.f32 %f5908, %f5073, %f5707;
- bra.uni $L__BB0_1916;
-
-$L__BB0_1913:
- mov.u32 %r7837, -8;
- sub.s32 %r7838, %r7837, %r2541;
- add.s32 %r7839, %r13, -15;
- setp.lt.s32 %p1643, %r7839, %r7838;
- shl.b32 %r7840, %r12, 6;
- neg.s32 %r7841, %r7840;
- setp.lt.s32 %p1644, %r14, %r7841;
- and.pred %p1645, %p1643, %p1644;
- add.f32 %f5053, %f5907, %f5714;
- selp.f32 %f5054, %f5053, %f5907, %p1645;
- mov.u32 %r7842, -32;
- sub.s32 %r2544, %r7842, %r7840;
- setp.lt.s32 %p1646, %r14, %r2544;
- add.f32 %f5055, %f5054, %f5713;
- and.pred %p1647, %p1643, %p1646;
- selp.f32 %f5056, %f5055, %f5054, %p1647;
- mov.u32 %r7843, -9;
- sub.s32 %r7844, %r7843, %r2541;
- setp.lt.s32 %p1648, %r7839, %r7844;
- add.f32 %f5057, %f5056, %f5712;
- and.pred %p1649, %p1648, %p1644;
- selp.f32 %f5058, %f5057, %f5056, %p1649;
- add.f32 %f5059, %f5058, %f5711;
- and.pred %p1650, %p1648, %p1646;
- selp.f32 %f5060, %f5059, %f5058, %p1650;
- mov.u32 %r7845, -10;
- sub.s32 %r7846, %r7845, %r2541;
- setp.lt.s32 %p1651, %r7839, %r7846;
- add.f32 %f5061, %f5060, %f5710;
- and.pred %p1652, %p1651, %p1644;
- selp.f32 %f5062, %f5061, %f5060, %p1652;
- add.f32 %f5063, %f5062, %f5709;
- and.pred %p1653, %p1651, %p1646;
- selp.f32 %f5064, %f5063, %f5062, %p1653;
- mov.u32 %r7847, -11;
- sub.s32 %r7848, %r7847, %r2541;
- setp.ge.s32 %p1654, %r7839, %r7848;
- setp.lt.s32 %p1655, %r7839, %r7848;
- add.f32 %f5065, %f5064, %f5708;
- and.pred %p1656, %p1655, %p1644;
- selp.f32 %f5908, %f5065, %f5064, %p1656;
- @%p1654 bra $L__BB0_1916;
-
- add.f32 %f5066, %f5908, %f5707;
- selp.f32 %f5908, %f5066, %f5908, %p1646;
-
-$L__BB0_1916:
- @%p32 bra $L__BB0_1918;
-
- shl.b32 %r7849, %r12, 3;
- mov.u32 %r7850, -12;
- sub.s32 %r7851, %r7850, %r7849;
- add.s32 %r7852, %r13, -12;
- setp.lt.s32 %p1659, %r7852, %r7851;
- @%p1659 bra $L__BB0_1920;
- bra.uni $L__BB0_1918;
+ @%p1667 bra $L__BB0_1920;
+
+ neg.s32 %r7680, %r7672;
+ setp.ge.s32 %p1676, %r11, %r7680;
+ @%p1676 bra $L__BB0_1918;
+
+ st.global.f32 [%rd583+21168], %f6014;
+
+$L__BB0_1918:
+ mov.u32 %r7681, -32;
+ sub.s32 %r7682, %r7681, %r7672;
+ setp.ge.s32 %p1677, %r11, %r7682;
+ @%p1677 bra $L__BB0_1920;
+
+ st.global.f32 [%rd583+21296], %f6013;
$L__BB0_1920:
- add.f32 %f5088, %f5908, %f5905;
- add.f32 %f5089, %f5088, %f5904;
- add.f32 %f5090, %f5089, %f5903;
- add.f32 %f5091, %f5090, %f5902;
- add.f32 %f5092, %f5091, %f5901;
- add.f32 %f5093, %f5092, %f5900;
- add.f32 %f5094, %f5093, %f5899;
- add.f32 %f5909, %f5094, %f5898;
- bra.uni $L__BB0_1921;
-
-$L__BB0_1918:
- mov.u32 %r7853, -12;
- sub.s32 %r7854, %r7853, %r2541;
- add.s32 %r7855, %r13, -15;
- mov.u32 %r7856, -15;
- setp.lt.s32 %p1660, %r7855, %r7854;
- shl.b32 %r7857, %r12, 6;
- neg.s32 %r7858, %r7857;
- setp.lt.s32 %p1661, %r14, %r7858;
- and.pred %p1662, %p1660, %p1661;
- add.f32 %f5074, %f5908, %f5905;
- selp.f32 %f5075, %f5074, %f5908, %p1662;
- mov.u32 %r7859, -32;
- sub.s32 %r2545, %r7859, %r7857;
- setp.lt.s32 %p1663, %r14, %r2545;
- add.f32 %f5076, %f5075, %f5904;
- and.pred %p1664, %p1660, %p1663;
- selp.f32 %f5077, %f5076, %f5075, %p1664;
- mov.u32 %r7860, -13;
- sub.s32 %r7861, %r7860, %r2541;
- setp.lt.s32 %p1665, %r7855, %r7861;
- add.f32 %f5078, %f5077, %f5903;
- and.pred %p1666, %p1665, %p1661;
- selp.f32 %f5079, %f5078, %f5077, %p1666;
- add.f32 %f5080, %f5079, %f5902;
- and.pred %p1667, %p1665, %p1663;
- selp.f32 %f5081, %f5080, %f5079, %p1667;
- mov.u32 %r7862, -14;
- sub.s32 %r7863, %r7862, %r2541;
- setp.lt.s32 %p1668, %r7855, %r7863;
- add.f32 %f5082, %f5081, %f5901;
- and.pred %p1669, %p1668, %p1661;
- selp.f32 %f5083, %f5082, %f5081, %p1669;
- add.f32 %f5084, %f5083, %f5900;
- and.pred %p1670, %p1668, %p1663;
- selp.f32 %f5085, %f5084, %f5083, %p1670;
- sub.s32 %r7864, %r7856, %r2541;
- setp.ge.s32 %p1671, %r7855, %r7864;
- setp.lt.s32 %p1672, %r7855, %r7864;
- add.f32 %f5086, %f5085, %f5899;
- and.pred %p1673, %p1672, %p1661;
- selp.f32 %f5909, %f5086, %f5085, %p1673;
- @%p1671 bra $L__BB0_1921;
-
- add.f32 %f5087, %f5909, %f5898;
- selp.f32 %f5909, %f5087, %f5909, %p1663;
-
-$L__BB0_1921:
- shl.b32 %r2546, %r12, 2;
- or.b32 %r2547, %r1, %r13;
- mov.u32 %r7865, %ntid.z;
- mov.u32 %r2548, %ntid.x;
- mul.lo.s32 %r2549, %r2548, %r7865;
- mul.lo.s32 %r7866, %r13, %r2548;
- add.s32 %r2550, %r7866, %r1;
- mov.u32 %r2551, %tid.y;
- mul.lo.s32 %r2552, %r2551, %r2548;
- add.s32 %r7867, %r2552, %r1;
- mov.u32 %r2553, %ntid.y;
- mad.lo.s32 %r2554, %r7866, %r2553, %r7867;
- mul.wide.u32 %rd2473, %r2554, 4;
- mov.u64 %rd2474, _ZN11kernelscope6kernelE;
- add.s64 %rd602, %rd2474, %rd2473;
- st.shared.f32 [%rd602], %f5909;
- bar.sync 0;
- clz.b32 %r7868, %r2549;
- mov.u32 %r7869, 31;
- sub.s32 %r7870, %r7869, %r7868;
- mov.u32 %r7871, 1;
- shl.b32 %r8739, %r7871, %r7870;
- setp.ge.u32 %p1675, %r2550, %r8739;
- add.s32 %r2556, %r8739, %r2550;
- setp.ge.u32 %p1676, %r2556, %r2549;
- or.pred %p1677, %p1675, %p1676;
- @%p1677 bra $L__BB0_1925;
-
- add.s32 %r8738, %r8739, %r2554;
- setp.lt.u32 %p1678, %r2553, 2;
- @%p1678 bra $L__BB0_1924;
-
- rem.u32 %r7872, %r2556, %r2548;
- add.s32 %r7873, %r7872, %r2552;
- sub.s32 %r7874, %r2556, %r7872;
- mad.lo.s32 %r8738, %r7874, %r2553, %r7873;
-
-$L__BB0_1924:
- mul.wide.s32 %rd2475, %r8738, 4;
- add.s64 %rd2477, %rd2474, %rd2475;
- ld.shared.f32 %f5095, [%rd602];
- ld.shared.f32 %f5096, [%rd2477];
- add.f32 %f5097, %f5096, %f5095;
- st.shared.f32 [%rd602], %f5097;
-
-$L__BB0_1925:
- bar.sync 0;
- setp.lt.s32 %p1679, %r8739, 4;
- @%p1679 bra $L__BB0_1931;
+ @%p1668 bra $L__BB0_1926;
+
+ neg.s32 %r7683, %r7672;
+ setp.ge.s32 %p1679, %r11, %r7683;
+ @%p1679 bra $L__BB0_1923;
+
+ st.global.f32 [%rd583+31752], %f6012;
+
+$L__BB0_1923:
+ mov.u32 %r7684, -32;
+ sub.s32 %r7685, %r7684, %r7672;
+ setp.ge.s32 %p1680, %r11, %r7685;
+ @%p1680 bra $L__BB0_1926;
+
+ st.global.f32 [%rd583+31880], %f6011;
$L__BB0_1926:
- shr.u32 %r2561, %r8739, 1;
- setp.ge.u32 %p1680, %r2550, %r2561;
- @%p1680 bra $L__BB0_1930;
-
- setp.lt.u32 %p1681, %r2553, 2;
- add.s32 %r8740, %r2561, %r2554;
- @%p1681 bra $L__BB0_1929;
-
- add.s32 %r7875, %r2561, %r2550;
- rem.u32 %r7876, %r7875, %r2548;
- add.s32 %r7877, %r7876, %r2552;
- sub.s32 %r7878, %r7875, %r7876;
- mad.lo.s32 %r8740, %r7878, %r2553, %r7877;
+ setp.lt.s32 %p1681, %r2498, 2;
+ and.pred %p1683, %p33, %p1681;
+ @%p1683 bra $L__BB0_1955;
+ bra.uni $L__BB0_1927;
+
+$L__BB0_1955:
+ add.f32 %f5181, %f2200, %f5599;
+ st.global.f32 [%rd583+42336], %f5181;
+ add.f32 %f5182, %f2200, %f5598;
+ st.global.f32 [%rd583+42464], %f5182;
+ add.f32 %f5183, %f2200, %f5597;
+ st.global.f32 [%rd583+52920], %f5183;
+ add.f32 %f5184, %f2200, %f5596;
+ st.global.f32 [%rd583+53048], %f5184;
+ add.f32 %f5185, %f2200, %f5595;
+ st.global.f32 [%rd583+63504], %f5185;
+ add.f32 %f5186, %f2200, %f5594;
+ st.global.f32 [%rd583+63632], %f5186;
+ add.f32 %f5187, %f2200, %f5593;
+ st.global.f32 [%rd583+74088], %f5187;
+ add.f32 %f5188, %f2200, %f5592;
+ st.global.f32 [%rd583+74216], %f5188;
+ bra.uni $L__BB0_1956;
+
+$L__BB0_1927:
+ add.s32 %r2521, %r2498, 4;
+ setp.gt.s32 %p1684, %r2521, 14;
+ @%p1684 bra $L__BB0_1929;
+
+ neg.s32 %r7687, %r7672;
+ setp.lt.s32 %p1685, %r11, %r7687;
+ add.f32 %f5173, %f2200, %f5599;
+ selp.f32 %f6034, %f5173, %f6034, %p1685;
+ mov.u32 %r7688, -32;
+ sub.s32 %r7689, %r7688, %r7672;
+ setp.lt.s32 %p1686, %r11, %r7689;
+ add.f32 %f5174, %f2200, %f5598;
+ selp.f32 %f6033, %f5174, %f6033, %p1686;
$L__BB0_1929:
- mul.wide.s32 %rd2478, %r8740, 4;
- add.s64 %rd2480, %rd2474, %rd2478;
- ld.shared.f32 %f5098, [%rd602];
- ld.shared.f32 %f5099, [%rd2480];
- add.f32 %f5100, %f5099, %f5098;
- st.shared.f32 [%rd602], %f5100;
-
-$L__BB0_1930:
- bar.sync 0;
- setp.gt.u32 %p1682, %r8739, 7;
- mov.u32 %r8739, %r2561;
- @%p1682 bra $L__BB0_1926;
+ add.s32 %r2522, %r2498, 5;
+ setp.gt.s32 %p1687, %r2522, 14;
+ @%p1687 bra $L__BB0_1931;
+
+ neg.s32 %r7691, %r7672;
+ setp.lt.s32 %p1688, %r11, %r7691;
+ add.f32 %f5175, %f2200, %f5597;
+ selp.f32 %f6016, %f5175, %f6016, %p1688;
+ mov.u32 %r7692, -32;
+ sub.s32 %r7693, %r7692, %r7672;
+ setp.lt.s32 %p1689, %r11, %r7693;
+ add.f32 %f5176, %f2200, %f5596;
+ selp.f32 %f6015, %f5176, %f6015, %p1689;
$L__BB0_1931:
- setp.ne.s32 %p1683, %r2547, 0;
- mov.f32 %f5910, 0f00000000;
- @%p1683 bra $L__BB0_1934;
-
- ld.shared.f32 %f5102, [%rd602];
- add.f32 %f5910, %f5102, 0f00000000;
- setp.lt.u32 %p1684, %r2549, 2;
- @%p1684 bra $L__BB0_1934;
-
- add.s32 %r7879, %r2554, 1;
- mul.wide.u32 %rd2481, %r7879, 4;
- add.s64 %rd2483, %rd2474, %rd2481;
- ld.shared.f32 %f5103, [%rd2483];
- add.f32 %f5910, %f5910, %f5103;
-
-$L__BB0_1934:
- bar.sync 0;
- mul.wide.s32 %rd2484, %r2551, 4;
- add.s64 %rd603, %rd2474, %rd2484;
- setp.eq.s32 %p1685, %r2547, 0;
- @%p1685 bra $L__BB0_1935;
- bra.uni $L__BB0_1936;
+ add.s32 %r2523, %r2498, 6;
+ setp.gt.s32 %p1690, %r2523, 14;
+ @%p1690 bra $L__BB0_1933;
+
+ neg.s32 %r7695, %r7672;
+ setp.lt.s32 %p1691, %r11, %r7695;
+ add.f32 %f5177, %f2200, %f5595;
+ selp.f32 %f6014, %f5177, %f6014, %p1691;
+ mov.u32 %r7696, -32;
+ sub.s32 %r7697, %r7696, %r7672;
+ setp.lt.s32 %p1692, %r11, %r7697;
+ add.f32 %f5178, %f2200, %f5594;
+ selp.f32 %f6013, %f5178, %f6013, %p1692;
+
+$L__BB0_1933:
+ add.s32 %r2524, %r2498, 7;
+ setp.gt.s32 %p1693, %r2524, 14;
+ @%p1693 bra $L__BB0_1935;
+
+ neg.s32 %r7699, %r7672;
+ setp.lt.s32 %p1694, %r11, %r7699;
+ add.f32 %f5179, %f2200, %f5593;
+ selp.f32 %f6012, %f5179, %f6012, %p1694;
+ mov.u32 %r7700, -32;
+ sub.s32 %r7701, %r7700, %r7672;
+ setp.lt.s32 %p1695, %r11, %r7701;
+ add.f32 %f5180, %f2200, %f5592;
+ selp.f32 %f6011, %f5180, %f6011, %p1695;
$L__BB0_1935:
- st.shared.f32 [%rd603], %f5910;
-
-$L__BB0_1936:
- bar.sync 0;
- ld.shared.f32 %f2168, [%rd603];
- bar.sync 0;
- @%p32 bra $L__BB0_1938;
-
- shl.b32 %r7880, %r12, 4;
- neg.s32 %r7881, %r7880;
- add.s32 %r7882, %r13, -12;
- setp.lt.s32 %p1687, %r7882, %r7881;
- @%p1687 bra $L__BB0_1962;
- bra.uni $L__BB0_1938;
-
-$L__BB0_1962:
- add.f32 %f5105, %f2168, %f5332;
- mov.u32 %r7920, %ctaid.x;
- mad.lo.s32 %r7921, %r7920, 63, %r1;
- mad.lo.s32 %r7922, %r13, 2646, %r7921;
- shl.b32 %r7923, %r12, 7;
- add.s32 %r7924, %r7922, %r7923;
- mul.wide.s32 %rd2487, %r7924, 4;
- add.s64 %rd2488, %rd4, %rd2487;
- st.global.f32 [%rd2488], %f5105;
- add.f32 %f5106, %f2168, %f5331;
- st.global.f32 [%rd2488+128], %f5106;
- add.f32 %f5107, %f2168, %f5330;
- st.global.f32 [%rd2488+10584], %f5107;
- add.f32 %f5108, %f2168, %f5329;
- st.global.f32 [%rd2488+10712], %f5108;
- add.f32 %f5109, %f2168, %f5328;
- st.global.f32 [%rd2488+21168], %f5109;
- add.f32 %f5110, %f2168, %f5327;
- st.global.f32 [%rd2488+21296], %f5110;
- add.f32 %f5111, %f2168, %f5326;
- st.global.f32 [%rd2488+31752], %f5111;
- add.f32 %f5112, %f2168, %f5325;
- st.global.f32 [%rd2488+31880], %f5112;
- bra.uni $L__BB0_1963;
+ @%p1684 bra $L__BB0_1940;
+
+ neg.s32 %r7702, %r7672;
+ setp.ge.s32 %p1697, %r11, %r7702;
+ @%p1697 bra $L__BB0_1938;
+
+ st.global.f32 [%rd583+42336], %f6034;
$L__BB0_1938:
- neg.s32 %r2565, %r2546;
- add.s32 %r7883, %r13, -15;
- setp.ge.s32 %p1688, %r7883, %r2565;
- add.f32 %f5926, %f2168, %f5332;
- add.f32 %f5925, %f2168, %f5331;
- add.f32 %f5924, %f2168, %f5330;
- add.f32 %f5923, %f2168, %f5329;
- add.f32 %f5922, %f2168, %f5328;
- add.f32 %f5921, %f2168, %f5327;
- add.f32 %f5920, %f2168, %f5326;
- add.f32 %f5919, %f2168, %f5325;
- mov.u32 %r7884, %ctaid.x;
- mad.lo.s32 %r7885, %r7884, 63, %r1;
- mad.lo.s32 %r7886, %r13, 2646, %r7885;
- shl.b32 %r2566, %r12, 7;
- add.s32 %r7887, %r7886, %r2566;
- mul.wide.s32 %rd2486, %r7887, 4;
- add.s64 %rd604, %rd4, %rd2486;
- @%p1688 bra $L__BB0_1941;
-
- neg.s32 %r7888, %r2566;
- setp.ge.s32 %p1689, %r14, %r7888;
- @%p1689 bra $L__BB0_1941;
-
- st.global.f32 [%rd604], %f5926;
-
-$L__BB0_1941:
- @%p1688 bra $L__BB0_1944;
-
- mov.u32 %r7891, -32;
- sub.s32 %r7892, %r7891, %r2566;
- setp.ge.s32 %p1691, %r14, %r7892;
- @%p1691 bra $L__BB0_1944;
-
- st.global.f32 [%rd604+128], %f5925;
-
-$L__BB0_1944:
- add.s32 %r7894, %r2565, -1;
- setp.ge.s32 %p1692, %r7883, %r7894;
- @%p1692 bra $L__BB0_1947;
-
- neg.s32 %r7896, %r2566;
- setp.ge.s32 %p1693, %r14, %r7896;
- @%p1693 bra $L__BB0_1947;
-
- st.global.f32 [%rd604+10584], %f5924;
-
-$L__BB0_1947:
- @%p1692 bra $L__BB0_1950;
-
- mov.u32 %r7900, -32;
- sub.s32 %r7901, %r7900, %r2566;
- setp.ge.s32 %p1695, %r14, %r7901;
- @%p1695 bra $L__BB0_1950;
-
- st.global.f32 [%rd604+10712], %f5923;
+ mov.u32 %r7703, -32;
+ sub.s32 %r7704, %r7703, %r7672;
+ setp.ge.s32 %p1698, %r11, %r7704;
+ @%p1698 bra $L__BB0_1940;
+
+ st.global.f32 [%rd583+42464], %f6033;
+
+$L__BB0_1940:
+ @%p1687 bra $L__BB0_1945;
+
+ neg.s32 %r7705, %r7672;
+ setp.ge.s32 %p1700, %r11, %r7705;
+ @%p1700 bra $L__BB0_1943;
+
+ st.global.f32 [%rd583+52920], %f6016;
+
+$L__BB0_1943:
+ mov.u32 %r7706, -32;
+ sub.s32 %r7707, %r7706, %r7672;
+ setp.ge.s32 %p1701, %r11, %r7707;
+ @%p1701 bra $L__BB0_1945;
+
+ st.global.f32 [%rd583+53048], %f6015;
+
+$L__BB0_1945:
+ @%p1690 bra $L__BB0_1950;
+
+ neg.s32 %r7708, %r7672;
+ setp.ge.s32 %p1703, %r11, %r7708;
+ @%p1703 bra $L__BB0_1948;
+
+ st.global.f32 [%rd583+63504], %f6014;
+
+$L__BB0_1948:
+ mov.u32 %r7709, -32;
+ sub.s32 %r7710, %r7709, %r7672;
+ setp.ge.s32 %p1704, %r11, %r7710;
+ @%p1704 bra $L__BB0_1950;
+
+ st.global.f32 [%rd583+63632], %f6013;
$L__BB0_1950:
- add.s32 %r7903, %r2565, -2;
- setp.ge.s32 %p1696, %r7883, %r7903;
- @%p1696 bra $L__BB0_1953;
-
- neg.s32 %r7905, %r2566;
- setp.ge.s32 %p1697, %r14, %r7905;
- @%p1697 bra $L__BB0_1953;
-
- st.global.f32 [%rd604+21168], %f5922;
+ @%p1693 bra $L__BB0_1956;
+
+ neg.s32 %r7711, %r7672;
+ setp.ge.s32 %p1706, %r11, %r7711;
+ @%p1706 bra $L__BB0_1953;
+
+ st.global.f32 [%rd583+74088], %f6012;
$L__BB0_1953:
- @%p1696 bra $L__BB0_1956;
-
- mov.u32 %r7909, -32;
- sub.s32 %r7910, %r7909, %r2566;
- setp.ge.s32 %p1699, %r14, %r7910;
- @%p1699 bra $L__BB0_1956;
-
- st.global.f32 [%rd604+21296], %f5921;
+ mov.u32 %r7712, -32;
+ sub.s32 %r7713, %r7712, %r7672;
+ setp.ge.s32 %p1707, %r11, %r7713;
+ @%p1707 bra $L__BB0_1956;
+
+ st.global.f32 [%rd583+74216], %f6011;
$L__BB0_1956:
- add.s32 %r7912, %r2565, -3;
- setp.ge.s32 %p1700, %r7883, %r7912;
- @%p1700 bra $L__BB0_1959;
-
- neg.s32 %r7914, %r2566;
- setp.ge.s32 %p1701, %r14, %r7914;
- @%p1701 bra $L__BB0_1959;
-
- st.global.f32 [%rd604+31752], %f5920;
+ setp.lt.s32 %p1708, %r2498, 1;
+ and.pred %p1710, %p33, %p1708;
+ @%p1710 bra $L__BB0_1985;
+ bra.uni $L__BB0_1957;
+
+$L__BB0_1985:
+ add.f32 %f5197, %f2200, %f5798;
+ st.global.f32 [%rd583+84672], %f5197;
+ add.f32 %f5198, %f2200, %f5797;
+ st.global.f32 [%rd583+84800], %f5198;
+ add.f32 %f5199, %f2200, %f5796;
+ st.global.f32 [%rd583+95256], %f5199;
+ add.f32 %f5200, %f2200, %f5795;
+ st.global.f32 [%rd583+95384], %f5200;
+ add.f32 %f5201, %f2200, %f5794;
+ st.global.f32 [%rd583+105840], %f5201;
+ add.f32 %f5202, %f2200, %f5793;
+ st.global.f32 [%rd583+105968], %f5202;
+ add.f32 %f5203, %f2200, %f5792;
+ st.global.f32 [%rd583+116424], %f5203;
+ add.f32 %f5204, %f2200, %f5791;
+ st.global.f32 [%rd583+116552], %f5204;
+ bra.uni $L__BB0_1986;
+
+$L__BB0_1957:
+ add.s32 %r2529, %r2498, 8;
+ setp.gt.s32 %p1711, %r2529, 14;
+ @%p1711 bra $L__BB0_1959;
+
+ neg.s32 %r7715, %r7672;
+ setp.lt.s32 %p1712, %r11, %r7715;
+ add.f32 %f5189, %f2200, %f5798;
+ selp.f32 %f6034, %f5189, %f6034, %p1712;
+ mov.u32 %r7716, -32;
+ sub.s32 %r7717, %r7716, %r7672;
+ setp.lt.s32 %p1713, %r11, %r7717;
+ add.f32 %f5190, %f2200, %f5797;
+ selp.f32 %f6033, %f5190, %f6033, %p1713;
$L__BB0_1959:
- @%p1700 bra $L__BB0_1963;
-
- mov.u32 %r7918, -32;
- sub.s32 %r7919, %r7918, %r2566;
- setp.ge.s32 %p1703, %r14, %r7919;
- @%p1703 bra $L__BB0_1963;
-
- st.global.f32 [%rd604+31880], %f5919;
+ add.s32 %r2530, %r2498, 9;
+ setp.gt.s32 %p1714, %r2530, 14;
+ @%p1714 bra $L__BB0_1961;
+
+ neg.s32 %r7719, %r7672;
+ setp.lt.s32 %p1715, %r11, %r7719;
+ add.f32 %f5191, %f2200, %f5796;
+ selp.f32 %f6016, %f5191, %f6016, %p1715;
+ mov.u32 %r7720, -32;
+ sub.s32 %r7721, %r7720, %r7672;
+ setp.lt.s32 %p1716, %r11, %r7721;
+ add.f32 %f5192, %f2200, %f5795;
+ selp.f32 %f6015, %f5192, %f6015, %p1716;
+
+$L__BB0_1961:
+ add.s32 %r2531, %r2498, 10;
+ setp.gt.s32 %p1717, %r2531, 14;
+ @%p1717 bra $L__BB0_1963;
+
+ neg.s32 %r7723, %r7672;
+ setp.lt.s32 %p1718, %r11, %r7723;
+ add.f32 %f5193, %f2200, %f5794;
+ selp.f32 %f6014, %f5193, %f6014, %p1718;
+ mov.u32 %r7724, -32;
+ sub.s32 %r7725, %r7724, %r7672;
+ setp.lt.s32 %p1719, %r11, %r7725;
+ add.f32 %f5194, %f2200, %f5793;
+ selp.f32 %f6013, %f5194, %f6013, %p1719;
$L__BB0_1963:
- @%p32 bra $L__BB0_1965;
-
- shl.b32 %r7925, %r12, 4;
- mov.u32 %r7926, -4;
- sub.s32 %r7927, %r7926, %r7925;
- add.s32 %r7928, %r13, -12;
- setp.lt.s32 %p1705, %r7928, %r7927;
- @%p1705 bra $L__BB0_1989;
- bra.uni $L__BB0_1965;
+ add.s32 %r2532, %r2498, 11;
+ setp.gt.s32 %p1720, %r2532, 14;
+ @%p1720 bra $L__BB0_1965;
+
+ neg.s32 %r7727, %r7672;
+ setp.lt.s32 %p1721, %r11, %r7727;
+ add.f32 %f5195, %f2200, %f5792;
+ selp.f32 %f6012, %f5195, %f6012, %p1721;
+ mov.u32 %r7728, -32;
+ sub.s32 %r7729, %r7728, %r7672;
+ setp.lt.s32 %p1722, %r11, %r7729;
+ add.f32 %f5196, %f2200, %f5791;
+ selp.f32 %f6011, %f5196, %f6011, %p1722;
+
+$L__BB0_1965:
+ @%p1711 bra $L__BB0_1970;
+
+ neg.s32 %r7730, %r7672;
+ setp.ge.s32 %p1724, %r11, %r7730;
+ @%p1724 bra $L__BB0_1968;
+
+ st.global.f32 [%rd583+84672], %f6034;
+
+$L__BB0_1968:
+ mov.u32 %r7731, -32;
+ sub.s32 %r7732, %r7731, %r7672;
+ setp.ge.s32 %p1725, %r11, %r7732;
+ @%p1725 bra $L__BB0_1970;
+
+ st.global.f32 [%rd583+84800], %f6033;
+
+$L__BB0_1970:
+ @%p1714 bra $L__BB0_1975;
+
+ neg.s32 %r7733, %r7672;
+ setp.ge.s32 %p1727, %r11, %r7733;
+ @%p1727 bra $L__BB0_1973;
+
+ st.global.f32 [%rd583+95256], %f6016;
+
+$L__BB0_1973:
+ mov.u32 %r7734, -32;
+ sub.s32 %r7735, %r7734, %r7672;
+ setp.ge.s32 %p1728, %r11, %r7735;
+ @%p1728 bra $L__BB0_1975;
+
+ st.global.f32 [%rd583+95384], %f6015;
+
+$L__BB0_1975:
+ @%p1717 bra $L__BB0_1980;
+
+ neg.s32 %r7736, %r7672;
+ setp.ge.s32 %p1730, %r11, %r7736;
+ @%p1730 bra $L__BB0_1978;
+
+ st.global.f32 [%rd583+105840], %f6014;
+
+$L__BB0_1978:
+ mov.u32 %r7737, -32;
+ sub.s32 %r7738, %r7737, %r7672;
+ setp.ge.s32 %p1731, %r11, %r7738;
+ @%p1731 bra $L__BB0_1980;
+
+ st.global.f32 [%rd583+105968], %f6013;
+
+$L__BB0_1980:
+ @%p1720 bra $L__BB0_1986;
+
+ neg.s32 %r7739, %r7672;
+ setp.ge.s32 %p1733, %r11, %r7739;
+ @%p1733 bra $L__BB0_1983;
+
+ st.global.f32 [%rd583+116424], %f6012;
+
+$L__BB0_1983:
+ mov.u32 %r7740, -32;
+ sub.s32 %r7741, %r7740, %r7672;
+ setp.ge.s32 %p1734, %r11, %r7741;
+ @%p1734 bra $L__BB0_1986;
+
+ st.global.f32 [%rd583+116552], %f6011;
+
+$L__BB0_1986:
+ setp.lt.s32 %p1735, %r2498, 0;
+ and.pred %p1737, %p33, %p1735;
+ @%p1737 bra $L__BB0_2015;
+ bra.uni $L__BB0_1987;
+
+$L__BB0_2015:
+ add.f32 %f5213, %f2200, %f5997;
+ st.global.f32 [%rd583+127008], %f5213;
+ add.f32 %f5214, %f2200, %f5996;
+ st.global.f32 [%rd583+127136], %f5214;
+ add.f32 %f5215, %f2200, %f5995;
+ st.global.f32 [%rd583+137592], %f5215;
+ add.f32 %f5216, %f2200, %f5994;
+ st.global.f32 [%rd583+137720], %f5216;
+ add.f32 %f5217, %f2200, %f5993;
+ st.global.f32 [%rd583+148176], %f5217;
+ add.f32 %f5218, %f2200, %f5992;
+ st.global.f32 [%rd583+148304], %f5218;
+ add.f32 %f5219, %f2200, %f5991;
+ st.global.f32 [%rd583+158760], %f5219;
+ add.f32 %f5220, %f2200, %f5990;
+ st.global.f32 [%rd583+158888], %f5220;
+ bra.uni $L__BB0_2016;
+
+$L__BB0_1987:
+ add.s32 %r2537, %r2498, 12;
+ setp.gt.s32 %p1738, %r2537, 14;
+ @%p1738 bra $L__BB0_1989;
+
+ neg.s32 %r7743, %r7672;
+ setp.lt.s32 %p1739, %r11, %r7743;
+ add.f32 %f5205, %f2200, %f5997;
+ selp.f32 %f6034, %f5205, %f6034, %p1739;
+ mov.u32 %r7744, -32;
+ sub.s32 %r7745, %r7744, %r7672;
+ setp.lt.s32 %p1740, %r11, %r7745;
+ add.f32 %f5206, %f2200, %f5996;
+ selp.f32 %f6033, %f5206, %f6033, %p1740;
$L__BB0_1989:
- add.f32 %f5121, %f2168, %f5523;
- mov.u32 %r7962, %ctaid.x;
- mad.lo.s32 %r7963, %r7962, 63, %r1;
- mad.lo.s32 %r7964, %r13, 2646, %r7963;
- shl.b32 %r7965, %r12, 7;
- add.s32 %r7966, %r7964, %r7965;
- add.s32 %r7967, %r7966, 10584;
- mul.wide.s32 %rd2490, %r7967, 4;
- add.s64 %rd2491, %rd4, %rd2490;
- st.global.f32 [%rd2491], %f5121;
- add.f32 %f5122, %f2168, %f5522;
- st.global.f32 [%rd2491+128], %f5122;
- add.f32 %f5123, %f2168, %f5521;
- st.global.f32 [%rd2491+10584], %f5123;
- add.f32 %f5124, %f2168, %f5520;
- st.global.f32 [%rd2491+10712], %f5124;
- add.f32 %f5125, %f2168, %f5519;
- st.global.f32 [%rd2491+21168], %f5125;
- add.f32 %f5126, %f2168, %f5518;
- st.global.f32 [%rd2491+21296], %f5126;
- add.f32 %f5127, %f2168, %f5517;
- st.global.f32 [%rd2491+31752], %f5127;
- add.f32 %f5128, %f2168, %f5516;
- st.global.f32 [%rd2491+31880], %f5128;
- bra.uni $L__BB0_1990;
-
-$L__BB0_1965:
- mov.u32 %r7929, -4;
- sub.s32 %r2567, %r7929, %r2546;
- add.s32 %r7930, %r13, -15;
- setp.ge.s32 %p1706, %r7930, %r2567;
- setp.lt.s32 %p1707, %r7930, %r2567;
- shl.b32 %r7931, %r12, 7;
- neg.s32 %r2568, %r7931;
- setp.lt.s32 %p1708, %r14, %r2568;
- and.pred %p1709, %p1707, %p1708;
- add.f32 %f5113, %f2168, %f5523;
- selp.f32 %f5926, %f5113, %f5926, %p1709;
- mov.u32 %r7932, -32;
- sub.s32 %r7933, %r7932, %r7931;
- setp.lt.s32 %p1710, %r14, %r7933;
- and.pred %p1711, %p1707, %p1710;
- add.f32 %f5114, %f2168, %f5522;
- selp.f32 %f5925, %f5114, %f5925, %p1711;
- mov.u32 %r7934, -5;
- sub.s32 %r7935, %r7934, %r2546;
- setp.lt.s32 %p1712, %r7930, %r7935;
- and.pred %p1713, %p1712, %p1708;
- add.f32 %f5115, %f2168, %f5521;
- selp.f32 %f5924, %f5115, %f5924, %p1713;
- and.pred %p1714, %p1712, %p1710;
- add.f32 %f5116, %f2168, %f5520;
- selp.f32 %f5923, %f5116, %f5923, %p1714;
- mov.u32 %r7936, -6;
- sub.s32 %r7937, %r7936, %r2546;
- setp.lt.s32 %p1715, %r7930, %r7937;
- and.pred %p1716, %p1715, %p1708;
- add.f32 %f5117, %f2168, %f5519;
- selp.f32 %f5922, %f5117, %f5922, %p1716;
- and.pred %p1717, %p1715, %p1710;
- add.f32 %f5118, %f2168, %f5518;
- selp.f32 %f5921, %f5118, %f5921, %p1717;
- mov.u32 %r7938, -7;
- sub.s32 %r7939, %r7938, %r2546;
- setp.lt.s32 %p1718, %r7930, %r7939;
- and.pred %p1719, %p1718, %p1708;
- add.f32 %f5119, %f2168, %f5517;
- selp.f32 %f5920, %f5119, %f5920, %p1719;
- and.pred %p1720, %p1718, %p1710;
- add.f32 %f5120, %f2168, %f5516;
- selp.f32 %f5919, %f5120, %f5919, %p1720;
- mov.u32 %r7940, %ctaid.x;
- mad.lo.s32 %r7941, %r7940, 63, %r1;
- mad.lo.s32 %r7942, %r13, 2646, %r7941;
- add.s32 %r7943, %r7942, %r7931;
- add.s32 %r7944, %r7943, 10584;
- mul.wide.s32 %rd2489, %r7944, 4;
- add.s64 %rd605, %rd4, %rd2489;
- @%p1706 bra $L__BB0_1968;
-
- setp.ge.s32 %p1721, %r14, %r2568;
- @%p1721 bra $L__BB0_1968;
-
- st.global.f32 [%rd605], %f5926;
-
-$L__BB0_1968:
- @%p1706 bra $L__BB0_1971;
-
- add.s32 %r7946, %r2568, -32;
- setp.ge.s32 %p1723, %r14, %r7946;
- @%p1723 bra $L__BB0_1971;
-
- st.global.f32 [%rd605+128], %f5925;
-
-$L__BB0_1971:
- add.s32 %r7948, %r2567, -1;
- setp.ge.s32 %p1724, %r7930, %r7948;
- @%p1724 bra $L__BB0_1974;
-
- setp.ge.s32 %p1725, %r14, %r2568;
- @%p1725 bra $L__BB0_1974;
-
- st.global.f32 [%rd605+10584], %f5924;
-
-$L__BB0_1974:
- @%p1724 bra $L__BB0_1977;
-
- add.s32 %r7951, %r2568, -32;
- setp.ge.s32 %p1727, %r14, %r7951;
- @%p1727 bra $L__BB0_1977;
-
- st.global.f32 [%rd605+10712], %f5923;
-
-$L__BB0_1977:
- add.s32 %r7953, %r2567, -2;
- setp.ge.s32 %p1728, %r7930, %r7953;
- @%p1728 bra $L__BB0_1980;
-
- setp.ge.s32 %p1729, %r14, %r2568;
- @%p1729 bra $L__BB0_1980;
-
- st.global.f32 [%rd605+21168], %f5922;
-
-$L__BB0_1980:
- @%p1728 bra $L__BB0_1983;
-
- add.s32 %r7956, %r2568, -32;
- setp.ge.s32 %p1731, %r14, %r7956;
- @%p1731 bra $L__BB0_1983;
-
- st.global.f32 [%rd605+21296], %f5921;
-
-$L__BB0_1983:
- add.s32 %r7958, %r2567, -3;
- setp.ge.s32 %p1732, %r7930, %r7958;
- @%p1732 bra $L__BB0_1986;
-
- setp.ge.s32 %p1733, %r14, %r2568;
- @%p1733 bra $L__BB0_1986;
-
- st.global.f32 [%rd605+31752], %f5920;
-
-$L__BB0_1986:
- @%p1732 bra $L__BB0_1990;
-
- add.s32 %r7961, %r2568, -32;
- setp.ge.s32 %p1735, %r14, %r7961;
- @%p1735 bra $L__BB0_1990;
-
- st.global.f32 [%rd605+31880], %f5919;
-
-$L__BB0_1990:
- @%p32 bra $L__BB0_1992;
-
- shl.b32 %r7968, %r12, 4;
- mov.u32 %r7969, -8;
- sub.s32 %r7970, %r7969, %r7968;
- add.s32 %r7971, %r13, -12;
- setp.lt.s32 %p1737, %r7971, %r7970;
- @%p1737 bra $L__BB0_2016;
- bra.uni $L__BB0_1992;
+ add.s32 %r2538, %r2498, 13;
+ setp.gt.s32 %p1741, %r2538, 14;
+ @%p1741 bra $L__BB0_1991;
+
+ neg.s32 %r7747, %r7672;
+ setp.lt.s32 %p1742, %r11, %r7747;
+ add.f32 %f5207, %f2200, %f5995;
+ selp.f32 %f6016, %f5207, %f6016, %p1742;
+ mov.u32 %r7748, -32;
+ sub.s32 %r7749, %r7748, %r7672;
+ setp.lt.s32 %p1743, %r11, %r7749;
+ add.f32 %f5208, %f2200, %f5994;
+ selp.f32 %f6015, %f5208, %f6015, %p1743;
+
+$L__BB0_1991:
+ add.s32 %r2539, %r2498, 14;
+ setp.gt.s32 %p1744, %r2539, 14;
+ @%p1744 bra $L__BB0_1993;
+
+ neg.s32 %r7751, %r7672;
+ setp.lt.s32 %p1745, %r11, %r7751;
+ add.f32 %f5209, %f2200, %f5993;
+ selp.f32 %f6014, %f5209, %f6014, %p1745;
+ mov.u32 %r7752, -32;
+ sub.s32 %r7753, %r7752, %r7672;
+ setp.lt.s32 %p1746, %r11, %r7753;
+ add.f32 %f5210, %f2200, %f5992;
+ selp.f32 %f6013, %f5210, %f6013, %p1746;
+
+$L__BB0_1993:
+ add.s32 %r2540, %r2498, 15;
+ setp.gt.s32 %p1747, %r2540, 14;
+ @%p1747 bra $L__BB0_1995;
+
+ neg.s32 %r7755, %r7672;
+ setp.lt.s32 %p1748, %r11, %r7755;
+ add.f32 %f5211, %f2200, %f5991;
+ selp.f32 %f6012, %f5211, %f6012, %p1748;
+ mov.u32 %r7756, -32;
+ sub.s32 %r7757, %r7756, %r7672;
+ setp.lt.s32 %p1749, %r11, %r7757;
+ add.f32 %f5212, %f2200, %f5990;
+ selp.f32 %f6011, %f5212, %f6011, %p1749;
+
+$L__BB0_1995:
+ @%p1738 bra $L__BB0_2000;
+
+ neg.s32 %r7758, %r7672;
+ setp.ge.s32 %p1751, %r11, %r7758;
+ @%p1751 bra $L__BB0_1998;
+
+ st.global.f32 [%rd583+127008], %f6034;
+
+$L__BB0_1998:
+ mov.u32 %r7759, -32;
+ sub.s32 %r7760, %r7759, %r7672;
+ setp.ge.s32 %p1752, %r11, %r7760;
+ @%p1752 bra $L__BB0_2000;
+
+ st.global.f32 [%rd583+127136], %f6033;
+
+$L__BB0_2000:
+ @%p1741 bra $L__BB0_2005;
+
+ neg.s32 %r7761, %r7672;
+ setp.ge.s32 %p1754, %r11, %r7761;
+ @%p1754 bra $L__BB0_2003;
+
+ st.global.f32 [%rd583+137592], %f6016;
+
+$L__BB0_2003:
+ mov.u32 %r7762, -32;
+ sub.s32 %r7763, %r7762, %r7672;
+ setp.ge.s32 %p1755, %r11, %r7763;
+ @%p1755 bra $L__BB0_2005;
+
+ st.global.f32 [%rd583+137720], %f6015;
+
+$L__BB0_2005:
+ @%p1744 bra $L__BB0_2010;
+
+ neg.s32 %r7764, %r7672;
+ setp.ge.s32 %p1757, %r11, %r7764;
+ @%p1757 bra $L__BB0_2008;
+
+ st.global.f32 [%rd583+148176], %f6014;
+
+$L__BB0_2008:
+ mov.u32 %r7765, -32;
+ sub.s32 %r7766, %r7765, %r7672;
+ setp.ge.s32 %p1758, %r11, %r7766;
+ @%p1758 bra $L__BB0_2010;
+
+ st.global.f32 [%rd583+148304], %f6013;
+
+$L__BB0_2010:
+ @%p1747 bra $L__BB0_2016;
+
+ neg.s32 %r7767, %r7672;
+ setp.ge.s32 %p1760, %r11, %r7767;
+ @%p1760 bra $L__BB0_2013;
+
+ st.global.f32 [%rd583+158760], %f6012;
+
+$L__BB0_2013:
+ mov.u32 %r7768, -32;
+ sub.s32 %r7769, %r7768, %r7672;
+ setp.ge.s32 %p1761, %r11, %r7769;
+ @%p1761 bra $L__BB0_2016;
+
+ st.global.f32 [%rd583+158888], %f6011;
$L__BB0_2016:
- add.f32 %f5137, %f2168, %f5714;
- mov.u32 %r8005, %ctaid.x;
- mad.lo.s32 %r8006, %r8005, 63, %r1;
- mad.lo.s32 %r8007, %r13, 2646, %r8006;
- shl.b32 %r8008, %r12, 7;
- add.s32 %r8009, %r8007, %r8008;
- add.s32 %r8010, %r8009, 21168;
- mul.wide.s32 %rd2493, %r8010, 4;
- add.s64 %rd2494, %rd4, %rd2493;
- st.global.f32 [%rd2494], %f5137;
- add.f32 %f5138, %f2168, %f5713;
- st.global.f32 [%rd2494+128], %f5138;
- add.f32 %f5139, %f2168, %f5712;
- st.global.f32 [%rd2494+10584], %f5139;
- add.f32 %f5140, %f2168, %f5711;
- st.global.f32 [%rd2494+10712], %f5140;
- add.f32 %f5141, %f2168, %f5710;
- st.global.f32 [%rd2494+21168], %f5141;
- add.f32 %f5142, %f2168, %f5709;
- st.global.f32 [%rd2494+21296], %f5142;
- add.f32 %f5143, %f2168, %f5708;
- st.global.f32 [%rd2494+31752], %f5143;
- add.f32 %f5144, %f2168, %f5707;
- st.global.f32 [%rd2494+31880], %f5144;
- bra.uni $L__BB0_2017;
-
-$L__BB0_1992:
- mov.u32 %r7972, -8;
- sub.s32 %r2569, %r7972, %r2546;
- add.s32 %r7973, %r13, -15;
- setp.ge.s32 %p1738, %r7973, %r2569;
- setp.lt.s32 %p1739, %r7973, %r2569;
- shl.b32 %r7974, %r12, 7;
- neg.s32 %r2570, %r7974;
- setp.lt.s32 %p1740, %r14, %r2570;
- and.pred %p1741, %p1739, %p1740;
- add.f32 %f5129, %f2168, %f5714;
- selp.f32 %f5926, %f5129, %f5926, %p1741;
- mov.u32 %r7975, -32;
- sub.s32 %r7976, %r7975, %r7974;
- setp.lt.s32 %p1742, %r14, %r7976;
- and.pred %p1743, %p1739, %p1742;
- add.f32 %f5130, %f2168, %f5713;
- selp.f32 %f5925, %f5130, %f5925, %p1743;
- mov.u32 %r7977, -9;
- sub.s32 %r7978, %r7977, %r2546;
- setp.lt.s32 %p1744, %r7973, %r7978;
- and.pred %p1745, %p1744, %p1740;
- add.f32 %f5131, %f2168, %f5712;
- selp.f32 %f5924, %f5131, %f5924, %p1745;
- and.pred %p1746, %p1744, %p1742;
- add.f32 %f5132, %f2168, %f5711;
- selp.f32 %f5923, %f5132, %f5923, %p1746;
- mov.u32 %r7979, -10;
- sub.s32 %r7980, %r7979, %r2546;
- setp.lt.s32 %p1747, %r7973, %r7980;
- and.pred %p1748, %p1747, %p1740;
- add.f32 %f5133, %f2168, %f5710;
- selp.f32 %f5922, %f5133, %f5922, %p1748;
- and.pred %p1749, %p1747, %p1742;
- add.f32 %f5134, %f2168, %f5709;
- selp.f32 %f5921, %f5134, %f5921, %p1749;
- mov.u32 %r7981, -11;
- sub.s32 %r7982, %r7981, %r2546;
- setp.lt.s32 %p1750, %r7973, %r7982;
- and.pred %p1751, %p1750, %p1740;
- add.f32 %f5135, %f2168, %f5708;
- selp.f32 %f5920, %f5135, %f5920, %p1751;
- and.pred %p1752, %p1750, %p1742;
- add.f32 %f5136, %f2168, %f5707;
- selp.f32 %f5919, %f5136, %f5919, %p1752;
- mov.u32 %r7983, %ctaid.x;
- mad.lo.s32 %r7984, %r7983, 63, %r1;
- mad.lo.s32 %r7985, %r13, 2646, %r7984;
- add.s32 %r7986, %r7985, %r7974;
- add.s32 %r7987, %r7986, 21168;
- mul.wide.s32 %rd2492, %r7987, 4;
- add.s64 %rd606, %rd4, %rd2492;
- @%p1738 bra $L__BB0_1995;
-
- setp.ge.s32 %p1753, %r14, %r2570;
- @%p1753 bra $L__BB0_1995;
-
- st.global.f32 [%rd606], %f5926;
-
-$L__BB0_1995:
- @%p1738 bra $L__BB0_1998;
-
- add.s32 %r7989, %r2570, -32;
- setp.ge.s32 %p1755, %r14, %r7989;
- @%p1755 bra $L__BB0_1998;
-
- st.global.f32 [%rd606+128], %f5925;
-
-$L__BB0_1998:
- add.s32 %r7991, %r2569, -1;
- setp.ge.s32 %p1756, %r7973, %r7991;
- @%p1756 bra $L__BB0_2001;
-
- setp.ge.s32 %p1757, %r14, %r2570;
- @%p1757 bra $L__BB0_2001;
-
- st.global.f32 [%rd606+10584], %f5924;
-
-$L__BB0_2001:
- @%p1756 bra $L__BB0_2004;
-
- add.s32 %r7994, %r2570, -32;
- setp.ge.s32 %p1759, %r14, %r7994;
- @%p1759 bra $L__BB0_2004;
-
- st.global.f32 [%rd606+10712], %f5923;
-
-$L__BB0_2004:
- add.s32 %r7996, %r2569, -2;
- setp.ge.s32 %p1760, %r7973, %r7996;
- @%p1760 bra $L__BB0_2007;
-
- setp.ge.s32 %p1761, %r14, %r2570;
- @%p1761 bra $L__BB0_2007;
-
- st.global.f32 [%rd606+21168], %f5922;
-
-$L__BB0_2007:
- @%p1760 bra $L__BB0_2010;
-
- add.s32 %r7999, %r2570, -32;
- setp.ge.s32 %p1763, %r14, %r7999;
- @%p1763 bra $L__BB0_2010;
-
- st.global.f32 [%rd606+21296], %f5921;
-
-$L__BB0_2010:
- add.s32 %r8001, %r2569, -3;
- setp.ge.s32 %p1764, %r7973, %r8001;
- @%p1764 bra $L__BB0_2013;
-
- setp.ge.s32 %p1765, %r14, %r2570;
- @%p1765 bra $L__BB0_2013;
-
- st.global.f32 [%rd606+31752], %f5920;
-
-$L__BB0_2013:
- @%p1764 bra $L__BB0_2017;
-
- add.s32 %r8004, %r2570, -32;
- setp.ge.s32 %p1767, %r14, %r8004;
- @%p1767 bra $L__BB0_2017;
-
- st.global.f32 [%rd606+31880], %f5919;
-
-$L__BB0_2017:
- @%p32 bra $L__BB0_2019;
-
- shl.b32 %r8011, %r12, 4;
- mov.u32 %r8012, -12;
- sub.s32 %r8013, %r8012, %r8011;
- add.s32 %r8014, %r13, -12;
- setp.lt.s32 %p1769, %r8014, %r8013;
- @%p1769 bra $L__BB0_2043;
- bra.uni $L__BB0_2019;
-
-$L__BB0_2043:
- add.f32 %f5153, %f2168, %f5905;
- mov.u32 %r8034, %ctaid.x;
- mad.lo.s32 %r8035, %r8034, 63, %r1;
- mad.lo.s32 %r8036, %r13, 2646, %r8035;
- shl.b32 %r8037, %r12, 7;
- add.s32 %r8038, %r8036, %r8037;
- add.s32 %r8039, %r8038, 31752;
- mul.wide.s32 %rd2496, %r8039, 4;
- add.s64 %rd2497, %rd4, %rd2496;
- st.global.f32 [%rd2497], %f5153;
- add.f32 %f5154, %f2168, %f5904;
- st.global.f32 [%rd2497+128], %f5154;
- add.f32 %f5155, %f2168, %f5903;
- st.global.f32 [%rd2497+10584], %f5155;
- add.f32 %f5156, %f2168, %f5902;
- st.global.f32 [%rd2497+10712], %f5156;
- add.f32 %f5157, %f2168, %f5901;
- st.global.f32 [%rd2497+21168], %f5157;
- add.f32 %f5158, %f2168, %f5900;
- st.global.f32 [%rd2497+21296], %f5158;
- add.f32 %f5159, %f2168, %f5899;
- st.global.f32 [%rd2497+31752], %f5159;
- add.f32 %f5160, %f2168, %f5898;
- st.global.f32 [%rd2497+31880], %f5160;
- bra.uni $L__BB0_2044;
-
-$L__BB0_2019:
- mov.u32 %r8015, -12;
- sub.s32 %r2571, %r8015, %r2546;
- add.s32 %r8016, %r13, -15;
- mov.u32 %r8017, -15;
- setp.ge.s32 %p1770, %r8016, %r2571;
- setp.lt.s32 %p1771, %r8016, %r2571;
- shl.b32 %r8018, %r12, 7;
- neg.s32 %r2572, %r8018;
- setp.lt.s32 %p1772, %r14, %r2572;
- and.pred %p1773, %p1771, %p1772;
- add.f32 %f5145, %f2168, %f5905;
- selp.f32 %f2217, %f5145, %f5926, %p1773;
- mov.u32 %r8019, -32;
- sub.s32 %r2573, %r8019, %r8018;
- setp.lt.s32 %p1774, %r14, %r2573;
- and.pred %p1775, %p1771, %p1774;
- add.f32 %f5146, %f2168, %f5904;
- selp.f32 %f2218, %f5146, %f5925, %p1775;
- mov.u32 %r8020, -13;
- sub.s32 %r2574, %r8020, %r2546;
- setp.lt.s32 %p1776, %r8016, %r2574;
- and.pred %p1777, %p1776, %p1772;
- add.f32 %f5147, %f2168, %f5903;
- selp.f32 %f2219, %f5147, %f5924, %p1777;
- and.pred %p1778, %p1776, %p1774;
- add.f32 %f5148, %f2168, %f5902;
- selp.f32 %f2220, %f5148, %f5923, %p1778;
- mov.u32 %r8021, -14;
- sub.s32 %r2575, %r8021, %r2546;
- setp.lt.s32 %p1779, %r8016, %r2575;
- and.pred %p1780, %p1779, %p1772;
- add.f32 %f5149, %f2168, %f5901;
- selp.f32 %f2221, %f5149, %f5922, %p1780;
- and.pred %p1781, %p1779, %p1774;
- add.f32 %f5150, %f2168, %f5900;
- selp.f32 %f2222, %f5150, %f5921, %p1781;
- sub.s32 %r2576, %r8017, %r2546;
- setp.lt.s32 %p1782, %r8016, %r2576;
- and.pred %p1783, %p1782, %p1772;
- add.f32 %f5151, %f2168, %f5899;
- selp.f32 %f2223, %f5151, %f5920, %p1783;
- and.pred %p1784, %p1782, %p1774;
- add.f32 %f5152, %f2168, %f5898;
- selp.f32 %f2224, %f5152, %f5919, %p1784;
- mov.u32 %r8022, %ctaid.x;
- mad.lo.s32 %r8023, %r8022, 63, %r1;
- mad.lo.s32 %r8024, %r13, 2646, %r8023;
- add.s32 %r8025, %r8024, %r8018;
- add.s32 %r8026, %r8025, 31752;
- mul.wide.s32 %rd2495, %r8026, 4;
- add.s64 %rd607, %rd4, %rd2495;
- @%p1770 bra $L__BB0_2022;
-
- setp.ge.s32 %p1785, %r14, %r2572;
- @%p1785 bra $L__BB0_2022;
-
- st.global.f32 [%rd607], %f2217;
-
-$L__BB0_2022:
- @%p1770 bra $L__BB0_2025;
-
- setp.ge.s32 %p1787, %r14, %r2573;
- @%p1787 bra $L__BB0_2025;
-
- st.global.f32 [%rd607+128], %f2218;
-
-$L__BB0_2025:
- setp.ge.s32 %p1788, %r8016, %r2574;
- @%p1788 bra $L__BB0_2028;
-
- setp.ge.s32 %p1789, %r14, %r2572;
- @%p1789 bra $L__BB0_2028;
-
- st.global.f32 [%rd607+10584], %f2219;
-
-$L__BB0_2028:
- @%p1788 bra $L__BB0_2031;
-
- setp.ge.s32 %p1791, %r14, %r2573;
- @%p1791 bra $L__BB0_2031;
-
- st.global.f32 [%rd607+10712], %f2220;
-
-$L__BB0_2031:
- setp.ge.s32 %p1792, %r8016, %r2575;
- @%p1792 bra $L__BB0_2034;
-
- setp.ge.s32 %p1793, %r14, %r2572;
- @%p1793 bra $L__BB0_2034;
-
- st.global.f32 [%rd607+21168], %f2221;
-
-$L__BB0_2034:
- @%p1792 bra $L__BB0_2037;
-
- setp.ge.s32 %p1795, %r14, %r2573;
- @%p1795 bra $L__BB0_2037;
-
- st.global.f32 [%rd607+21296], %f2222;
-
-$L__BB0_2037:
- setp.ge.s32 %p1796, %r8016, %r2576;
- @%p1796 bra $L__BB0_2040;
-
- setp.ge.s32 %p1797, %r14, %r2572;
- @%p1797 bra $L__BB0_2040;
-
- st.global.f32 [%rd607+31752], %f2223;
-
-$L__BB0_2040:
- @%p1796 bra $L__BB0_2044;
-
- setp.ge.s32 %p1799, %r14, %r2573;
- @%p1799 bra $L__BB0_2044;
-
- st.global.f32 [%rd607+31880], %f2224;
-
-$L__BB0_2044:
ret;
}
8: GpuViewTest.FusionIssue2076
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 30
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<bool, 4, 4> T0, Tensor<float, 3, 3> T1, Tensor<float, 4, 4> T2, Tensor<float, 4, 4> T28, Tensor<float, 3, 3> T24) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Tensor<bool, 4, 4> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 4, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[3LL];
Array<nvfuser_index_t, 4, 1> a3;
a3 = s0.logical_size;
nvfuser_index_t i4;
i4 = a3[2LL];
Tensor<float, 3, 3> s5;
s5.data = T1.data;
s5.logical_size = T1.logical_size;
s5.alloc_stride = T1.alloc_stride;
Array<nvfuser_index_t, 3, 1> a6;
a6 = s5.logical_size;
nvfuser_index_t i7;
i7 = a6[0LL];
nvfuser_index_t i8;
i8 = ceilDiv(i7, 4);
nvfuser_index_t i9;
i9 = i8 * 4;
Array<float, 4, 1> T13;
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
Array<float, 4, 4> T26;
T26.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[0], &T1[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * ((nvfuser_index_t)blockIdx.x)))]);
Array<float, 4, 4> T27;
T27.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::AllLevels>(&T27[0], &T2[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))]);
Array<bool, 4, 4> T25;
T25.set(bool(0));
loadGlobalToLocal<bool, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::AllLevels>(&T25[0], &T0[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))]);
// Alias Allocation - register
auto& T29 = T27;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T3;
T3[0]
= (float)(T25[i10]);
Array<float, 1, 1> T5;
T5[0]
= T3[0]
* (float) 1.00000000000000000e+00;
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
- T5[0];
Array<bool, 1, 1> T7;
T7[0]
= (bool)(T6[0]);
Array<float, 1, 1> T8;
T8[0]
= T7[0] ? (float) -3.40282000000000014e+38 : T6[0];
Array<float, 1, 1> T4;
T4[0]
= T26[i10];
Array<float, 1, 1> T9;
T9[0]
= T8[0]
+ T27[i10];
T29[i10]
= T9[0];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
+ T11[0];
T13[i10]
= T12[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))], &T29[0]);
} else {
Array<float, 4, 4> T26;
T26.set(float(0));
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[0], &T1[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 4, 4> T27;
T27.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::AllLevels>(&T27[0], &T2[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))]);
}
Array<bool, 4, 4> T25;
T25.set(bool(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadGlobalToLocal<bool, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::AllLevels>(&T25[0], &T0[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))]);
}
// Alias Allocation - register
auto& T29 = T27;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T3;
T3[0]
= (float)(T25[i10]);
Array<float, 1, 1> T5;
T5[0]
= T3[0]
* (float) 1.00000000000000000e+00;
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
- T5[0];
Array<bool, 1, 1> T7;
T7[0]
= (bool)(T6[0]);
Array<float, 1, 1> T8;
T8[0]
= T7[0] ? (float) -3.40282000000000014e+38 : T6[0];
Array<float, 1, 1> T4;
T4[0]
= T26[i10];
Array<float, 1, 1> T9;
T9[0]
= T8[0]
+ T27[i10];
T29[i10]
= T9[0];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
+ T11[0];
T13[i10]
= T12[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))], &T29[0]);
}
}
Array<float, 1, 1> T14;
T14[0] = NEG_INFINITY;
Array<float, 1, 1> T31;
T31[0] = NEG_INFINITY;
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
T31[0] = fmax(
T31[0],
T13[i11]);
}
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
if ((((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
T31[0] = fmax(
T31[0],
T13[i11]);
}
}
}
warp::warpReduceTIDX<false, true>(T14[0], T31[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, static_cast<float>(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T17;
T17[0]
= T15[0];
Array<float, 4, 1> T19;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 4; ++i12) {
Array<float, 1, 1> T18;
T18[0]
= T13[i12]
- T17[0];
T19[i12]
= expf(T18[0]);
}
Array<float, 1, 1> T20;
T20[0] = 0.000000000e+00f;
Array<float, 1, 1> T32;
T32[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
T32[0]
= T32[0]
+ T19[i13];
}
} else {
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
if ((((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
T32[0]
= T32[0]
+ T19[i13];
}
}
}
warp::warpReduceTIDX<false, true>(T20[0], T32[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T21;
broadcast::blockBroadcast<true, false, false, true>(T21[0], T20[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T23;
T23[0]
= T21[0];
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
Array<float, 4, 4> T30;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 4; ++i14) {
T30[i14]
= T19[i14]
/ T23[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T24[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * ((nvfuser_index_t)blockIdx.x)))], &T30[0]);
} else {
Array<float, 4, 4> T30;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 4; ++i14) {
T30[i14]
= T19[i14]
/ T23[0];
}
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T24[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * ((nvfuser_index_t)blockIdx.x)))], &T30[0]);
}
}
}
// NVFuser-generated fused kernel.
//
// Inputs:
//   T0  - bool mask tensor (4D), loaded 4 elements at a time per thread
//   T1  - float tensor (3D), loaded with a linear (threadIdx.y-based) index
//   T2  - float tensor (4D), loaded with a div/mod re-linearized index
// Outputs:
//   T28 - intermediate result (T8 + T27, i.e. masked fill added to T2)
//   T24 - normalized result: exp(x - max(x)) / sum(exp(x - max(x))) along
//         the vectorized/TIDx dimension (softmax-style normalization, as
//         shown by the fmax reduction, expf, sum reduction and divide below)
//
// Each thread processes a vector of 4 contiguous floats (vec_size=4 loads/
// stores). Predicates repeatedly check:
//   threadIdx.x < ceilDiv(i2, 4)            - thread maps to a valid vector
//   3 + 4*threadIdx.x < i2                  - whole 4-wide vector in bounds
//   blockIdx.x < ceilDiv(4*i8*i4, blockDim.y) and
//   threadIdx.y + blockDim.y*blockIdx.x < 4*i8*i4  - row in bounds
__global__ void nvfuser_N(Tensor<bool, 4, 4> T0, Tensor<float, 3, 3> T1, Tensor<float, 4, 4> T2, Tensor<float, 4, 4> T28, Tensor<float, 3, 3> T24) {
// Dynamic shared memory, used by the warp reductions and block broadcasts.
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
// Extract extents from T0's logical size: i2 = innermost extent,
// i4 = extent of dimension 2.
Tensor<bool, 4, 4> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 4, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[3LL];
Array<nvfuser_index_t, 4, 1> a3;
a3 = s0.logical_size;
nvfuser_index_t i4;
i4 = a3[2LL];
// i7 = T1's outermost extent; i8 = ceilDiv(i7, 4).
Tensor<float, 3, 3> s5;
s5.data = T1.data;
s5.logical_size = T1.logical_size;
s5.alloc_stride = T1.alloc_stride;
Array<nvfuser_index_t, 3, 1> a6;
a6 = s5.logical_size;
nvfuser_index_t i7;
i7 = a6[0LL];
nvfuser_index_t i8;
i8 = ceilDiv(i7, 4);
// i9 is computed but not referenced later in this kernel body.
nvfuser_index_t i9;
i9 = i8 * 4;
// Per-thread accumulator of the 4 fused-compute results; consumed by the
// reductions below, so it lives outside the if/else.
Array<float, 4, 1> T13;
// Fast path: when the full predicate holds, all loads/stores run unguarded.
// (Note: this predicate spells the bound as (4*i4)*i8 while the slow-path
// predicates use (4*i8)*i4 — same value, different association.)
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i4) * i8), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i4) * i8)))) {
// T26 <- T1, 4-wide vectorized load with streaming cache hint,
// linear index (i2 * global row).
Array<float, 4, 4> T26;
T26.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[0], &T1[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * ((nvfuser_index_t)blockIdx.x)))]);
// T27 <- T2, index re-linearized via row % i4 and (row / i4) / i8.
Array<float, 4, 4> T27;
T27.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::AllLevels>(&T27[0], &T2[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))]);
// T25 <- T0 (bool mask), same re-linearized index as T27.
Array<bool, 4, 4> T25;
T25.set(bool(0));
loadGlobalToLocal<bool, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::AllLevels>(&T25[0], &T0[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))]);
// Alias Allocation - register
// T29 reuses T27's registers; the store into T29[i10] below overwrites
// T27[i10] after T27[i10] has been consumed in the same iteration.
auto& T29 = T27;
// Fused pointwise compute over the 4 vector lanes:
//   T6 = 1 - (float)mask; T8 = (T6 != 0) ? ~-FLT_MAX : T6 (masked fill);
//   T9 = T8 + T27 (written out as T28 via T29); T13 = T26 + T9.
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T3;
T3[0]
= (float)(T25[i10]);
Array<float, 1, 1> T5;
T5[0]
= T3[0]
* (float) 1.00000000000000000e+00;
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
- T5[0];
Array<bool, 1, 1> T7;
T7[0]
= (bool)(T6[0]);
Array<float, 1, 1> T8;
T8[0]
= T7[0] ? (float) -3.40282000000000014e+38 : T6[0];
Array<float, 1, 1> T4;
T4[0]
= T26[i10];
Array<float, 1, 1> T9;
T9[0]
= T8[0]
+ T27[i10];
T29[i10]
= T9[0];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
+ T11[0];
T13[i10]
= T12[0];
}
// Write the intermediate T9 values (aliased into T29) out to T28.
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))], &T29[0]);
} else {
// Slow path: identical computation, but every global access is
// individually guarded (out-of-bounds lanes keep the zero-initialized
// buffer contents).
Array<float, 4, 4> T26;
T26.set(float(0));
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[0], &T1[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 4, 4> T27;
T27.set(float(0));
// NOTE(review): this guard (and the T25/T28 guards below) omits the
// blockIdx.x < ceilDiv(...) term present in the T26 guard; the row bound
// check that follows still holds.
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::AllLevels>(&T27[0], &T2[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))]);
}
Array<bool, 4, 4> T25;
T25.set(bool(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadGlobalToLocal<bool, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::AllLevels>(&T25[0], &T0[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))]);
}
// Alias Allocation - register
auto& T29 = T27;
// Same fused pointwise compute as the fast path.
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T3;
T3[0]
= (float)(T25[i10]);
Array<float, 1, 1> T5;
T5[0]
= T3[0]
* (float) 1.00000000000000000e+00;
Array<float, 1, 1> T6;
T6[0]
= (float) 1.00000000000000000e+00
- T5[0];
Array<bool, 1, 1> T7;
T7[0]
= (bool)(T6[0]);
Array<float, 1, 1> T8;
T8[0]
= T7[0] ? (float) -3.40282000000000014e+38 : T6[0];
Array<float, 1, 1> T4;
T4[0]
= T26[i10];
Array<float, 1, 1> T9;
T9[0]
= T8[0]
+ T27[i10];
T29[i10]
= T9[0];
Array<float, 1, 1> T10;
T10[0]
= T9[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0];
Array<float, 1, 1> T12;
T12[0]
= T4[0]
+ T11[0];
T13[i10]
= T12[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % i4))) + ((i2 * i4) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / i4) / i8)))], &T29[0]);
}
}
// ---- Max reduction over the 4 per-thread values, then across TIDx ----
// T31 is the per-thread serial fmax over T13; out-of-bounds threads keep
// the NEG_INFINITY identity so they don't affect the warp reduction.
Array<float, 1, 1> T14;
T14[0] = NEG_INFINITY;
Array<float, 1, 1> T31;
T31[0] = NEG_INFINITY;
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
T31[0] = fmax(
T31[0],
T13[i11]);
}
} else {
// Predicated variant: the same check repeated per element (hoisting is
// left to the compiler).
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
if ((((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
T31[0] = fmax(
T31[0],
T13[i11]);
}
}
}
// Warp max-reduce along TIDx into T14, then broadcast to all threads as T15.
warp::warpReduceTIDX<false, true>(T14[0], T31[0], [](float &a, float b) { a = fmax(a, b); }, static_cast<float*>(shared_mem), true, static_cast<float>(NEG_INFINITY), DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T17;
T17[0]
= T15[0];
// T19 = exp(T13 - max) per lane.
Array<float, 4, 1> T19;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 4; ++i12) {
Array<float, 1, 1> T18;
T18[0]
= T13[i12]
- T17[0];
T19[i12]
= expf(T18[0]);
}
// ---- Sum reduction of T19, mirroring the max reduction above ----
Array<float, 1, 1> T20;
T20[0] = 0.000000000e+00f;
Array<float, 1, 1> T32;
T32[0] = 0.000000000e+00f;
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
T32[0]
= T32[0]
+ T19[i13];
}
} else {
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 4; ++i13) {
if ((((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
T32[0]
= T32[0]
+ T19[i13];
}
}
}
// Warp sum-reduce along TIDx into T20, broadcast as T21.
warp::warpReduceTIDX<false, true>(T20[0], T32[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T21;
broadcast::blockBroadcast<true, false, false, true>(T21[0], T20[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T23;
T23[0]
= T21[0];
// Final output: T24 = exp(x - max) / sum, stored with the linear index.
// Fast path stores unconditionally; slow path re-checks before the store.
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
Array<float, 4, 4> T30;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 4; ++i14) {
T30[i14]
= T19[i14]
/ T23[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T24[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * ((nvfuser_index_t)blockIdx.x)))], &T30[0]);
} else {
Array<float, 4, 4> T30;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 4; ++i14) {
T30[i14]
= T19[i14]
/ T23[0];
}
if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T24[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * ((nvfuser_index_t)blockIdx.x)))], &T30[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -24,11 +24,11 @@
nvfuser_index_t i8;
i8 = ceilDiv(i7, 4);
nvfuser_index_t i9;
i9 = i8 * 4;
Array<float, 4, 1> T13;
- if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i8) * i4), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i8) * i4)))) {
+ if (((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((nvfuser_index_t)blockIdx.x) < (ceilDiv(((4 * i4) * i8), ((nvfuser_index_t)blockDim.y))))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < ((4 * i4) * i8)))) {
Array<float, 4, 4> T26;
T26.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[0], &T1[(((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * ((nvfuser_index_t)blockIdx.x)))]);
Array<float, 4, 4> T27;
T27.set(float(0));
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_2[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_3[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_4[32]
)
{
.reg .pred %p<144>;
.reg .b16 %rs<32>;
.reg .f32 %f<219>;
.reg .b32 %r<319>;
.reg .b64 %rd<36>;
ld.param.v2.u32 {%r66, %r67}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_0+16];
ld.param.v2.u32 {%r68, %r69}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_1+8];
ld.param.u64 %rd9, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_4];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_3];
ld.param.u64 %rd4, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_2];
ld.param.u64 %rd10, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_0];
cvta.to.global.u64 %rd1, %rd10;
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_1911011nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_1];
add.s32 %r77, %r68, 3;
shr.s32 %r78, %r77, 31;
shr.u32 %r79, %r78, 30;
add.s32 %r80, %r77, %r79;
shr.s32 %r2, %r80, 2;
add.s32 %r81, %r67, 3;
shr.s32 %r82, %r81, 31;
shr.u32 %r83, %r82, 30;
add.s32 %r84, %r81, %r83;
shr.s32 %r85, %r84, 2;
mov.u32 %r4, %tid.x;
setp.lt.s32 %p2, %r4, %r85;
shl.b32 %r5, %r4, 2;
or.b32 %r86, %r5, 3;
setp.lt.s32 %p3, %r86, %r67;
and.pred %p1, %p3, %p2;
not.pred %p4, %p1;
@%p4 bra $L__BB0_3;
mov.u32 %r6, %ctaid.x;
mul.lo.s32 %r87, %r2, %r66;
shl.b32 %r7, %r87, 2;
mov.u32 %r8, %ntid.y;
add.s32 %r88, %r8, %r7;
add.s32 %r89, %r88, -1;
div.s32 %r90, %r89, %r8;
setp.ge.s32 %p5, %r6, %r90;
@%p5 bra $L__BB0_3;
mov.u32 %r91, %tid.y;
mad.lo.s32 %r9, %r8, %r6, %r91;
setp.lt.s32 %p6, %r9, %r7;
@%p6 bra $L__BB0_16;
bra.uni $L__BB0_3;
$L__BB0_16:
mad.lo.s32 %r179, %r9, %r67, %r5;
mul.wide.s32 %rd22, %r179, 4;
add.s64 %rd19, %rd2, %rd22;
// begin inline asm
ld.global.cs.v4.u32 {%r167,%r168,%r169,%r170}, [%rd19];
// end inline asm
div.s32 %r180, %r9, %r66;
div.s32 %r181, %r180, %r2;
mul.lo.s32 %r182, %r180, %r66;
sub.s32 %r183, %r9, %r182;
mad.lo.s32 %r184, %r181, %r66, %r183;
mad.lo.s32 %r185, %r184, %r67, %r5;
cvt.s64.s32 %rd23, %r185;
mul.wide.s32 %rd24, %r185, 4;
add.s64 %rd20, %rd4, %rd24;
// begin inline asm
ld.global.ca.v4.u32 {%r171,%r172,%r173,%r174}, [%rd20];
// end inline asm
add.s64 %rd25, %rd1, %rd23;
ld.global.u32 %r186, [%rd25];
cvt.u16.u32 %rs21, %r186;
cvt.s16.s8 %rs22, %rs21;
shr.s16 %rs23, %rs21, 8;
shr.u32 %r187, %r186, 16;
cvt.u16.u32 %rs24, %r187;
cvt.s16.s8 %rs25, %rs24;
shr.u32 %r188, %r186, 24;
cvt.u16.u32 %rs26, %r188;
cvt.s16.s8 %rs27, %rs26;
cvt.rn.f32.s16 %f75, %rs22;
mov.f32 %f76, 0f3F800000;
sub.f32 %f77, %f76, %f75;
setp.neu.f32 %p20, %f77, 0f00000000;
selp.f32 %f78, 0fFF7FFFEE, %f77, %p20;
mov.b32 %f79, %r167;
mov.b32 %f80, %r171;
add.f32 %f81, %f78, %f80;
mov.b32 %r175, %f81;
add.f32 %f203, %f81, %f79;
cvt.rn.f32.s16 %f82, %rs23;
sub.f32 %f83, %f76, %f82;
setp.neu.f32 %p21, %f83, 0f00000000;
selp.f32 %f84, 0fFF7FFFEE, %f83, %p21;
mov.b32 %f85, %r168;
mov.b32 %f86, %r172;
add.f32 %f87, %f84, %f86;
mov.b32 %r176, %f87;
add.f32 %f204, %f87, %f85;
cvt.rn.f32.s16 %f88, %rs25;
sub.f32 %f89, %f76, %f88;
setp.neu.f32 %p22, %f89, 0f00000000;
selp.f32 %f90, 0fFF7FFFEE, %f89, %p22;
mov.b32 %f91, %r169;
mov.b32 %f92, %r173;
add.f32 %f93, %f90, %f92;
mov.b32 %r177, %f93;
add.f32 %f205, %f93, %f91;
cvt.rn.f32.s16 %f94, %rs27;
sub.f32 %f95, %f76, %f94;
setp.neu.f32 %p23, %f95, 0f00000000;
selp.f32 %f96, 0fFF7FFFEE, %f95, %p23;
mov.b32 %f97, %r170;
mov.b32 %f98, %r174;
add.f32 %f99, %f96, %f98;
mov.b32 %r178, %f99;
add.f32 %f206, %f99, %f97;
add.s64 %rd21, %rd3, %rd24;
// begin inline asm
st.global.cs.v4.s32 [%rd21], {%r175,%r176,%r177,%r178};
// end inline asm
bra.uni $L__BB0_17;
$L__BB0_3:
mov.u32 %r315, 0;
mov.u32 %r311, %r315;
mov.u32 %r312, %r315;
mov.u32 %r313, %r315;
mov.u32 %r314, %r315;
@%p4 bra $L__BB0_7;
mov.u32 %r10, %ctaid.x;
mul.lo.s32 %r100, %r2, %r66;
shl.b32 %r11, %r100, 2;
mov.u32 %r12, %ntid.y;
add.s32 %r101, %r12, %r11;
add.s32 %r102, %r101, -1;
div.s32 %r103, %r102, %r12;
setp.ge.s32 %p8, %r10, %r103;
@%p8 bra $L__BB0_7;
mov.u32 %r108, %tid.y;
mad.lo.s32 %r13, %r12, %r10, %r108;
setp.ge.s32 %p9, %r13, %r11;
mov.u32 %r311, %r315;
mov.u32 %r312, %r315;
mov.u32 %r313, %r315;
mov.u32 %r314, %r315;
@%p9 bra $L__BB0_7;
mad.lo.s32 %r113, %r13, %r67, %r5;
mul.wide.s32 %rd12, %r113, 4;
add.s64 %rd11, %rd2, %rd12;
// begin inline asm
ld.global.cs.v4.u32 {%r311,%r312,%r313,%r314}, [%rd11];
// end inline asm
$L__BB0_7:
mov.u32 %r316, %r315;
mov.u32 %r317, %r315;
mov.u32 %r318, %r315;
@%p4 bra $L__BB0_10;
mov.u32 %r122, %tid.y;
mov.u32 %r123, %ctaid.x;
mov.u32 %r124, %ntid.y;
mad.lo.s32 %r22, %r124, %r123, %r122;
mul.lo.s32 %r125, %r66, %r2;
shl.b32 %r126, %r125, 2;
setp.ge.s32 %p11, %r22, %r126;
mov.u32 %r316, %r315;
mov.u32 %r317, %r315;
mov.u32 %r318, %r315;
@%p11 bra $L__BB0_10;
div.s32 %r131, %r22, %r66;
mul.lo.s32 %r132, %r131, %r66;
sub.s32 %r133, %r22, %r132;
div.s32 %r134, %r131, %r2;
mad.lo.s32 %r135, %r134, %r66, %r133;
mad.lo.s32 %r136, %r135, %r67, %r5;
mul.wide.s32 %rd14, %r136, 4;
add.s64 %rd13, %rd4, %rd14;
// begin inline asm
ld.global.ca.v4.u32 {%r318,%r317,%r316,%r315}, [%rd13];
// end inline asm
$L__BB0_10:
mov.u16 %rs28, 0;
mov.u16 %rs29, %rs28;
mov.u16 %rs30, %rs28;
mov.u16 %rs31, %rs28;
@%p4 bra $L__BB0_13;
mov.u32 %r137, %tid.y;
mov.u32 %r138, %ctaid.x;
mov.u32 %r139, %ntid.y;
mad.lo.s32 %r31, %r139, %r138, %r137;
mul.lo.s32 %r140, %r66, %r2;
shl.b32 %r141, %r140, 2;
setp.ge.s32 %p13, %r31, %r141;
@%p13 bra $L__BB0_13;
div.s32 %r142, %r31, %r66;
mul.lo.s32 %r143, %r142, %r66;
sub.s32 %r144, %r31, %r143;
div.s32 %r145, %r142, %r2;
mad.lo.s32 %r146, %r145, %r66, %r144;
mad.lo.s32 %r147, %r146, %r67, %r5;
cvt.s64.s32 %rd15, %r147;
add.s64 %rd16, %rd1, %rd15;
ld.global.u32 %r148, [%rd16];
cvt.u16.u32 %rs31, %r148;
shr.u32 %r149, %r148, 8;
cvt.u16.u32 %rs30, %r149;
shr.u32 %r150, %r148, 16;
cvt.u16.u32 %rs29, %r150;
shr.u32 %r151, %r148, 24;
cvt.u16.u32 %rs28, %r151;
$L__BB0_13:
cvt.s16.s8 %rs17, %rs31;
cvt.rn.f32.s16 %f54, %rs17;
mov.f32 %f55, 0f3F800000;
sub.f32 %f56, %f55, %f54;
setp.neu.f32 %p14, %f56, 0f00000000;
selp.f32 %f57, 0fFF7FFFEE, %f56, %p14;
mov.b32 %f58, %r318;
add.f32 %f1, %f57, %f58;
mov.b32 %f59, %r311;
add.f32 %f203, %f1, %f59;
cvt.s16.s8 %rs18, %rs30;
cvt.rn.f32.s16 %f60, %rs18;
sub.f32 %f61, %f55, %f60;
setp.neu.f32 %p15, %f61, 0f00000000;
selp.f32 %f62, 0fFF7FFFEE, %f61, %p15;
mov.b32 %f63, %r317;
add.f32 %f3, %f62, %f63;
mov.b32 %f64, %r312;
add.f32 %f204, %f3, %f64;
cvt.s16.s8 %rs19, %rs29;
cvt.rn.f32.s16 %f65, %rs19;
sub.f32 %f66, %f55, %f65;
setp.neu.f32 %p16, %f66, 0f00000000;
selp.f32 %f67, 0fFF7FFFEE, %f66, %p16;
mov.b32 %f68, %r316;
add.f32 %f5, %f67, %f68;
mov.b32 %f69, %r313;
add.f32 %f205, %f5, %f69;
cvt.s16.s8 %rs20, %rs28;
cvt.rn.f32.s16 %f70, %rs20;
sub.f32 %f71, %f55, %f70;
setp.neu.f32 %p17, %f71, 0f00000000;
selp.f32 %f72, 0fFF7FFFEE, %f71, %p17;
mov.b32 %f73, %r315;
add.f32 %f7, %f72, %f73;
mov.b32 %f74, %r314;
add.f32 %f206, %f7, %f74;
@%p4 bra $L__BB0_17;
mov.u32 %r152, %tid.y;
mov.u32 %r153, %ctaid.x;
mov.u32 %r154, %ntid.y;
mad.lo.s32 %r32, %r154, %r153, %r152;
mul.lo.s32 %r155, %r66, %r2;
shl.b32 %r156, %r155, 2;
setp.ge.s32 %p19, %r32, %r156;
@%p19 bra $L__BB0_17;
div.s32 %r161, %r32, %r66;
mul.lo.s32 %r162, %r161, %r66;
sub.s32 %r163, %r32, %r162;
div.s32 %r164, %r161, %r2;
mad.lo.s32 %r165, %r164, %r66, %r163;
mad.lo.s32 %r166, %r165, %r67, %r5;
mul.wide.s32 %rd18, %r166, 4;
add.s64 %rd17, %rd3, %rd18;
mov.b32 %r157, %f1;
mov.b32 %r158, %f3;
mov.b32 %r159, %f5;
mov.b32 %r160, %f7;
// begin inline asm
st.global.cs.v4.s32 [%rd17], {%r157,%r158,%r159,%r160};
// end inline asm
$L__BB0_17:
@%p1 bra $L__BB0_18;
bra.uni $L__BB0_20;
$L__BB0_18:
mov.u32 %r33, %ctaid.x;
mul.lo.s32 %r189, %r2, %r66;
shl.b32 %r34, %r189, 2;
mov.u32 %r35, %ntid.y;
add.s32 %r190, %r35, %r34;
add.s32 %r191, %r190, -1;
div.s32 %r192, %r191, %r35;
setp.ge.s32 %p24, %r33, %r192;
@%p24 bra $L__BB0_20;
mov.u32 %r193, %tid.y;
mad.lo.s32 %r194, %r35, %r33, %r193;
setp.lt.s32 %p25, %r194, %r34;
@%p25 bra $L__BB0_31;
bra.uni $L__BB0_20;
$L__BB0_31:
setp.nan.f32 %p51, %f203, %f203;
setp.gt.f32 %p52, %f203, %f204;
or.pred %p53, %p51, %p52;
selp.f32 %f101, %f203, %f204, %p53;
setp.nan.f32 %p54, %f101, %f101;
setp.gt.f32 %p55, %f101, %f205;
or.pred %p56, %p54, %p55;
selp.f32 %f102, %f101, %f205, %p56;
setp.nan.f32 %p57, %f102, %f102;
setp.gt.f32 %p58, %f102, %f206;
or.pred %p59, %p57, %p58;
selp.f32 %f209, %f102, %f206, %p59;
bra.uni $L__BB0_32;
$L__BB0_20:
mov.u32 %r36, %ctaid.x;
mul.lo.s32 %r195, %r2, %r66;
shl.b32 %r37, %r195, 2;
mov.u32 %r39, %ntid.y;
add.s32 %r196, %r39, %r37;
add.s32 %r38, %r196, -1;
mov.u32 %r197, %tid.y;
mad.lo.s32 %r40, %r39, %r36, %r197;
mov.f32 %f209, 0fFF800000;
@%p4 bra $L__BB0_22;
div.s32 %r198, %r38, %r39;
setp.ge.s32 %p27, %r36, %r198;
setp.ge.s32 %p28, %r40, %r37;
or.pred %p29, %p27, %p28;
selp.f32 %f209, 0fFF800000, %f203, %p29;
$L__BB0_22:
@%p4 bra $L__BB0_25;
div.s32 %r199, %r38, %r39;
setp.ge.s32 %p31, %r36, %r199;
setp.ge.s32 %p32, %r40, %r37;
or.pred %p33, %p31, %p32;
@%p33 bra $L__BB0_25;
setp.nan.f32 %p34, %f209, %f209;
setp.gt.f32 %p35, %f209, %f204;
or.pred %p36, %p34, %p35;
selp.f32 %f209, %f209, %f204, %p36;
$L__BB0_25:
@%p4 bra $L__BB0_28;
div.s32 %r200, %r38, %r39;
setp.ge.s32 %p38, %r36, %r200;
setp.ge.s32 %p39, %r40, %r37;
or.pred %p40, %p38, %p39;
@%p40 bra $L__BB0_28;
setp.nan.f32 %p41, %f209, %f209;
setp.gt.f32 %p42, %f209, %f205;
or.pred %p43, %p41, %p42;
selp.f32 %f209, %f209, %f205, %p43;
$L__BB0_28:
@%p4 bra $L__BB0_32;
div.s32 %r201, %r38, %r39;
setp.ge.s32 %p45, %r36, %r201;
setp.ge.s32 %p46, %r40, %r37;
or.pred %p47, %p45, %p46;
@%p47 bra $L__BB0_32;
setp.nan.f32 %p48, %f209, %f209;
setp.gt.f32 %p49, %f209, %f206;
or.pred %p50, %p48, %p49;
selp.f32 %f209, %f209, %f206, %p50;
$L__BB0_32:
mov.b32 %r202, %f209;
mov.u32 %r203, 31;
mov.u32 %r204, 16;
mov.u32 %r205, -1;
shfl.sync.bfly.b32 %r206|%p60, %r202, %r204, %r203, %r205;
mov.b32 %f103, %r206;
setp.gt.f32 %p61, %f209, %f103;
setp.nan.f32 %p62, %f209, %f209;
or.pred %p63, %p62, %p61;
selp.f32 %f104, %f209, %f103, %p63;
mov.b32 %r207, %f104;
mov.u32 %r208, 8;
shfl.sync.bfly.b32 %r209|%p64, %r207, %r208, %r203, %r205;
mov.b32 %f105, %r209;
setp.nan.f32 %p65, %f104, %f104;
setp.gt.f32 %p66, %f104, %f105;
or.pred %p67, %p65, %p66;
selp.f32 %f106, %f104, %f105, %p67;
mov.b32 %r210, %f106;
mov.u32 %r211, 4;
shfl.sync.bfly.b32 %r212|%p68, %r210, %r211, %r203, %r205;
mov.b32 %f107, %r212;
setp.nan.f32 %p69, %f106, %f106;
setp.gt.f32 %p70, %f106, %f107;
or.pred %p71, %p69, %p70;
selp.f32 %f108, %f106, %f107, %p71;
mov.b32 %r213, %f108;
mov.u32 %r214, 2;
shfl.sync.bfly.b32 %r215|%p72, %r213, %r214, %r203, %r205;
mov.b32 %f109, %r215;
setp.nan.f32 %p73, %f108, %f108;
setp.gt.f32 %p74, %f108, %f109;
or.pred %p75, %p73, %p74;
selp.f32 %f110, %f108, %f109, %p75;
mov.b32 %r216, %f110;
mov.u32 %r217, 1;
shfl.sync.bfly.b32 %r218|%p76, %r216, %r217, %r203, %r205;
mov.b32 %f111, %r218;
setp.nan.f32 %p77, %f110, %f110;
setp.gt.f32 %p78, %f110, %f111;
or.pred %p79, %p77, %p78;
selp.f32 %f212, %f110, %f111, %p79;
mov.u32 %r219, %tid.z;
mov.u32 %r41, %ntid.y;
mov.u32 %r42, %tid.y;
mad.lo.s32 %r43, %r219, %r41, %r42;
and.b32 %r44, %r4, 31;
setp.ne.s32 %p80, %r44, 0;
mov.u32 %r220, %ntid.x;
shr.u32 %r45, %r220, 5;
mul.lo.s32 %r46, %r43, %r45;
bar.sync 0;
shr.u32 %r47, %r4, 5;
add.s32 %r221, %r46, %r47;
mul.wide.u32 %rd26, %r221, 4;
mov.u64 %rd27, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_aaf28710_191105arrayE;
add.s64 %rd5, %rd27, %rd26;
@%p80 bra $L__BB0_34;
st.shared.f32 [%rd5], %f212;
$L__BB0_34:
bar.sync 0;
add.s32 %r222, %r46, %r44;
mul.wide.u32 %rd28, %r222, 4;
add.s64 %rd6, %rd27, %rd28;
setp.ne.s32 %p81, %r47, 0;
@%p81 bra $L__BB0_38;
setp.ge.u32 %p82, %r44, %r45;
mov.f32 %f211, 0fFF800000;
@%p82 bra $L__BB0_37;
ld.shared.f32 %f211, [%rd6];
$L__BB0_37:
mov.b32 %r223, %f211;
mov.u32 %r224, 31;
mov.u32 %r225, 16;
mov.u32 %r226, -1;
shfl.sync.bfly.b32 %r227|%p83, %r223, %r225, %r224, %r226;
mov.b32 %f113, %r227;
setp.gt.f32 %p84, %f211, %f113;
setp.nan.f32 %p85, %f211, %f211;
or.pred %p86, %p85, %p84;
selp.f32 %f114, %f211, %f113, %p86;
mov.b32 %r228, %f114;
mov.u32 %r229, 8;
shfl.sync.bfly.b32 %r230|%p87, %r228, %r229, %r224, %r226;
mov.b32 %f115, %r230;
setp.nan.f32 %p88, %f114, %f114;
setp.gt.f32 %p89, %f114, %f115;
or.pred %p90, %p88, %p89;
selp.f32 %f116, %f114, %f115, %p90;
mov.b32 %r231, %f116;
mov.u32 %r232, 4;
shfl.sync.bfly.b32 %r233|%p91, %r231, %r232, %r224, %r226;
mov.b32 %f117, %r233;
setp.nan.f32 %p92, %f116, %f116;
setp.gt.f32 %p93, %f116, %f117;
or.pred %p94, %p92, %p93;
selp.f32 %f118, %f116, %f117, %p94;
mov.b32 %r234, %f118;
mov.u32 %r235, 2;
shfl.sync.bfly.b32 %r236|%p95, %r234, %r235, %r224, %r226;
mov.b32 %f119, %r236;
setp.nan.f32 %p96, %f118, %f118;
setp.gt.f32 %p97, %f118, %f119;
or.pred %p98, %p96, %p97;
selp.f32 %f120, %f118, %f119, %p98;
mov.b32 %r237, %f120;
mov.u32 %r238, 1;
shfl.sync.bfly.b32 %r239|%p99, %r237, %r238, %r224, %r226;
mov.b32 %f121, %r239;
setp.nan.f32 %p100, %f120, %f120;
setp.gt.f32 %p101, %f120, %f121;
or.pred %p102, %p100, %p101;
selp.f32 %f212, %f120, %f121, %p102;
$L__BB0_38:
bar.sync 0;
mul.wide.s32 %rd30, %r43, 4;
add.s64 %rd7, %rd27, %rd30;
setp.eq.s32 %p103, %r4, 0;
@%p103 bra $L__BB0_39;
bra.uni $L__BB0_40;
$L__BB0_39:
setp.eq.s32 %p104, %r44, 0;
selp.f32 %f122, %f212, 0fFF800000, %p104;
st.shared.f32 [%rd7], %f122;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f123, [%rd7];
bar.sync 0;
sub.f32 %f124, %f203, %f123;
mov.f32 %f125, 0f3F000000;
mov.f32 %f126, 0f3BBB989D;
fma.rn.f32 %f127, %f124, %f126, %f125;
cvt.sat.f32.f32 %f128, %f127;
mov.f32 %f129, 0f4B400001;
mov.f32 %f130, 0f437C0000;
fma.rm.f32 %f131, %f128, %f130, %f129;
add.f32 %f132, %f131, 0fCB40007F;
neg.f32 %f133, %f132;
mov.f32 %f134, 0f3FB8AA3B;
fma.rn.f32 %f135, %f124, %f134, %f133;
mov.f32 %f136, 0f32A57060;
fma.rn.f32 %f137, %f124, %f136, %f135;
mov.b32 %r240, %f131;
shl.b32 %r241, %r240, 23;
mov.b32 %f138, %r241;
ex2.approx.ftz.f32 %f139, %f137;
mul.f32 %f31, %f139, %f138;
sub.f32 %f140, %f204, %f123;
fma.rn.f32 %f141, %f140, %f126, %f125;
cvt.sat.f32.f32 %f142, %f141;
fma.rm.f32 %f143, %f142, %f130, %f129;
add.f32 %f144, %f143, 0fCB40007F;
neg.f32 %f145, %f144;
fma.rn.f32 %f146, %f140, %f134, %f145;
fma.rn.f32 %f147, %f140, %f136, %f146;
mov.b32 %r242, %f143;
shl.b32 %r243, %r242, 23;
mov.b32 %f148, %r243;
ex2.approx.ftz.f32 %f149, %f147;
mul.f32 %f32, %f149, %f148;
sub.f32 %f150, %f205, %f123;
fma.rn.f32 %f151, %f150, %f126, %f125;
cvt.sat.f32.f32 %f152, %f151;
fma.rm.f32 %f153, %f152, %f130, %f129;
add.f32 %f154, %f153, 0fCB40007F;
neg.f32 %f155, %f154;
fma.rn.f32 %f156, %f150, %f134, %f155;
fma.rn.f32 %f157, %f150, %f136, %f156;
mov.b32 %r244, %f153;
shl.b32 %r245, %r244, 23;
mov.b32 %f158, %r245;
ex2.approx.ftz.f32 %f159, %f157;
mul.f32 %f33, %f159, %f158;
sub.f32 %f160, %f206, %f123;
fma.rn.f32 %f161, %f160, %f126, %f125;
cvt.sat.f32.f32 %f162, %f161;
fma.rm.f32 %f163, %f162, %f130, %f129;
add.f32 %f164, %f163, 0fCB40007F;
neg.f32 %f165, %f164;
fma.rn.f32 %f166, %f160, %f134, %f165;
fma.rn.f32 %f167, %f160, %f136, %f166;
mov.b32 %r246, %f163;
shl.b32 %r247, %r246, 23;
mov.b32 %f168, %r247;
ex2.approx.ftz.f32 %f169, %f167;
mul.f32 %f34, %f169, %f168;
@%p1 bra $L__BB0_41;
bra.uni $L__BB0_43;
$L__BB0_41:
mov.u32 %r48, %ctaid.x;
mul.lo.s32 %r248, %r2, %r66;
shl.b32 %r49, %r248, 2;
add.s32 %r249, %r41, %r49;
add.s32 %r250, %r249, -1;
div.s32 %r251, %r250, %r41;
setp.ge.s32 %p105, %r48, %r251;
@%p105 bra $L__BB0_43;
mad.lo.s32 %r252, %r41, %r48, %r42;
setp.lt.s32 %p106, %r252, %r49;
@%p106 bra $L__BB0_51;
bra.uni $L__BB0_43;
$L__BB0_51:
add.f32 %f175, %f31, 0f00000000;
add.f32 %f176, %f175, %f32;
add.f32 %f177, %f176, %f33;
add.f32 %f214, %f177, %f34;
bra.uni $L__BB0_52;
$L__BB0_43:
mov.u32 %r50, %ctaid.x;
mul.lo.s32 %r253, %r2, %r66;
shl.b32 %r51, %r253, 2;
add.s32 %r254, %r41, %r51;
add.s32 %r52, %r254, -1;
mad.lo.s32 %r53, %r41, %r50, %r42;
mov.f32 %f214, 0f00000000;
@%p4 bra $L__BB0_45;
div.s32 %r255, %r52, %r41;
setp.ge.s32 %p108, %r50, %r255;
setp.ge.s32 %p109, %r53, %r51;
or.pred %p110, %p108, %p109;
add.f32 %f171, %f31, 0f00000000;
selp.f32 %f214, 0f00000000, %f171, %p110;
$L__BB0_45:
@%p4 bra $L__BB0_47;
div.s32 %r256, %r52, %r41;
setp.ge.s32 %p112, %r50, %r256;
setp.ge.s32 %p113, %r53, %r51;
or.pred %p114, %p112, %p113;
add.f32 %f172, %f214, %f32;
selp.f32 %f214, %f214, %f172, %p114;
$L__BB0_47:
@%p4 bra $L__BB0_49;
div.s32 %r257, %r52, %r41;
setp.ge.s32 %p116, %r50, %r257;
setp.ge.s32 %p117, %r53, %r51;
or.pred %p118, %p116, %p117;
add.f32 %f173, %f214, %f33;
selp.f32 %f214, %f214, %f173, %p118;
$L__BB0_49:
@%p4 bra $L__BB0_52;
div.s32 %r258, %r52, %r41;
setp.ge.s32 %p120, %r50, %r258;
setp.ge.s32 %p121, %r53, %r51;
or.pred %p122, %p120, %p121;
add.f32 %f174, %f214, %f34;
selp.f32 %f214, %f214, %f174, %p122;
$L__BB0_52:
mov.b32 %r259, %f214;
mov.u32 %r260, 31;
mov.u32 %r261, 16;
mov.u32 %r262, -1;
shfl.sync.bfly.b32 %r263|%p123, %r259, %r261, %r260, %r262;
mov.b32 %f178, %r263;
add.f32 %f179, %f214, %f178;
mov.b32 %r264, %f179;
mov.u32 %r265, 8;
shfl.sync.bfly.b32 %r266|%p124, %r264, %r265, %r260, %r262;
mov.b32 %f180, %r266;
add.f32 %f181, %f179, %f180;
mov.b32 %r267, %f181;
mov.u32 %r268, 4;
shfl.sync.bfly.b32 %r269|%p125, %r267, %r268, %r260, %r262;
mov.b32 %f182, %r269;
add.f32 %f183, %f181, %f182;
mov.b32 %r270, %f183;
mov.u32 %r271, 2;
shfl.sync.bfly.b32 %r272|%p126, %r270, %r271, %r260, %r262;
mov.b32 %f184, %r272;
add.f32 %f185, %f183, %f184;
mov.b32 %r273, %f185;
mov.u32 %r274, 1;
shfl.sync.bfly.b32 %r275|%p127, %r273, %r274, %r260, %r262;
mov.b32 %f186, %r275;
add.f32 %f218, %f185, %f186;
bar.sync 0;
@%p80 bra $L__BB0_54;
st.shared.f32 [%rd5], %f218;
$L__BB0_54:
bar.sync 0;
@%p81 bra $L__BB0_58;
setp.ge.u32 %p130, %r44, %r45;
mov.f32 %f217, 0f00000000;
@%p130 bra $L__BB0_57;
ld.shared.f32 %f217, [%rd6];
$L__BB0_57:
mov.b32 %r276, %f217;
mov.u32 %r277, 31;
mov.u32 %r278, 16;
mov.u32 %r279, -1;
shfl.sync.bfly.b32 %r280|%p131, %r276, %r278, %r277, %r279;
mov.b32 %f188, %r280;
add.f32 %f189, %f217, %f188;
mov.b32 %r281, %f189;
mov.u32 %r282, 8;
shfl.sync.bfly.b32 %r283|%p132, %r281, %r282, %r277, %r279;
mov.b32 %f190, %r283;
add.f32 %f191, %f189, %f190;
mov.b32 %r284, %f191;
mov.u32 %r285, 4;
shfl.sync.bfly.b32 %r286|%p133, %r284, %r285, %r277, %r279;
mov.b32 %f192, %r286;
add.f32 %f193, %f191, %f192;
mov.b32 %r287, %f193;
mov.u32 %r288, 2;
shfl.sync.bfly.b32 %r289|%p134, %r287, %r288, %r277, %r279;
mov.b32 %f194, %r289;
add.f32 %f195, %f193, %f194;
mov.b32 %r290, %f195;
mov.u32 %r291, 1;
shfl.sync.bfly.b32 %r292|%p135, %r290, %r291, %r277, %r279;
mov.b32 %f196, %r292;
add.f32 %f218, %f195, %f196;
$L__BB0_58:
bar.sync 0;
setp.ne.s32 %p136, %r4, 0;
@%p136 bra $L__BB0_60;
setp.eq.s32 %p137, %r44, 0;
add.f32 %f197, %f218, 0f00000000;
selp.f32 %f198, %f197, 0f00000000, %p137;
st.shared.f32 [%rd7], %f198;
$L__BB0_60:
bar.sync 0;
ld.shared.f32 %f49, [%rd7];
bar.sync 0;
@%p4 bra $L__BB0_63;
mov.u32 %r54, %ctaid.x;
mul.lo.s32 %r293, %r2, %r66;
shl.b32 %r55, %r293, 2;
add.s32 %r294, %r41, %r55;
add.s32 %r295, %r294, -1;
div.s32 %r296, %r295, %r41;
setp.ge.s32 %p139, %r54, %r296;
@%p139 bra $L__BB0_63;
mad.lo.s32 %r56, %r41, %r54, %r42;
setp.lt.s32 %p140, %r56, %r55;
@%p140 bra $L__BB0_67;
bra.uni $L__BB0_63;
$L__BB0_67:
div.rn.f32 %f199, %f31, %f49;
mov.b32 %r306, %f199;
div.rn.f32 %f200, %f32, %f49;
mov.b32 %r307, %f200;
div.rn.f32 %f201, %f33, %f49;
mov.b32 %r308, %f201;
div.rn.f32 %f202, %f34, %f49;
mov.b32 %r309, %f202;
mad.lo.s32 %r310, %r56, %r67, %r5;
mul.wide.s32 %rd35, %r310, 4;
add.s64 %rd34, %rd9, %rd35;
// begin inline asm
st.global.cs.v4.s32 [%rd34], {%r306,%r307,%r308,%r309};
// end inline asm
bra.uni $L__BB0_68;
$L__BB0_63:
div.rn.f32 %f50, %f31, %f49;
div.rn.f32 %f51, %f32, %f49;
div.rn.f32 %f52, %f33, %f49;
div.rn.f32 %f53, %f34, %f49;
@%p4 bra $L__BB0_68;
mov.u32 %r57, %ctaid.x;
mul.lo.s32 %r297, %r2, %r66;
shl.b32 %r58, %r297, 2;
add.s32 %r298, %r41, %r58;
add.s32 %r299, %r298, -1;
div.s32 %r300, %r299, %r41;
setp.ge.s32 %p142, %r57, %r300;
@%p142 bra $L__BB0_68;
mad.lo.s32 %r59, %r41, %r57, %r42;
setp.ge.s32 %p143, %r59, %r58;
@%p143 bra $L__BB0_68;
mad.lo.s32 %r305, %r59, %r67, %r5;
mul.wide.s32 %rd33, %r305, 4;
add.s64 %rd32, %rd9, %rd33;
mov.b32 %r304, %f53;
mov.b32 %r303, %f52;
mov.b32 %r302, %f51;
mov.b32 %r301, %f50;
// begin inline asm
st.global.cs.v4.s32 [%rd32], {%r301,%r302,%r303,%r304};
// end inline asm
$L__BB0_68:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_2[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_3[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_4[32]
)
{
.reg .pred %p<144>;
.reg .b16 %rs<32>;
.reg .f32 %f<219>;
.reg .b32 %r<319>;
.reg .b64 %rd<36>;
ld.param.v2.u32 {%r66, %r67}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_0+16];
ld.param.v2.u32 {%r68, %r69}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_1+8];
ld.param.u64 %rd9, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_4];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_3];
ld.param.u64 %rd4, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_2];
ld.param.u64 %rd10, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_0];
cvta.to.global.u64 %rd1, %rd10;
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_1601111nvfuser_372ENS_6TensorIbLi4ELi4EEENS0_IfLi3ELi3EEENS0_IfLi4ELi4EEES3_S2__param_1];
add.s32 %r77, %r68, 3;
shr.s32 %r78, %r77, 31;
shr.u32 %r79, %r78, 30;
add.s32 %r80, %r77, %r79;
shr.s32 %r2, %r80, 2;
add.s32 %r81, %r67, 3;
shr.s32 %r82, %r81, 31;
shr.u32 %r83, %r82, 30;
add.s32 %r84, %r81, %r83;
shr.s32 %r85, %r84, 2;
mov.u32 %r4, %tid.x;
setp.lt.s32 %p2, %r4, %r85;
shl.b32 %r5, %r4, 2;
or.b32 %r86, %r5, 3;
setp.lt.s32 %p3, %r86, %r67;
and.pred %p1, %p3, %p2;
not.pred %p4, %p1;
@%p4 bra $L__BB0_3;
mov.u32 %r6, %ctaid.x;
mul.lo.s32 %r87, %r66, %r2;
shl.b32 %r7, %r87, 2;
mov.u32 %r8, %ntid.y;
add.s32 %r88, %r8, %r7;
add.s32 %r89, %r88, -1;
div.s32 %r90, %r89, %r8;
setp.ge.s32 %p5, %r6, %r90;
@%p5 bra $L__BB0_3;
mov.u32 %r91, %tid.y;
mad.lo.s32 %r9, %r8, %r6, %r91;
setp.lt.s32 %p6, %r9, %r7;
@%p6 bra $L__BB0_16;
bra.uni $L__BB0_3;
$L__BB0_16:
mad.lo.s32 %r179, %r9, %r67, %r5;
mul.wide.s32 %rd22, %r179, 4;
add.s64 %rd19, %rd2, %rd22;
// begin inline asm
ld.global.cs.v4.u32 {%r167,%r168,%r169,%r170}, [%rd19];
// end inline asm
div.s32 %r180, %r9, %r66;
div.s32 %r181, %r180, %r2;
mul.lo.s32 %r182, %r180, %r66;
sub.s32 %r183, %r9, %r182;
mad.lo.s32 %r184, %r181, %r66, %r183;
mad.lo.s32 %r185, %r184, %r67, %r5;
cvt.s64.s32 %rd23, %r185;
mul.wide.s32 %rd24, %r185, 4;
add.s64 %rd20, %rd4, %rd24;
// begin inline asm
ld.global.ca.v4.u32 {%r171,%r172,%r173,%r174}, [%rd20];
// end inline asm
add.s64 %rd25, %rd1, %rd23;
ld.global.u32 %r186, [%rd25];
cvt.u16.u32 %rs21, %r186;
cvt.s16.s8 %rs22, %rs21;
shr.s16 %rs23, %rs21, 8;
shr.u32 %r187, %r186, 16;
cvt.u16.u32 %rs24, %r187;
cvt.s16.s8 %rs25, %rs24;
shr.u32 %r188, %r186, 24;
cvt.u16.u32 %rs26, %r188;
cvt.s16.s8 %rs27, %rs26;
cvt.rn.f32.s16 %f75, %rs22;
mov.f32 %f76, 0f3F800000;
sub.f32 %f77, %f76, %f75;
setp.neu.f32 %p20, %f77, 0f00000000;
selp.f32 %f78, 0fFF7FFFEE, %f77, %p20;
mov.b32 %f79, %r167;
mov.b32 %f80, %r171;
add.f32 %f81, %f78, %f80;
mov.b32 %r175, %f81;
add.f32 %f203, %f81, %f79;
cvt.rn.f32.s16 %f82, %rs23;
sub.f32 %f83, %f76, %f82;
setp.neu.f32 %p21, %f83, 0f00000000;
selp.f32 %f84, 0fFF7FFFEE, %f83, %p21;
mov.b32 %f85, %r168;
mov.b32 %f86, %r172;
add.f32 %f87, %f84, %f86;
mov.b32 %r176, %f87;
add.f32 %f204, %f87, %f85;
cvt.rn.f32.s16 %f88, %rs25;
sub.f32 %f89, %f76, %f88;
setp.neu.f32 %p22, %f89, 0f00000000;
selp.f32 %f90, 0fFF7FFFEE, %f89, %p22;
mov.b32 %f91, %r169;
mov.b32 %f92, %r173;
add.f32 %f93, %f90, %f92;
mov.b32 %r177, %f93;
add.f32 %f205, %f93, %f91;
cvt.rn.f32.s16 %f94, %rs27;
sub.f32 %f95, %f76, %f94;
setp.neu.f32 %p23, %f95, 0f00000000;
selp.f32 %f96, 0fFF7FFFEE, %f95, %p23;
mov.b32 %f97, %r170;
mov.b32 %f98, %r174;
add.f32 %f99, %f96, %f98;
mov.b32 %r178, %f99;
add.f32 %f206, %f99, %f97;
add.s64 %rd21, %rd3, %rd24;
// begin inline asm
st.global.cs.v4.s32 [%rd21], {%r175,%r176,%r177,%r178};
// end inline asm
bra.uni $L__BB0_17;
$L__BB0_3:
mov.u32 %r315, 0;
mov.u32 %r311, %r315;
mov.u32 %r312, %r315;
mov.u32 %r313, %r315;
mov.u32 %r314, %r315;
@%p4 bra $L__BB0_7;
mov.u32 %r10, %ctaid.x;
mul.lo.s32 %r100, %r2, %r66;
shl.b32 %r11, %r100, 2;
mov.u32 %r12, %ntid.y;
add.s32 %r101, %r12, %r11;
add.s32 %r102, %r101, -1;
div.s32 %r103, %r102, %r12;
setp.ge.s32 %p8, %r10, %r103;
@%p8 bra $L__BB0_7;
mov.u32 %r108, %tid.y;
mad.lo.s32 %r13, %r12, %r10, %r108;
setp.ge.s32 %p9, %r13, %r11;
mov.u32 %r311, %r315;
mov.u32 %r312, %r315;
mov.u32 %r313, %r315;
mov.u32 %r314, %r315;
@%p9 bra $L__BB0_7;
mad.lo.s32 %r113, %r13, %r67, %r5;
mul.wide.s32 %rd12, %r113, 4;
add.s64 %rd11, %rd2, %rd12;
// begin inline asm
ld.global.cs.v4.u32 {%r311,%r312,%r313,%r314}, [%rd11];
// end inline asm
$L__BB0_7:
mov.u32 %r316, %r315;
mov.u32 %r317, %r315;
mov.u32 %r318, %r315;
@%p4 bra $L__BB0_10;
mov.u32 %r122, %tid.y;
mov.u32 %r123, %ctaid.x;
mov.u32 %r124, %ntid.y;
mad.lo.s32 %r22, %r124, %r123, %r122;
mul.lo.s32 %r125, %r66, %r2;
shl.b32 %r126, %r125, 2;
setp.ge.s32 %p11, %r22, %r126;
mov.u32 %r316, %r315;
mov.u32 %r317, %r315;
mov.u32 %r318, %r315;
@%p11 bra $L__BB0_10;
div.s32 %r131, %r22, %r66;
mul.lo.s32 %r132, %r131, %r66;
sub.s32 %r133, %r22, %r132;
div.s32 %r134, %r131, %r2;
mad.lo.s32 %r135, %r134, %r66, %r133;
mad.lo.s32 %r136, %r135, %r67, %r5;
mul.wide.s32 %rd14, %r136, 4;
add.s64 %rd13, %rd4, %rd14;
// begin inline asm
ld.global.ca.v4.u32 {%r318,%r317,%r316,%r315}, [%rd13];
// end inline asm
$L__BB0_10:
mov.u16 %rs28, 0;
mov.u16 %rs29, %rs28;
mov.u16 %rs30, %rs28;
mov.u16 %rs31, %rs28;
@%p4 bra $L__BB0_13;
mov.u32 %r137, %tid.y;
mov.u32 %r138, %ctaid.x;
mov.u32 %r139, %ntid.y;
mad.lo.s32 %r31, %r139, %r138, %r137;
mul.lo.s32 %r140, %r66, %r2;
shl.b32 %r141, %r140, 2;
setp.ge.s32 %p13, %r31, %r141;
@%p13 bra $L__BB0_13;
div.s32 %r142, %r31, %r66;
mul.lo.s32 %r143, %r142, %r66;
sub.s32 %r144, %r31, %r143;
div.s32 %r145, %r142, %r2;
mad.lo.s32 %r146, %r145, %r66, %r144;
mad.lo.s32 %r147, %r146, %r67, %r5;
cvt.s64.s32 %rd15, %r147;
add.s64 %rd16, %rd1, %rd15;
ld.global.u32 %r148, [%rd16];
cvt.u16.u32 %rs31, %r148;
shr.u32 %r149, %r148, 8;
cvt.u16.u32 %rs30, %r149;
shr.u32 %r150, %r148, 16;
cvt.u16.u32 %rs29, %r150;
shr.u32 %r151, %r148, 24;
cvt.u16.u32 %rs28, %r151;
$L__BB0_13:
cvt.s16.s8 %rs17, %rs31;
cvt.rn.f32.s16 %f54, %rs17;
mov.f32 %f55, 0f3F800000;
sub.f32 %f56, %f55, %f54;
setp.neu.f32 %p14, %f56, 0f00000000;
selp.f32 %f57, 0fFF7FFFEE, %f56, %p14;
mov.b32 %f58, %r318;
add.f32 %f1, %f57, %f58;
mov.b32 %f59, %r311;
add.f32 %f203, %f1, %f59;
cvt.s16.s8 %rs18, %rs30;
cvt.rn.f32.s16 %f60, %rs18;
sub.f32 %f61, %f55, %f60;
setp.neu.f32 %p15, %f61, 0f00000000;
selp.f32 %f62, 0fFF7FFFEE, %f61, %p15;
mov.b32 %f63, %r317;
add.f32 %f3, %f62, %f63;
mov.b32 %f64, %r312;
add.f32 %f204, %f3, %f64;
cvt.s16.s8 %rs19, %rs29;
cvt.rn.f32.s16 %f65, %rs19;
sub.f32 %f66, %f55, %f65;
setp.neu.f32 %p16, %f66, 0f00000000;
selp.f32 %f67, 0fFF7FFFEE, %f66, %p16;
mov.b32 %f68, %r316;
add.f32 %f5, %f67, %f68;
mov.b32 %f69, %r313;
add.f32 %f205, %f5, %f69;
cvt.s16.s8 %rs20, %rs28;
cvt.rn.f32.s16 %f70, %rs20;
sub.f32 %f71, %f55, %f70;
setp.neu.f32 %p17, %f71, 0f00000000;
selp.f32 %f72, 0fFF7FFFEE, %f71, %p17;
mov.b32 %f73, %r315;
add.f32 %f7, %f72, %f73;
mov.b32 %f74, %r314;
add.f32 %f206, %f7, %f74;
@%p4 bra $L__BB0_17;
mov.u32 %r152, %tid.y;
mov.u32 %r153, %ctaid.x;
mov.u32 %r154, %ntid.y;
mad.lo.s32 %r32, %r154, %r153, %r152;
mul.lo.s32 %r155, %r66, %r2;
shl.b32 %r156, %r155, 2;
setp.ge.s32 %p19, %r32, %r156;
@%p19 bra $L__BB0_17;
div.s32 %r161, %r32, %r66;
mul.lo.s32 %r162, %r161, %r66;
sub.s32 %r163, %r32, %r162;
div.s32 %r164, %r161, %r2;
mad.lo.s32 %r165, %r164, %r66, %r163;
mad.lo.s32 %r166, %r165, %r67, %r5;
mul.wide.s32 %rd18, %r166, 4;
add.s64 %rd17, %rd3, %rd18;
mov.b32 %r157, %f1;
mov.b32 %r158, %f3;
mov.b32 %r159, %f5;
mov.b32 %r160, %f7;
// begin inline asm
st.global.cs.v4.s32 [%rd17], {%r157,%r158,%r159,%r160};
// end inline asm
$L__BB0_17:
@%p1 bra $L__BB0_18;
bra.uni $L__BB0_20;
$L__BB0_18:
mov.u32 %r33, %ctaid.x;
mul.lo.s32 %r189, %r2, %r66;
shl.b32 %r34, %r189, 2;
mov.u32 %r35, %ntid.y;
add.s32 %r190, %r35, %r34;
add.s32 %r191, %r190, -1;
div.s32 %r192, %r191, %r35;
setp.ge.s32 %p24, %r33, %r192;
@%p24 bra $L__BB0_20;
mov.u32 %r193, %tid.y;
mad.lo.s32 %r194, %r35, %r33, %r193;
setp.lt.s32 %p25, %r194, %r34;
@%p25 bra $L__BB0_31;
bra.uni $L__BB0_20;
$L__BB0_31:
setp.nan.f32 %p51, %f203, %f203;
setp.gt.f32 %p52, %f203, %f204;
or.pred %p53, %p51, %p52;
selp.f32 %f101, %f203, %f204, %p53;
setp.nan.f32 %p54, %f101, %f101;
setp.gt.f32 %p55, %f101, %f205;
or.pred %p56, %p54, %p55;
selp.f32 %f102, %f101, %f205, %p56;
setp.nan.f32 %p57, %f102, %f102;
setp.gt.f32 %p58, %f102, %f206;
or.pred %p59, %p57, %p58;
selp.f32 %f209, %f102, %f206, %p59;
bra.uni $L__BB0_32;
$L__BB0_20:
mov.u32 %r36, %ctaid.x;
mul.lo.s32 %r195, %r2, %r66;
shl.b32 %r37, %r195, 2;
mov.u32 %r39, %ntid.y;
add.s32 %r196, %r39, %r37;
add.s32 %r38, %r196, -1;
mov.u32 %r197, %tid.y;
mad.lo.s32 %r40, %r39, %r36, %r197;
mov.f32 %f209, 0fFF800000;
@%p4 bra $L__BB0_22;
div.s32 %r198, %r38, %r39;
setp.ge.s32 %p27, %r36, %r198;
setp.ge.s32 %p28, %r40, %r37;
or.pred %p29, %p27, %p28;
selp.f32 %f209, 0fFF800000, %f203, %p29;
$L__BB0_22:
@%p4 bra $L__BB0_25;
div.s32 %r199, %r38, %r39;
setp.ge.s32 %p31, %r36, %r199;
setp.ge.s32 %p32, %r40, %r37;
or.pred %p33, %p31, %p32;
@%p33 bra $L__BB0_25;
setp.nan.f32 %p34, %f209, %f209;
setp.gt.f32 %p35, %f209, %f204;
or.pred %p36, %p34, %p35;
selp.f32 %f209, %f209, %f204, %p36;
$L__BB0_25:
@%p4 bra $L__BB0_28;
div.s32 %r200, %r38, %r39;
setp.ge.s32 %p38, %r36, %r200;
setp.ge.s32 %p39, %r40, %r37;
or.pred %p40, %p38, %p39;
@%p40 bra $L__BB0_28;
setp.nan.f32 %p41, %f209, %f209;
setp.gt.f32 %p42, %f209, %f205;
or.pred %p43, %p41, %p42;
selp.f32 %f209, %f209, %f205, %p43;
$L__BB0_28:
@%p4 bra $L__BB0_32;
div.s32 %r201, %r38, %r39;
setp.ge.s32 %p45, %r36, %r201;
setp.ge.s32 %p46, %r40, %r37;
or.pred %p47, %p45, %p46;
@%p47 bra $L__BB0_32;
setp.nan.f32 %p48, %f209, %f209;
setp.gt.f32 %p49, %f209, %f206;
or.pred %p50, %p48, %p49;
selp.f32 %f209, %f209, %f206, %p50;
$L__BB0_32:
mov.b32 %r202, %f209;
mov.u32 %r203, 31;
mov.u32 %r204, 16;
mov.u32 %r205, -1;
shfl.sync.bfly.b32 %r206|%p60, %r202, %r204, %r203, %r205;
mov.b32 %f103, %r206;
setp.gt.f32 %p61, %f209, %f103;
setp.nan.f32 %p62, %f209, %f209;
or.pred %p63, %p62, %p61;
selp.f32 %f104, %f209, %f103, %p63;
mov.b32 %r207, %f104;
mov.u32 %r208, 8;
shfl.sync.bfly.b32 %r209|%p64, %r207, %r208, %r203, %r205;
mov.b32 %f105, %r209;
setp.nan.f32 %p65, %f104, %f104;
setp.gt.f32 %p66, %f104, %f105;
or.pred %p67, %p65, %p66;
selp.f32 %f106, %f104, %f105, %p67;
mov.b32 %r210, %f106;
mov.u32 %r211, 4;
shfl.sync.bfly.b32 %r212|%p68, %r210, %r211, %r203, %r205;
mov.b32 %f107, %r212;
setp.nan.f32 %p69, %f106, %f106;
setp.gt.f32 %p70, %f106, %f107;
or.pred %p71, %p69, %p70;
selp.f32 %f108, %f106, %f107, %p71;
mov.b32 %r213, %f108;
mov.u32 %r214, 2;
shfl.sync.bfly.b32 %r215|%p72, %r213, %r214, %r203, %r205;
mov.b32 %f109, %r215;
setp.nan.f32 %p73, %f108, %f108;
setp.gt.f32 %p74, %f108, %f109;
or.pred %p75, %p73, %p74;
selp.f32 %f110, %f108, %f109, %p75;
mov.b32 %r216, %f110;
mov.u32 %r217, 1;
shfl.sync.bfly.b32 %r218|%p76, %r216, %r217, %r203, %r205;
mov.b32 %f111, %r218;
setp.nan.f32 %p77, %f110, %f110;
setp.gt.f32 %p78, %f110, %f111;
or.pred %p79, %p77, %p78;
selp.f32 %f212, %f110, %f111, %p79;
mov.u32 %r219, %tid.z;
mov.u32 %r41, %ntid.y;
mov.u32 %r42, %tid.y;
mad.lo.s32 %r43, %r219, %r41, %r42;
and.b32 %r44, %r4, 31;
setp.ne.s32 %p80, %r44, 0;
mov.u32 %r220, %ntid.x;
shr.u32 %r45, %r220, 5;
mul.lo.s32 %r46, %r43, %r45;
bar.sync 0;
shr.u32 %r47, %r4, 5;
add.s32 %r221, %r46, %r47;
mul.wide.u32 %rd26, %r221, 4;
mov.u64 %rd27, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_372_cu_b415bb8a_160115arrayE;
add.s64 %rd5, %rd27, %rd26;
@%p80 bra $L__BB0_34;
st.shared.f32 [%rd5], %f212;
$L__BB0_34:
bar.sync 0;
add.s32 %r222, %r46, %r44;
mul.wide.u32 %rd28, %r222, 4;
add.s64 %rd6, %rd27, %rd28;
setp.ne.s32 %p81, %r47, 0;
@%p81 bra $L__BB0_38;
setp.ge.u32 %p82, %r44, %r45;
mov.f32 %f211, 0fFF800000;
@%p82 bra $L__BB0_37;
ld.shared.f32 %f211, [%rd6];
$L__BB0_37:
mov.b32 %r223, %f211;
mov.u32 %r224, 31;
mov.u32 %r225, 16;
mov.u32 %r226, -1;
shfl.sync.bfly.b32 %r227|%p83, %r223, %r225, %r224, %r226;
mov.b32 %f113, %r227;
setp.gt.f32 %p84, %f211, %f113;
setp.nan.f32 %p85, %f211, %f211;
or.pred %p86, %p85, %p84;
selp.f32 %f114, %f211, %f113, %p86;
mov.b32 %r228, %f114;
mov.u32 %r229, 8;
shfl.sync.bfly.b32 %r230|%p87, %r228, %r229, %r224, %r226;
mov.b32 %f115, %r230;
setp.nan.f32 %p88, %f114, %f114;
setp.gt.f32 %p89, %f114, %f115;
or.pred %p90, %p88, %p89;
selp.f32 %f116, %f114, %f115, %p90;
mov.b32 %r231, %f116;
mov.u32 %r232, 4;
shfl.sync.bfly.b32 %r233|%p91, %r231, %r232, %r224, %r226;
mov.b32 %f117, %r233;
setp.nan.f32 %p92, %f116, %f116;
setp.gt.f32 %p93, %f116, %f117;
or.pred %p94, %p92, %p93;
selp.f32 %f118, %f116, %f117, %p94;
mov.b32 %r234, %f118;
mov.u32 %r235, 2;
shfl.sync.bfly.b32 %r236|%p95, %r234, %r235, %r224, %r226;
mov.b32 %f119, %r236;
setp.nan.f32 %p96, %f118, %f118;
setp.gt.f32 %p97, %f118, %f119;
or.pred %p98, %p96, %p97;
selp.f32 %f120, %f118, %f119, %p98;
mov.b32 %r237, %f120;
mov.u32 %r238, 1;
shfl.sync.bfly.b32 %r239|%p99, %r237, %r238, %r224, %r226;
mov.b32 %f121, %r239;
setp.nan.f32 %p100, %f120, %f120;
setp.gt.f32 %p101, %f120, %f121;
or.pred %p102, %p100, %p101;
selp.f32 %f212, %f120, %f121, %p102;
$L__BB0_38:
bar.sync 0;
mul.wide.s32 %rd30, %r43, 4;
add.s64 %rd7, %rd27, %rd30;
setp.eq.s32 %p103, %r4, 0;
@%p103 bra $L__BB0_39;
bra.uni $L__BB0_40;
$L__BB0_39:
setp.eq.s32 %p104, %r44, 0;
selp.f32 %f122, %f212, 0fFF800000, %p104;
st.shared.f32 [%rd7], %f122;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f123, [%rd7];
bar.sync 0;
sub.f32 %f124, %f203, %f123;
mov.f32 %f125, 0f3F000000;
mov.f32 %f126, 0f3BBB989D;
fma.rn.f32 %f127, %f124, %f126, %f125;
cvt.sat.f32.f32 %f128, %f127;
mov.f32 %f129, 0f4B400001;
mov.f32 %f130, 0f437C0000;
fma.rm.f32 %f131, %f128, %f130, %f129;
add.f32 %f132, %f131, 0fCB40007F;
neg.f32 %f133, %f132;
mov.f32 %f134, 0f3FB8AA3B;
fma.rn.f32 %f135, %f124, %f134, %f133;
mov.f32 %f136, 0f32A57060;
fma.rn.f32 %f137, %f124, %f136, %f135;
mov.b32 %r240, %f131;
shl.b32 %r241, %r240, 23;
mov.b32 %f138, %r241;
ex2.approx.ftz.f32 %f139, %f137;
mul.f32 %f31, %f139, %f138;
sub.f32 %f140, %f204, %f123;
fma.rn.f32 %f141, %f140, %f126, %f125;
cvt.sat.f32.f32 %f142, %f141;
fma.rm.f32 %f143, %f142, %f130, %f129;
add.f32 %f144, %f143, 0fCB40007F;
neg.f32 %f145, %f144;
fma.rn.f32 %f146, %f140, %f134, %f145;
fma.rn.f32 %f147, %f140, %f136, %f146;
mov.b32 %r242, %f143;
shl.b32 %r243, %r242, 23;
mov.b32 %f148, %r243;
ex2.approx.ftz.f32 %f149, %f147;
mul.f32 %f32, %f149, %f148;
sub.f32 %f150, %f205, %f123;
fma.rn.f32 %f151, %f150, %f126, %f125;
cvt.sat.f32.f32 %f152, %f151;
fma.rm.f32 %f153, %f152, %f130, %f129;
add.f32 %f154, %f153, 0fCB40007F;
neg.f32 %f155, %f154;
fma.rn.f32 %f156, %f150, %f134, %f155;
fma.rn.f32 %f157, %f150, %f136, %f156;
mov.b32 %r244, %f153;
shl.b32 %r245, %r244, 23;
mov.b32 %f158, %r245;
ex2.approx.ftz.f32 %f159, %f157;
mul.f32 %f33, %f159, %f158;
sub.f32 %f160, %f206, %f123;
fma.rn.f32 %f161, %f160, %f126, %f125;
cvt.sat.f32.f32 %f162, %f161;
fma.rm.f32 %f163, %f162, %f130, %f129;
add.f32 %f164, %f163, 0fCB40007F;
neg.f32 %f165, %f164;
fma.rn.f32 %f166, %f160, %f134, %f165;
fma.rn.f32 %f167, %f160, %f136, %f166;
mov.b32 %r246, %f163;
shl.b32 %r247, %r246, 23;
mov.b32 %f168, %r247;
ex2.approx.ftz.f32 %f169, %f167;
mul.f32 %f34, %f169, %f168;
@%p1 bra $L__BB0_41;
bra.uni $L__BB0_43;
$L__BB0_41:
mov.u32 %r48, %ctaid.x;
mul.lo.s32 %r248, %r2, %r66;
shl.b32 %r49, %r248, 2;
add.s32 %r249, %r41, %r49;
add.s32 %r250, %r249, -1;
div.s32 %r251, %r250, %r41;
setp.ge.s32 %p105, %r48, %r251;
@%p105 bra $L__BB0_43;
mad.lo.s32 %r252, %r41, %r48, %r42;
setp.lt.s32 %p106, %r252, %r49;
@%p106 bra $L__BB0_51;
bra.uni $L__BB0_43;
$L__BB0_51:
add.f32 %f175, %f31, 0f00000000;
add.f32 %f176, %f175, %f32;
add.f32 %f177, %f176, %f33;
add.f32 %f214, %f177, %f34;
bra.uni $L__BB0_52;
$L__BB0_43:
mov.u32 %r50, %ctaid.x;
mul.lo.s32 %r253, %r2, %r66;
shl.b32 %r51, %r253, 2;
add.s32 %r254, %r41, %r51;
add.s32 %r52, %r254, -1;
mad.lo.s32 %r53, %r41, %r50, %r42;
mov.f32 %f214, 0f00000000;
@%p4 bra $L__BB0_45;
div.s32 %r255, %r52, %r41;
setp.ge.s32 %p108, %r50, %r255;
setp.ge.s32 %p109, %r53, %r51;
or.pred %p110, %p108, %p109;
add.f32 %f171, %f31, 0f00000000;
selp.f32 %f214, 0f00000000, %f171, %p110;
$L__BB0_45:
@%p4 bra $L__BB0_47;
div.s32 %r256, %r52, %r41;
setp.ge.s32 %p112, %r50, %r256;
setp.ge.s32 %p113, %r53, %r51;
or.pred %p114, %p112, %p113;
add.f32 %f172, %f214, %f32;
selp.f32 %f214, %f214, %f172, %p114;
$L__BB0_47:
@%p4 bra $L__BB0_49;
div.s32 %r257, %r52, %r41;
setp.ge.s32 %p116, %r50, %r257;
setp.ge.s32 %p117, %r53, %r51;
or.pred %p118, %p116, %p117;
add.f32 %f173, %f214, %f33;
selp.f32 %f214, %f214, %f173, %p118;
$L__BB0_49:
@%p4 bra $L__BB0_52;
div.s32 %r258, %r52, %r41;
setp.ge.s32 %p120, %r50, %r258;
setp.ge.s32 %p121, %r53, %r51;
or.pred %p122, %p120, %p121;
add.f32 %f174, %f214, %f34;
selp.f32 %f214, %f214, %f174, %p122;
$L__BB0_52:
mov.b32 %r259, %f214;
mov.u32 %r260, 31;
mov.u32 %r261, 16;
mov.u32 %r262, -1;
shfl.sync.bfly.b32 %r263|%p123, %r259, %r261, %r260, %r262;
mov.b32 %f178, %r263;
add.f32 %f179, %f214, %f178;
mov.b32 %r264, %f179;
mov.u32 %r265, 8;
shfl.sync.bfly.b32 %r266|%p124, %r264, %r265, %r260, %r262;
mov.b32 %f180, %r266;
add.f32 %f181, %f179, %f180;
mov.b32 %r267, %f181;
mov.u32 %r268, 4;
shfl.sync.bfly.b32 %r269|%p125, %r267, %r268, %r260, %r262;
mov.b32 %f182, %r269;
add.f32 %f183, %f181, %f182;
mov.b32 %r270, %f183;
mov.u32 %r271, 2;
shfl.sync.bfly.b32 %r272|%p126, %r270, %r271, %r260, %r262;
mov.b32 %f184, %r272;
add.f32 %f185, %f183, %f184;
mov.b32 %r273, %f185;
mov.u32 %r274, 1;
shfl.sync.bfly.b32 %r275|%p127, %r273, %r274, %r260, %r262;
mov.b32 %f186, %r275;
add.f32 %f218, %f185, %f186;
bar.sync 0;
@%p80 bra $L__BB0_54;
st.shared.f32 [%rd5], %f218;
$L__BB0_54:
bar.sync 0;
@%p81 bra $L__BB0_58;
setp.ge.u32 %p130, %r44, %r45;
mov.f32 %f217, 0f00000000;
@%p130 bra $L__BB0_57;
ld.shared.f32 %f217, [%rd6];
$L__BB0_57:
mov.b32 %r276, %f217;
mov.u32 %r277, 31;
mov.u32 %r278, 16;
mov.u32 %r279, -1;
shfl.sync.bfly.b32 %r280|%p131, %r276, %r278, %r277, %r279;
mov.b32 %f188, %r280;
add.f32 %f189, %f217, %f188;
mov.b32 %r281, %f189;
mov.u32 %r282, 8;
shfl.sync.bfly.b32 %r283|%p132, %r281, %r282, %r277, %r279;
mov.b32 %f190, %r283;
add.f32 %f191, %f189, %f190;
mov.b32 %r284, %f191;
mov.u32 %r285, 4;
shfl.sync.bfly.b32 %r286|%p133, %r284, %r285, %r277, %r279;
mov.b32 %f192, %r286;
add.f32 %f193, %f191, %f192;
mov.b32 %r287, %f193;
mov.u32 %r288, 2;
shfl.sync.bfly.b32 %r289|%p134, %r287, %r288, %r277, %r279;
mov.b32 %f194, %r289;
add.f32 %f195, %f193, %f194;
mov.b32 %r290, %f195;
mov.u32 %r291, 1;
shfl.sync.bfly.b32 %r292|%p135, %r290, %r291, %r277, %r279;
mov.b32 %f196, %r292;
add.f32 %f218, %f195, %f196;
$L__BB0_58:
bar.sync 0;
setp.ne.s32 %p136, %r4, 0;
@%p136 bra $L__BB0_60;
setp.eq.s32 %p137, %r44, 0;
add.f32 %f197, %f218, 0f00000000;
selp.f32 %f198, %f197, 0f00000000, %p137;
st.shared.f32 [%rd7], %f198;
$L__BB0_60:
bar.sync 0;
ld.shared.f32 %f49, [%rd7];
bar.sync 0;
@%p4 bra $L__BB0_63;
mov.u32 %r54, %ctaid.x;
mul.lo.s32 %r293, %r2, %r66;
shl.b32 %r55, %r293, 2;
add.s32 %r294, %r41, %r55;
add.s32 %r295, %r294, -1;
div.s32 %r296, %r295, %r41;
setp.ge.s32 %p139, %r54, %r296;
@%p139 bra $L__BB0_63;
mad.lo.s32 %r56, %r41, %r54, %r42;
setp.lt.s32 %p140, %r56, %r55;
@%p140 bra $L__BB0_67;
bra.uni $L__BB0_63;
$L__BB0_67:
div.rn.f32 %f199, %f31, %f49;
mov.b32 %r306, %f199;
div.rn.f32 %f200, %f32, %f49;
mov.b32 %r307, %f200;
div.rn.f32 %f201, %f33, %f49;
mov.b32 %r308, %f201;
div.rn.f32 %f202, %f34, %f49;
mov.b32 %r309, %f202;
mad.lo.s32 %r310, %r56, %r67, %r5;
mul.wide.s32 %rd35, %r310, 4;
add.s64 %rd34, %rd9, %rd35;
// begin inline asm
st.global.cs.v4.s32 [%rd34], {%r306,%r307,%r308,%r309};
// end inline asm
bra.uni $L__BB0_68;
$L__BB0_63:
div.rn.f32 %f50, %f31, %f49;
div.rn.f32 %f51, %f32, %f49;
div.rn.f32 %f52, %f33, %f49;
div.rn.f32 %f53, %f34, %f49;
@%p4 bra $L__BB0_68;
mov.u32 %r57, %ctaid.x;
mul.lo.s32 %r297, %r2, %r66;
shl.b32 %r58, %r297, 2;
add.s32 %r298, %r41, %r58;
add.s32 %r299, %r298, -1;
div.s32 %r300, %r299, %r41;
setp.ge.s32 %p142, %r57, %r300;
@%p142 bra $L__BB0_68;
mad.lo.s32 %r59, %r41, %r57, %r42;
setp.ge.s32 %p143, %r59, %r58;
@%p143 bra $L__BB0_68;
mad.lo.s32 %r305, %r59, %r67, %r5;
mul.wide.s32 %rd33, %r305, 4;
add.s64 %rd32, %rd9, %rd33;
mov.b32 %r304, %f53;
mov.b32 %r303, %f52;
mov.b32 %r302, %f51;
mov.b32 %r301, %f50;
// begin inline asm
st.global.cs.v4.s32 [%rd32], {%r301,%r302,%r303,%r304};
// end inline asm
$L__BB0_68:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -56,11 +56,11 @@
and.pred %p1, %p3, %p2;
not.pred %p4, %p1;
@%p4 bra $L__BB0_3;
mov.u32 %r6, %ctaid.x;
- mul.lo.s32 %r87, %r2, %r66;
+ mul.lo.s32 %r87, %r66, %r2;
shl.b32 %r7, %r87, 2;
mov.u32 %r8, %ntid.y;
add.s32 %r88, %r8, %r7;
add.s32 %r89, %r88, -1;
div.s32 %r90, %r89, %r8;
9: GpuViewTest.GroupNormOriginal
Kernel 2
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 40
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 1, 1> T2, Tensor<float, 1, 1> T1, Tensor<float, 5, 5> T6, Tensor<float, 5, 5> T4, Tensor<__half, 4, 4> T13) {
if (((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((T4.logical_size[0LL] * T1.logical_size[0LL]), ((nvfuser_index_t)blockDim.y)))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T4.logical_size[0LL] * T1.logical_size[0LL]))) && (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) < (32 * T4.logical_size[2LL]))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x))) < (T4.logical_size[3LL] * T4.logical_size[4LL])))) {
Array<float, 1, 1> T15;
T15[0] = 0;
T15[0]
= T1[((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL])];
Array<float, 1, 1> T9;
T9[0]
= T15[0];
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
= T2[((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL])];
Array<float, 1, 1> T10;
T10[0]
= T14[0];
Array<float, 1, 1> T16;
T16[0] = 0;
T16[0]
= T6[((32 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) / T1.logical_size[0LL])) + (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) / T4.logical_size[2LL]))];
Array<float, 4, 4> T17;
T17.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T17[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((T4.logical_size[4LL] * T4.logical_size[3LL]) * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T4.logical_size[4LL]) * T4.logical_size[3LL]) * ((nvfuser_index_t)blockIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x)))]);
Array<__half, 4, 4> T18;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
Array<float, 1, 1> T7;
T7[0]
= T17[i0]
/ T16[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T11;
T11[0]
= T8[0]
* T9[0];
Array<float, 1, 1> T12;
T12[0]
= T11[0]
+ T10[0];
T18[i0]
= __float2half(T12[0]);
}
loadLocalToGlobal<__half, /*vec_size=*/4, /*is_volatile=*/false>( &T13[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((T4.logical_size[4LL] * T4.logical_size[3LL]) * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T4.logical_size[4LL]) * T4.logical_size[3LL]) * ((nvfuser_index_t)blockIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x)))], &T18[0]);
} else {
Array<float, 1, 1> T15;
T15[0] = 0;
T15[0]
= T1[((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL])];
Array<float, 1, 1> T9;
T9[0]
= T15[0];
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
= T2[((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL])];
Array<float, 1, 1> T10;
T10[0]
= T14[0];
Array<float, 1, 1> T16;
T16[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T1.logical_size[0LL] * T4.logical_size[0LL])) && (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) < (32 * T4.logical_size[2LL])))) {
T16[0]
= T6[((32 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) / T1.logical_size[0LL])) + (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) / T4.logical_size[2LL]))];
}
Array<float, 4, 4> T17;
T17.set(float(0));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((T1.logical_size[0LL] * T4.logical_size[0LL]), ((nvfuser_index_t)blockDim.y)))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T1.logical_size[0LL] * T4.logical_size[0LL]))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x))) < (T4.logical_size[3LL] * T4.logical_size[4LL])))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T17[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((T4.logical_size[4LL] * T4.logical_size[3LL]) * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T4.logical_size[4LL]) * T4.logical_size[3LL]) * ((nvfuser_index_t)blockIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<__half, 4, 4> T18;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
Array<float, 1, 1> T7;
T7[0]
= T17[i0]
/ T16[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T11;
T11[0]
= T8[0]
* T9[0];
Array<float, 1, 1> T12;
T12[0]
= T11[0]
+ T10[0];
T18[i0]
= __float2half(T12[0]);
}
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((T1.logical_size[0LL] * T4.logical_size[0LL]), ((nvfuser_index_t)blockDim.y)))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T1.logical_size[0LL] * T4.logical_size[0LL]))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x))) < (T4.logical_size[3LL] * T4.logical_size[4LL])))) {
loadLocalToGlobal<__half, /*vec_size=*/4, /*is_volatile=*/false>( &T13[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((T4.logical_size[4LL] * T4.logical_size[3LL]) * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T4.logical_size[4LL]) * T4.logical_size[3LL]) * ((nvfuser_index_t)blockIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x)))], &T18[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 1, 1> T2, Tensor<float, 1, 1> T1, Tensor<float, 5, 5> T6, Tensor<float, 5, 5> T4, Tensor<__half, 4, 4> T13) {
if (((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((T1.logical_size[0LL] * T4.logical_size[0LL]), ((nvfuser_index_t)blockDim.y)))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T1.logical_size[0LL] * T4.logical_size[0LL]))) && (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) < (32 * T4.logical_size[2LL]))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x))) < (T4.logical_size[3LL] * T4.logical_size[4LL])))) {
Array<float, 1, 1> T15;
T15[0] = 0;
T15[0]
= T1[((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL])];
Array<float, 1, 1> T9;
T9[0]
= T15[0];
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
= T2[((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL])];
Array<float, 1, 1> T10;
T10[0]
= T14[0];
Array<float, 1, 1> T16;
T16[0] = 0;
T16[0]
= T6[((32 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) / T1.logical_size[0LL])) + (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) / T4.logical_size[2LL]))];
Array<float, 4, 4> T17;
T17.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T17[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((T4.logical_size[4LL] * T4.logical_size[3LL]) * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T4.logical_size[4LL]) * T4.logical_size[3LL]) * ((nvfuser_index_t)blockIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x)))]);
Array<__half, 4, 4> T18;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
Array<float, 1, 1> T7;
T7[0]
= T17[i0]
/ T16[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T11;
T11[0]
= T8[0]
* T9[0];
Array<float, 1, 1> T12;
T12[0]
= T11[0]
+ T10[0];
T18[i0]
= __float2half(T12[0]);
}
loadLocalToGlobal<__half, /*vec_size=*/4, /*is_volatile=*/false>( &T13[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((T4.logical_size[4LL] * T4.logical_size[3LL]) * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T4.logical_size[4LL]) * T4.logical_size[3LL]) * ((nvfuser_index_t)blockIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x)))], &T18[0]);
} else {
Array<float, 1, 1> T15;
T15[0] = 0;
T15[0]
= T1[((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL])];
Array<float, 1, 1> T9;
T9[0]
= T15[0];
Array<float, 1, 1> T14;
T14[0] = 0;
T14[0]
= T2[((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL])];
Array<float, 1, 1> T10;
T10[0]
= T14[0];
Array<float, 1, 1> T16;
T16[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T1.logical_size[0LL] * T4.logical_size[0LL])) && (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) < (32 * T4.logical_size[2LL])))) {
T16[0]
= T6[((32 * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) / T1.logical_size[0LL])) + (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) / T4.logical_size[2LL]))];
}
Array<float, 4, 4> T17;
T17.set(float(0));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((T1.logical_size[0LL] * T4.logical_size[0LL]), ((nvfuser_index_t)blockDim.y)))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T1.logical_size[0LL] * T4.logical_size[0LL]))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x))) < (T4.logical_size[3LL] * T4.logical_size[4LL])))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T17[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((T4.logical_size[4LL] * T4.logical_size[3LL]) * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T4.logical_size[4LL]) * T4.logical_size[3LL]) * ((nvfuser_index_t)blockIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<__half, 4, 4> T18;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
Array<float, 1, 1> T7;
T7[0]
= T17[i0]
/ T16[0];
Array<float, 1, 1> T8;
T8[0]
= T7[0];
Array<float, 1, 1> T11;
T11[0]
= T8[0]
* T9[0];
Array<float, 1, 1> T12;
T12[0]
= T11[0]
+ T10[0];
T18[i0]
= __float2half(T12[0]);
}
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((T1.logical_size[0LL] * T4.logical_size[0LL]), ((nvfuser_index_t)blockDim.y)))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T1.logical_size[0LL] * T4.logical_size[0LL]))) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x))) < (T4.logical_size[3LL] * T4.logical_size[4LL])))) {
loadLocalToGlobal<__half, /*vec_size=*/4, /*is_volatile=*/false>( &T13[((((4 * ((nvfuser_index_t)threadIdx.x)) + ((T4.logical_size[4LL] * T4.logical_size[3LL]) * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T4.logical_size[4LL]) * T4.logical_size[3LL]) * ((nvfuser_index_t)blockIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x)))], &T18[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,7 +1,7 @@
__global__ void nvfuser_N(Tensor<float, 1, 1> T2, Tensor<float, 1, 1> T1, Tensor<float, 5, 5> T6, Tensor<float, 5, 5> T4, Tensor<__half, 4, 4> T13) {
- if (((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((T4.logical_size[0LL] * T1.logical_size[0LL]), ((nvfuser_index_t)blockDim.y)))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T4.logical_size[0LL] * T1.logical_size[0LL]))) && (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) < (32 * T4.logical_size[2LL]))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x))) < (T4.logical_size[3LL] * T4.logical_size[4LL])))) {
+ if (((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((T1.logical_size[0LL] * T4.logical_size[0LL]), ((nvfuser_index_t)blockDim.y)))) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) < (T1.logical_size[0LL] * T4.logical_size[0LL]))) && (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL]) < (32 * T4.logical_size[2LL]))) && ((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + ((4 * ((nvfuser_index_t)blockDim.x)) * ((nvfuser_index_t)blockIdx.x))) < (T4.logical_size[3LL] * T4.logical_size[4LL])))) {
Array<float, 1, 1> T15;
T15[0] = 0;
T15[0]
= T1[((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.y))) % T1.logical_size[0LL])];
Array<float, 1, 1> T9;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_2[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_4[40]
)
{
.reg .pred %p<13>;
.reg .b16 %rs<11>;
.reg .f32 %f<54>;
.reg .b32 %r<80>;
.reg .b64 %rd<27>;
ld.param.v2.u32 {%r19, %r20}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_1+8];
ld.param.v2.u32 {%r21, %r22}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3+8];
ld.param.v2.u32 {%r23, %r24}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3+24];
ld.param.v2.u32 {%r25, %r26}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3+16];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_4];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_2];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_1];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_0];
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd7;
cvta.to.global.u64 %rd3, %rd6;
ld.param.u64 %rd4, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_117f2bf5_1911011nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3];
mul.lo.s32 %r5, %r21, %r19;
mov.u32 %r28, %ntid.y;
add.s32 %r29, %r28, %r5;
add.s32 %r30, %r29, -1;
div.s32 %r31, %r30, %r28;
mov.u32 %r32, %ctaid.y;
setp.lt.s32 %p2, %r32, %r31;
mov.u32 %r33, %tid.y;
mad.lo.s32 %r6, %r28, %r32, %r33;
setp.lt.s32 %p3, %r6, %r5;
and.pred %p1, %p3, %p2;
@%p1 bra $L__BB0_1;
bra.uni $L__BB0_3;
$L__BB0_1:
rem.s32 %r7, %r6, %r19;
shl.b32 %r34, %r25, 5;
setp.ge.s32 %p4, %r7, %r34;
@%p4 bra $L__BB0_3;
mov.u32 %r35, %tid.x;
shl.b32 %r8, %r35, 2;
mov.u32 %r36, %ntid.x;
mov.u32 %r37, %ctaid.x;
mul.lo.s32 %r38, %r36, %r37;
shl.b32 %r9, %r38, 2;
add.s32 %r39, %r8, %r9;
or.b32 %r40, %r39, 3;
mul.lo.s32 %r10, %r26, %r23;
setp.lt.s32 %p5, %r40, %r10;
@%p5 bra $L__BB0_11;
bra.uni $L__BB0_3;
$L__BB0_11:
mul.wide.s32 %rd20, %r7, 4;
add.s64 %rd21, %rd2, %rd20;
ld.global.f32 %f34, [%rd21];
add.s64 %rd22, %rd1, %rd20;
ld.global.f32 %f35, [%rd22];
div.s32 %r74, %r6, %r19;
shl.b32 %r75, %r74, 5;
div.s32 %r76, %r7, %r25;
add.s32 %r77, %r76, %r75;
mul.wide.s32 %rd23, %r77, 4;
add.s64 %rd24, %rd3, %rd23;
ld.global.f32 %f36, [%rd24];
add.s32 %r78, %r9, %r8;
mad.lo.s32 %r79, %r10, %r6, %r78;
mul.wide.s32 %rd25, %r79, 4;
add.s64 %rd18, %rd4, %rd25;
// begin inline asm
ld.global.cs.v4.u32 {%r68,%r69,%r70,%r71}, [%rd18];
// end inline asm
mov.b32 %f37, %r68;
div.rn.f32 %f38, %f37, %f36;
fma.rn.f32 %f30, %f34, %f38, %f35;
mov.b32 %f39, %r69;
div.rn.f32 %f40, %f39, %f36;
fma.rn.f32 %f31, %f34, %f40, %f35;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f31;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f30;}
// end inline asm
mov.b32 %r72, {%rs7, %rs8};
mov.b32 %f41, %r70;
div.rn.f32 %f42, %f41, %f36;
fma.rn.f32 %f32, %f34, %f42, %f35;
mov.b32 %f43, %r71;
div.rn.f32 %f44, %f43, %f36;
fma.rn.f32 %f33, %f34, %f44, %f35;
// begin inline asm
{ cvt.rn.f16.f32 %rs10, %f33;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs9, %f32;}
// end inline asm
mov.b32 %r73, {%rs9, %rs10};
mul.wide.s32 %rd26, %r79, 2;
add.s64 %rd19, %rd5, %rd26;
// begin inline asm
st.global.cs.v2.s32 [%rd19], {%r72,%r73};
// end inline asm
bra.uni $L__BB0_12;
$L__BB0_3:
setp.ge.s32 %p6, %r6, %r5;
rem.s32 %r11, %r6, %r19;
mul.wide.s32 %rd9, %r11, 4;
add.s64 %rd10, %rd2, %rd9;
ld.global.f32 %f1, [%rd10];
add.s64 %rd11, %rd1, %rd9;
ld.global.f32 %f2, [%rd11];
shl.b32 %r41, %r25, 5;
setp.ge.s32 %p7, %r11, %r41;
mov.f32 %f50, 0f00000000;
or.pred %p8, %p6, %p7;
mov.f32 %f49, %f50;
@%p8 bra $L__BB0_5;
div.s32 %r42, %r6, %r19;
shl.b32 %r43, %r42, 5;
div.s32 %r44, %r11, %r25;
add.s32 %r45, %r44, %r43;
mul.wide.s32 %rd12, %r45, 4;
add.s64 %rd13, %rd3, %rd12;
ld.global.f32 %f49, [%rd13];
$L__BB0_5:
not.pred %p9, %p1;
mov.f32 %f51, %f50;
mov.f32 %f52, %f50;
mov.f32 %f53, %f50;
@%p9 bra $L__BB0_8;
mov.u32 %r46, %tid.x;
shl.b32 %r12, %r46, 2;
mov.u32 %r47, %ntid.x;
mov.u32 %r48, %ctaid.x;
mul.lo.s32 %r49, %r47, %r48;
shl.b32 %r13, %r49, 2;
add.s32 %r50, %r12, %r13;
or.b32 %r51, %r50, 3;
mul.lo.s32 %r14, %r26, %r23;
setp.ge.s32 %p10, %r51, %r14;
mov.f32 %f51, %f50;
mov.f32 %f52, %f50;
mov.f32 %f53, %f50;
@%p10 bra $L__BB0_8;
add.s32 %r56, %r13, %r12;
mad.lo.s32 %r57, %r14, %r6, %r56;
mul.wide.s32 %rd15, %r57, 4;
add.s64 %rd14, %rd4, %rd15;
// begin inline asm
ld.global.cs.v4.u32 {%r52,%r53,%r54,%r55}, [%rd14];
// end inline asm
mov.b32 %f53, %r52;
mov.b32 %f52, %r53;
mov.b32 %f51, %r54;
mov.b32 %f50, %r55;
$L__BB0_8:
div.rn.f32 %f26, %f53, %f49;
fma.rn.f32 %f22, %f1, %f26, %f2;
div.rn.f32 %f27, %f52, %f49;
fma.rn.f32 %f23, %f1, %f27, %f2;
// begin inline asm
{ cvt.rn.f16.f32 %rs4, %f23;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs3, %f22;}
// end inline asm
mov.b32 %r15, {%rs3, %rs4};
div.rn.f32 %f28, %f51, %f49;
fma.rn.f32 %f24, %f1, %f28, %f2;
// begin inline asm
{ cvt.rn.f16.f32 %rs5, %f24;}
// end inline asm
div.rn.f32 %f29, %f50, %f49;
fma.rn.f32 %f25, %f1, %f29, %f2;
// begin inline asm
{ cvt.rn.f16.f32 %rs6, %f25;}
// end inline asm
@%p9 bra $L__BB0_12;
mov.u32 %r58, %tid.x;
shl.b32 %r16, %r58, 2;
mov.u32 %r59, %ntid.x;
mov.u32 %r60, %ctaid.x;
mul.lo.s32 %r61, %r59, %r60;
shl.b32 %r17, %r61, 2;
add.s32 %r62, %r16, %r17;
or.b32 %r63, %r62, 3;
mul.lo.s32 %r18, %r26, %r23;
setp.ge.s32 %p12, %r63, %r18;
@%p12 bra $L__BB0_12;
add.s32 %r66, %r17, %r16;
mad.lo.s32 %r67, %r18, %r6, %r66;
mul.wide.s32 %rd17, %r67, 2;
add.s64 %rd16, %rd5, %rd17;
mov.b32 %r65, {%rs5, %rs6};
// begin inline asm
st.global.cs.v2.s32 [%rd16], {%r15,%r65};
// end inline asm
$L__BB0_12:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_2[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_4[40]
)
{
.reg .pred %p<13>;
.reg .b16 %rs<11>;
.reg .f32 %f<54>;
.reg .b32 %r<80>;
.reg .b64 %rd<27>;
ld.param.v2.u32 {%r19, %r20}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_1+8];
ld.param.v2.u32 {%r21, %r22}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3+8];
ld.param.v2.u32 {%r23, %r24}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3+24];
ld.param.v2.u32 {%r25, %r26}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3+16];
ld.param.u64 %rd5, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_4];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_2];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_1];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_0];
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd7;
cvta.to.global.u64 %rd3, %rd6;
ld.param.u64 %rd4, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_380_cu_b415bb8a_1601111nvfuser_380ENS_6TensorIfLi1ELi1EEES1_NS0_IfLi5ELi5EEES2_NS0_INS_6__halfELi4ELi4EEE_param_3];
mul.lo.s32 %r5, %r21, %r19;
mov.u32 %r28, %ntid.y;
add.s32 %r29, %r28, %r5;
add.s32 %r30, %r29, -1;
div.s32 %r31, %r30, %r28;
mov.u32 %r32, %ctaid.y;
setp.lt.s32 %p2, %r32, %r31;
mov.u32 %r33, %tid.y;
mad.lo.s32 %r6, %r28, %r32, %r33;
setp.lt.s32 %p3, %r6, %r5;
and.pred %p1, %p3, %p2;
@%p1 bra $L__BB0_1;
bra.uni $L__BB0_3;
$L__BB0_1:
rem.s32 %r7, %r6, %r19;
shl.b32 %r34, %r25, 5;
setp.ge.s32 %p4, %r7, %r34;
@%p4 bra $L__BB0_3;
mov.u32 %r35, %tid.x;
shl.b32 %r8, %r35, 2;
mov.u32 %r36, %ntid.x;
mov.u32 %r37, %ctaid.x;
mul.lo.s32 %r38, %r36, %r37;
shl.b32 %r9, %r38, 2;
add.s32 %r39, %r8, %r9;
or.b32 %r40, %r39, 3;
mul.lo.s32 %r10, %r26, %r23;
setp.lt.s32 %p5, %r40, %r10;
@%p5 bra $L__BB0_11;
bra.uni $L__BB0_3;
$L__BB0_11:
mul.wide.s32 %rd20, %r7, 4;
add.s64 %rd21, %rd2, %rd20;
ld.global.f32 %f34, [%rd21];
add.s64 %rd22, %rd1, %rd20;
ld.global.f32 %f35, [%rd22];
div.s32 %r74, %r6, %r19;
shl.b32 %r75, %r74, 5;
div.s32 %r76, %r7, %r25;
add.s32 %r77, %r76, %r75;
mul.wide.s32 %rd23, %r77, 4;
add.s64 %rd24, %rd3, %rd23;
ld.global.f32 %f36, [%rd24];
add.s32 %r78, %r9, %r8;
mad.lo.s32 %r79, %r10, %r6, %r78;
mul.wide.s32 %rd25, %r79, 4;
add.s64 %rd18, %rd4, %rd25;
// begin inline asm
ld.global.cs.v4.u32 {%r68,%r69,%r70,%r71}, [%rd18];
// end inline asm
mov.b32 %f37, %r68;
div.rn.f32 %f38, %f37, %f36;
fma.rn.f32 %f30, %f34, %f38, %f35;
mov.b32 %f39, %r69;
div.rn.f32 %f40, %f39, %f36;
fma.rn.f32 %f31, %f34, %f40, %f35;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f31;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f30;}
// end inline asm
mov.b32 %r72, {%rs7, %rs8};
mov.b32 %f41, %r70;
div.rn.f32 %f42, %f41, %f36;
fma.rn.f32 %f32, %f34, %f42, %f35;
mov.b32 %f43, %r71;
div.rn.f32 %f44, %f43, %f36;
fma.rn.f32 %f33, %f34, %f44, %f35;
// begin inline asm
{ cvt.rn.f16.f32 %rs10, %f33;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs9, %f32;}
// end inline asm
mov.b32 %r73, {%rs9, %rs10};
mul.wide.s32 %rd26, %r79, 2;
add.s64 %rd19, %rd5, %rd26;
// begin inline asm
st.global.cs.v2.s32 [%rd19], {%r72,%r73};
// end inline asm
bra.uni $L__BB0_12;
$L__BB0_3:
setp.ge.s32 %p6, %r6, %r5;
rem.s32 %r11, %r6, %r19;
mul.wide.s32 %rd9, %r11, 4;
add.s64 %rd10, %rd2, %rd9;
ld.global.f32 %f1, [%rd10];
add.s64 %rd11, %rd1, %rd9;
ld.global.f32 %f2, [%rd11];
shl.b32 %r41, %r25, 5;
setp.ge.s32 %p7, %r11, %r41;
mov.f32 %f50, 0f00000000;
or.pred %p8, %p6, %p7;
mov.f32 %f49, %f50;
@%p8 bra $L__BB0_5;
div.s32 %r42, %r6, %r19;
shl.b32 %r43, %r42, 5;
div.s32 %r44, %r11, %r25;
add.s32 %r45, %r44, %r43;
mul.wide.s32 %rd12, %r45, 4;
add.s64 %rd13, %rd3, %rd12;
ld.global.f32 %f49, [%rd13];
$L__BB0_5:
not.pred %p9, %p1;
mov.f32 %f51, %f50;
mov.f32 %f52, %f50;
mov.f32 %f53, %f50;
@%p9 bra $L__BB0_8;
mov.u32 %r46, %tid.x;
shl.b32 %r12, %r46, 2;
mov.u32 %r47, %ntid.x;
mov.u32 %r48, %ctaid.x;
mul.lo.s32 %r49, %r47, %r48;
shl.b32 %r13, %r49, 2;
add.s32 %r50, %r12, %r13;
or.b32 %r51, %r50, 3;
mul.lo.s32 %r14, %r26, %r23;
setp.ge.s32 %p10, %r51, %r14;
mov.f32 %f51, %f50;
mov.f32 %f52, %f50;
mov.f32 %f53, %f50;
@%p10 bra $L__BB0_8;
add.s32 %r56, %r13, %r12;
mad.lo.s32 %r57, %r14, %r6, %r56;
mul.wide.s32 %rd15, %r57, 4;
add.s64 %rd14, %rd4, %rd15;
// begin inline asm
ld.global.cs.v4.u32 {%r52,%r53,%r54,%r55}, [%rd14];
// end inline asm
mov.b32 %f53, %r52;
mov.b32 %f52, %r53;
mov.b32 %f51, %r54;
mov.b32 %f50, %r55;
$L__BB0_8:
div.rn.f32 %f26, %f53, %f49;
fma.rn.f32 %f22, %f1, %f26, %f2;
div.rn.f32 %f27, %f52, %f49;
fma.rn.f32 %f23, %f1, %f27, %f2;
// begin inline asm
{ cvt.rn.f16.f32 %rs4, %f23;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs3, %f22;}
// end inline asm
mov.b32 %r15, {%rs3, %rs4};
div.rn.f32 %f28, %f51, %f49;
fma.rn.f32 %f24, %f1, %f28, %f2;
// begin inline asm
{ cvt.rn.f16.f32 %rs5, %f24;}
// end inline asm
div.rn.f32 %f29, %f50, %f49;
fma.rn.f32 %f25, %f1, %f29, %f2;
// begin inline asm
{ cvt.rn.f16.f32 %rs6, %f25;}
// end inline asm
@%p9 bra $L__BB0_12;
mov.u32 %r58, %tid.x;
shl.b32 %r16, %r58, 2;
mov.u32 %r59, %ntid.x;
mov.u32 %r60, %ctaid.x;
mul.lo.s32 %r61, %r59, %r60;
shl.b32 %r17, %r61, 2;
add.s32 %r62, %r16, %r17;
or.b32 %r63, %r62, 3;
mul.lo.s32 %r18, %r26, %r23;
setp.ge.s32 %p12, %r63, %r18;
@%p12 bra $L__BB0_12;
add.s32 %r66, %r17, %r16;
mad.lo.s32 %r67, %r18, %r6, %r66;
mul.wide.s32 %rd17, %r67, 2;
add.s64 %rd16, %rd5, %rd17;
mov.b32 %r65, {%rs5, %rs6};
// begin inline asm
st.global.cs.v2.s32 [%rd16], {%r15,%r65};
// end inline asm
$L__BB0_12:
ret;
}
10: GpuViewTest.GroupNormReshapeMovedToOutput
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 38→ 35
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__half, 4, 4> T0, Tensor<float, 1, 1> T1, Tensor<float, 1, 1> T2, Tensor<__half, 5, 5> T16) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
Array<float, 8, 1> T4;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
Array<__half, 8, 8> T17;
T17.set(__half(0));
loadGlobalToLocal<__half, /*vec_size=*/8, /*is_volatile=*/false, CacheOp::Streaming>(&T17[0], &T0[(((8 * ((nvfuser_index_t)threadIdx.x)) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * T0.logical_size[1LL]) * (((nvfuser_index_t)blockIdx.x) / 32))) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * (ceilDiv(T0.logical_size[1LL], 32))) * (((nvfuser_index_t)blockIdx.x) % 32)))]);
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 8; ++i0) {
Array<float, 1, 1> T3;
T3[0]
= __half2float(T17[i0]);
T4[i0]
= T3[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T17;
T17.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
loadGlobalToLocal<__half, /*vec_size=*/8, /*is_volatile=*/false, CacheOp::Streaming>(&T17[0], &T0[(((8 * ((nvfuser_index_t)threadIdx.x)) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * T0.logical_size[1LL]) * (((nvfuser_index_t)blockIdx.x) / 32))) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * (ceilDiv(T0.logical_size[1LL], 32))) * (((nvfuser_index_t)blockIdx.x) % 32)))]);
}
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 8; ++i0) {
Array<float, 1, 1> T3;
T3[0]
= __half2float(T17[i0]);
T4[i0]
= T3[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<float, 1, 1> T5;
T5[0] = 0.000000000e+00f;
Array<float, 1, 1> T21;
T21[0] = 0.000000000e+00f;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])))) {
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 8; ++i1) {
T21[0]
= T21[0]
+ T4[i1];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 8; ++i1) {
if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))))) {
T21[0]
= T21[0]
+ T4[i1];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
warp::warpReduceTIDX<false, true>(T5[0], T21[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]))) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
Array<__half, 8, 8> T20;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 8; ++i2) {
Array<float, 1, 1> T18;
T18[0]
= T1[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
= T18[0];
Array<float, 1, 1> T10;
T10[0]
= T8[0];
Array<float, 1, 1> T19;
T19[0]
= T2[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T19[0];
Array<float, 1, 1> T11;
T11[0]
= T9[0];
Array<float, 1, 1> T7;
T7[0]
= T4[i2]
/ T6[0];
Array<float, 1, 1> T12;
T12[0]
= T7[0]
* T10[0];
Array<float, 1, 1> T13;
T13[0]
= T12[0]
+ T11[0];
Array<__half, 1, 1> T14;
T14[0]
= __float2half(T13[0]);
T20[i2]
= T14[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T16[((8 * ((nvfuser_index_t)threadIdx.x)) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * (ceilDiv(T0.logical_size[1LL], 32))) * ((nvfuser_index_t)blockIdx.x)))], &T20[0]);
} else {
Array<__half, 8, 8> T20;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 8; ++i2) {
Array<float, 1, 1> T18;
if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
T18[0]
= T1[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
}
Array<float, 1, 1> T8;
T8[0] = 0;
if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
T8[0]
= T18[0];
}
Array<float, 1, 1> T10;
T10[0]
= T8[0];
Array<float, 1, 1> T19;
if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
T19[0]
= T2[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
}
Array<float, 1, 1> T9;
T9[0] = 0;
if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
T9[0]
= T19[0];
}
Array<float, 1, 1> T11;
T11[0]
= T9[0];
Array<float, 1, 1> T7;
T7[0]
= T4[i2]
/ T6[0];
Array<float, 1, 1> T12;
T12[0]
= T7[0]
* T10[0];
Array<float, 1, 1> T13;
T13[0]
= T12[0]
+ T11[0];
Array<__half, 1, 1> T14;
T14[0]
= __float2half(T13[0]);
T20[i2]
= T14[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T16[((8 * ((nvfuser_index_t)threadIdx.x)) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * (ceilDiv(T0.logical_size[1LL], 32))) * ((nvfuser_index_t)blockIdx.x)))], &T20[0]);
}
}
}
// NVFuser auto-generated fused kernel (codegen dump; do not hand-edit).
//
// Observable pipeline, in order of the code below:
//   1) One 8-wide vectorized load of __half input T0 into registers (T17),
//      converted element-wise to float into T4.
//   2) Per-thread partial sum of the 8 values (T21), reduced across
//      threadIdx.x by warp::warpReduceTIDX (reduction op: a = a + b) into T5.
//   3) broadcast::blockBroadcast publishes the reduced sum to all threads (T6).
//   4) Per element: (T4[i] / T6[0]) * T1[row] + T2[row], cast back to __half
//      into T20.
//   5) One 8-wide vectorized store of T20 to output T16.
// Every stage is emitted twice: a fast path whose entry guard proves all 8
// vector lanes are in bounds (so the inner loop runs unpredicated), and a
// peeled fallback path that re-checks predicates around individual accesses.
//
// NOTE(review): the arithmetic suggests a divide-by-sum normalization followed
// by a per-row affine transform — inferred from the visible ops only; confirm
// against the originating fusion definition.
__global__ void nvfuser_N(Tensor<__half, 4, 4> T0, Tensor<float, 1, 1> T1, Tensor<float, 1, 1> T2, Tensor<__half, 5, 5> T16) {
// Dynamic shared memory; reused by both the warp reduction and the broadcast.
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO;
// Per-thread register buffer for the 8 float-converted input elements.
Array<float, 8, 1> T4;
// Fast path: the guard checks both the flat extent (via the thread's last
// lane, 8*tid+7) and the row index derived from that lane, so the loop body
// needs no per-element predication.
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
Array<__half, 8, 8> T17;
T17.set(__half(0));
// Single 8-wide streaming load from global T0 into registers.
loadGlobalToLocal<__half, /*vec_size=*/8, /*is_volatile=*/false, CacheOp::Streaming>(&T17[0], &T0[(((8 * ((nvfuser_index_t)threadIdx.x)) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * T0.logical_size[1LL]) * (((nvfuser_index_t)blockIdx.x) / 32))) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * (ceilDiv(T0.logical_size[1LL], 32))) * (((nvfuser_index_t)blockIdx.x) % 32)))]);
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 8; ++i0) {
// Widen each __half element to float (T3 is a scratch register).
Array<float, 1, 1> T3;
T3[0]
= __half2float(T17[i0]);
T4[i0]
= T3[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Peeled path: only the load itself is guarded. T17 is zero-initialized
// first, so the unpredicated conversion loop below is safe for threads
// whose load was skipped.
} else {
Array<__half, 8, 8> T17;
T17.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
loadGlobalToLocal<__half, /*vec_size=*/8, /*is_volatile=*/false, CacheOp::Streaming>(&T17[0], &T0[(((8 * ((nvfuser_index_t)threadIdx.x)) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * T0.logical_size[1LL]) * (((nvfuser_index_t)blockIdx.x) / 32))) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * (ceilDiv(T0.logical_size[1LL], 32))) * (((nvfuser_index_t)blockIdx.x) % 32)))]);
}
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 8; ++i0) {
Array<float, 1, 1> T3;
T3[0]
= __half2float(T17[i0]);
T4[i0]
= T3[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// T5 receives the warp-reduced sum; T21 accumulates this thread's partial
// sum over its 8 elements. Both start at the reduction identity (0.0f).
Array<float, 1, 1> T5;
T5[0] = 0.000000000e+00f;
Array<float, 1, 1> T21;
T21[0] = 0.000000000e+00f;
// Fast path: accumulate all 8 lanes unpredicated when the guard holds.
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])))) {
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 8; ++i1) {
T21[0]
= T21[0]
+ T4[i1];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Peeled path: the same predicate is re-tested inside the loop (it is
// loop-invariant, so out-of-bounds threads simply keep T21 == 0).
} else {
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 8; ++i1) {
if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))))) {
T21[0]
= T21[0]
+ T4[i1];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Reduce T21 across threadIdx.x into T5 (shuffle-based warp reduction plus a
// shared-memory step; the lambda is the sum operator, identity 0.0f).
warp::warpReduceTIDX<false, true>(T5[0], T21[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
// Publish the reduced sum to every thread of the block as T6.
Array<float, 1, 1> T6;
broadcast::blockBroadcast<true, false, false, true>(T6[0], T5[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
// Epilogue fast path: all 8 lanes in bounds on both the flat extent and the
// row bound, so the T1/T2 gathers and the final store run unpredicated.
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]))) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
Array<__half, 8, 8> T20;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 8; ++i2) {
// Gather the per-row scale (from T1) for this element's row index.
Array<float, 1, 1> T18;
T18[0]
= T1[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
= T18[0];
Array<float, 1, 1> T10;
T10[0]
= T8[0];
// Gather the per-row bias (from T2) for the same row index.
Array<float, 1, 1> T19;
T19[0]
= T2[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T19[0];
Array<float, 1, 1> T11;
T11[0]
= T9[0];
// Normalize the element by the block-wide sum.
Array<float, 1, 1> T7;
T7[0]
= T4[i2]
/ T6[0];
// result = normalized * scale + bias, then narrow back to __half.
Array<float, 1, 1> T12;
T12[0]
= T7[0]
* T10[0];
Array<float, 1, 1> T13;
T13[0]
= T12[0]
+ T11[0];
Array<__half, 1, 1> T14;
T14[0]
= __float2half(T13[0]);
T20[i2]
= T14[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Single 8-wide vectorized store of the results to global T16.
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T16[((8 * ((nvfuser_index_t)threadIdx.x)) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * (ceilDiv(T0.logical_size[1LL], 32))) * ((nvfuser_index_t)blockIdx.x)))], &T20[0]);
// Peeled epilogue: predicates wrap the individual T1/T2 gathers and the
// consuming copies; the final store is guarded separately below.
//
// NOTE(review): relative to the reference build, the original conjunctive
// guard was split here — T18/T19 are loaded under only the row bound
// (logical_size[1LL]) while T8/T9 consume them under only the flat-extent
// bound. If exactly one of the two predicates holds for a thread, T8/T9
// copy an UNINITIALIZED T18/T19 into the arithmetic below. The store at
// the end is still guarded by the flat-extent bound, so verify whether a
// lane that passes the store guard can ever have taken the
// row-bound-false branch (which would let an indeterminate value reach
// T16). Confirm this predicate split is intentional and provably safe.
} else {
Array<__half, 8, 8> T20;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 8; ++i2) {
Array<float, 1, 1> T18;
// Guard: row index (from the thread's last lane) within logical_size[1].
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL])) {
T18[0]
= T1[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
}
Array<float, 1, 1> T8;
T8[0] = 0;
// Guard: thread's last lane within the flat extent.
if (((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]))) {
T8[0]
= T18[0];
}
Array<float, 1, 1> T10;
T10[0]
= T8[0];
Array<float, 1, 1> T19;
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL])) {
T19[0]
= T2[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
}
Array<float, 1, 1> T9;
T9[0] = 0;
if (((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]))) {
T9[0]
= T19[0];
}
Array<float, 1, 1> T11;
T11[0]
= T9[0];
// Same normalize / scale / bias / narrow sequence as the fast path.
Array<float, 1, 1> T7;
T7[0]
= T4[i2]
/ T6[0];
Array<float, 1, 1> T12;
T12[0]
= T7[0]
* T10[0];
Array<float, 1, 1> T13;
T13[0]
= T12[0]
+ T11[0];
Array<__half, 1, 1> T14;
T14[0]
= __float2half(T13[0]);
T20[i2]
= T14[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Store is guarded by the flat-extent bound on the thread's last lane.
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]), 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T16[((8 * ((nvfuser_index_t)threadIdx.x)) + (((T0.logical_size[3LL] * T0.logical_size[2LL]) * (ceilDiv(T0.logical_size[1LL], 32))) * ((nvfuser_index_t)blockIdx.x)))], &T20[0]);
}
}
}
--- 53997da5d
+++ 03a1b695e
@@ -105,31 +105,31 @@
} else {
Array<__half, 8, 8> T20;
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 8; ++i2) {
Array<float, 1, 1> T18;
- if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
+ if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL])) {
T18[0]
= T1[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
}
Array<float, 1, 1> T8;
T8[0] = 0;
- if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
+ if (((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]))) {
T8[0]
= T18[0];
}
Array<float, 1, 1> T10;
T10[0]
= T8[0];
Array<float, 1, 1> T19;
- if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
+ if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL])) {
T19[0]
= T2[(((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32)) + (((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 + nvfuser_zero)) / (T0.logical_size[2LL] * T0.logical_size[3LL])))];
}
Array<float, 1, 1> T9;
T9[0] = 0;
- if ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL])) && ((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) / (T0.logical_size[2LL] * T0.logical_size[3LL])) + ((ceilDiv(T0.logical_size[1LL], 32)) * (((nvfuser_index_t)blockIdx.x) % 32))) < T0.logical_size[1LL]))) {
+ if (((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < (((ceilDiv(T0.logical_size[1LL], 32)) * T0.logical_size[2LL]) * T0.logical_size[3LL]))) {
T9[0]
= T19[0];
}
Array<float, 1, 1> T11;
T11[0]
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_2[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_3[48]
)
{
.reg .pred %p<96>;
.reg .b16 %rs<47>;
.reg .f32 %f<263>;
.reg .b32 %r<598>;
.reg .b64 %rd<82>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r63, %r64}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0+8];
ld.param.v2.u32 {%r65, %r66}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0+16];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_3];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_2];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0];
cvta.to.global.u64 %rd2, %rd8;
cvta.to.global.u64 %rd3, %rd7;
mov.u32 %r6, %tid.x;
setp.ne.s32 %p1, %r6, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r77, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s], %r77;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd9, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r78, [%rd9], %r6;
ld.shared.u32 %r7, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_1911011nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s];
add.s32 %r79, %r64, 31;
shr.s32 %r80, %r79, 31;
shr.u32 %r81, %r80, 27;
add.s32 %r82, %r79, %r81;
shr.s32 %r8, %r82, 5;
mul.lo.s32 %r9, %r66, %r65;
mul.lo.s32 %r10, %r9, %r8;
add.s32 %r83, %r10, 7;
shr.s32 %r84, %r83, 31;
shr.u32 %r85, %r84, 29;
add.s32 %r86, %r83, %r85;
shr.s32 %r11, %r86, 3;
setp.ge.s32 %p2, %r6, %r11;
@%p2 bra $L__BB0_4;
shl.b32 %r12, %r6, 3;
or.b32 %r87, %r12, 7;
div.s32 %r88, %r87, %r9;
mov.u32 %r13, %ctaid.x;
shr.s32 %r89, %r13, 31;
shr.u32 %r90, %r89, 27;
add.s32 %r91, %r13, %r90;
and.b32 %r92, %r91, -32;
sub.s32 %r93, %r13, %r92;
mul.lo.s32 %r14, %r8, %r93;
add.s32 %r94, %r88, %r14;
setp.lt.s32 %p3, %r94, %r64;
@%p3 bra $L__BB0_8;
bra.uni $L__BB0_4;
$L__BB0_8:
shr.s32 %r120, %r91, 5;
mad.lo.s32 %r121, %r64, %r120, %r14;
mad.lo.s32 %r122, %r9, %r121, %r12;
mul.wide.s32 %rd13, %r122, 2;
add.s64 %rd12, %rd1, %rd13;
// begin inline asm
ld.global.cs.v4.u32 {%r113,%r114,%r115,%r116}, [%rd12];
// end inline asm
mov.b32 {%rs23, %rs24}, %r113;
// begin inline asm
{ cvt.f32.f16 %f227, %rs23;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f226, %rs24;}
// end inline asm
mov.b32 {%rs25, %rs26}, %r114;
// begin inline asm
{ cvt.f32.f16 %f225, %rs25;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f224, %rs26;}
// end inline asm
mov.b32 {%rs27, %rs28}, %r115;
// begin inline asm
{ cvt.f32.f16 %f223, %rs27;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f222, %rs28;}
// end inline asm
mov.b32 {%rs29, %rs30}, %r116;
// begin inline asm
{ cvt.f32.f16 %f221, %rs29;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f220, %rs30;}
// end inline asm
bra.uni $L__BB0_9;
$L__BB0_4:
mov.f32 %f98, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs13, %f98;}
// end inline asm
mov.b32 %r594, {%rs13, %rs13};
mov.u32 %r595, %r594;
mov.u32 %r596, %r594;
mov.u32 %r597, %r594;
@%p2 bra $L__BB0_7;
shl.b32 %r16, %r6, 3;
or.b32 %r95, %r16, 7;
div.s32 %r96, %r95, %r9;
mov.u32 %r17, %ctaid.x;
shr.s32 %r97, %r17, 31;
shr.u32 %r98, %r97, 27;
add.s32 %r99, %r17, %r98;
and.b32 %r100, %r99, -32;
sub.s32 %r101, %r17, %r100;
mul.lo.s32 %r18, %r8, %r101;
add.s32 %r102, %r96, %r18;
setp.ge.s32 %p5, %r102, %r64;
mov.u32 %r595, %r594;
mov.u32 %r596, %r594;
mov.u32 %r597, %r594;
@%p5 bra $L__BB0_7;
shr.s32 %r110, %r99, 5;
mad.lo.s32 %r111, %r64, %r110, %r18;
mad.lo.s32 %r112, %r9, %r111, %r16;
mul.wide.s32 %rd11, %r112, 2;
add.s64 %rd10, %rd1, %rd11;
// begin inline asm
ld.global.cs.v4.u32 {%r597,%r596,%r595,%r594}, [%rd10];
// end inline asm
$L__BB0_7:
mov.b32 {%rs14, %rs15}, %r597;
// begin inline asm
{ cvt.f32.f16 %f227, %rs14;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f226, %rs15;}
// end inline asm
mov.b32 {%rs16, %rs17}, %r596;
// begin inline asm
{ cvt.f32.f16 %f225, %rs16;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f224, %rs17;}
// end inline asm
mov.b32 {%rs18, %rs19}, %r595;
// begin inline asm
{ cvt.f32.f16 %f223, %rs18;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f222, %rs19;}
// end inline asm
mov.b32 {%rs20, %rs21}, %r594;
// begin inline asm
{ cvt.f32.f16 %f221, %rs20;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f220, %rs21;}
// end inline asm
$L__BB0_9:
setp.lt.s32 %p6, %r6, %r11;
@%p6 bra $L__BB0_10;
bra.uni $L__BB0_11;
$L__BB0_10:
shl.b32 %r123, %r6, 3;
or.b32 %r124, %r123, 7;
setp.lt.s32 %p7, %r124, %r10;
@%p7 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.f32 %f124, %f227, 0f00000000;
add.f32 %f125, %f124, %f226;
add.f32 %f126, %f125, %f225;
add.f32 %f127, %f126, %f224;
add.f32 %f128, %f127, %f223;
add.f32 %f129, %f128, %f222;
add.f32 %f130, %f129, %f221;
add.f32 %f228, %f130, %f220;
bra.uni $L__BB0_13;
$L__BB0_11:
shl.b32 %r125, %r6, 3;
or.b32 %r126, %r125, 7;
setp.lt.s32 %p9, %r126, %r10;
and.pred %p10, %p9, %p6;
add.f32 %f116, %f227, 0f00000000;
add.f32 %f117, %f116, %f226;
add.f32 %f118, %f117, %f225;
add.f32 %f119, %f118, %f224;
add.f32 %f120, %f119, %f223;
add.f32 %f121, %f120, %f222;
add.f32 %f122, %f121, %f221;
add.f32 %f123, %f122, %f220;
selp.f32 %f228, %f123, 0f00000000, %p10;
$L__BB0_13:
mov.b32 %r127, %f228;
mov.u32 %r128, 31;
mov.u32 %r129, 16;
mov.u32 %r130, -1;
shfl.sync.bfly.b32 %r131|%p11, %r127, %r129, %r128, %r130;
mov.b32 %f131, %r131;
add.f32 %f132, %f228, %f131;
mov.b32 %r132, %f132;
mov.u32 %r133, 8;
shfl.sync.bfly.b32 %r134|%p12, %r132, %r133, %r128, %r130;
mov.b32 %f133, %r134;
add.f32 %f134, %f132, %f133;
mov.b32 %r135, %f134;
mov.u32 %r136, 4;
shfl.sync.bfly.b32 %r137|%p13, %r135, %r136, %r128, %r130;
mov.b32 %f135, %r137;
add.f32 %f136, %f134, %f135;
mov.b32 %r138, %f136;
mov.u32 %r139, 2;
shfl.sync.bfly.b32 %r140|%p14, %r138, %r139, %r128, %r130;
mov.b32 %f137, %r140;
add.f32 %f138, %f136, %f137;
mov.b32 %r141, %f138;
mov.u32 %r142, 1;
shfl.sync.bfly.b32 %r143|%p15, %r141, %r142, %r128, %r130;
mov.b32 %f139, %r143;
add.f32 %f230, %f138, %f139;
shl.b32 %r27, %r7, 2;
shr.u32 %r28, %r6, 5;
mov.u32 %r144, %tid.z;
mov.u32 %r145, %ntid.y;
mov.u32 %r146, %tid.y;
mad.lo.s32 %r29, %r144, %r145, %r146;
and.b32 %r30, %r6, 31;
setp.ne.s32 %p16, %r30, 0;
mov.u32 %r147, %ntid.x;
shr.u32 %r31, %r147, 5;
mul.lo.s32 %r32, %r29, %r31;
bar.sync 0;
@%p16 bra $L__BB0_15;
add.s32 %r148, %r32, %r28;
mul.wide.u32 %rd14, %r148, 4;
mov.u64 %rd15, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_191105arrayE;
add.s64 %rd16, %rd15, %rd14;
st.shared.f32 [%rd16], %f230;
$L__BB0_15:
bar.sync 0;
setp.ne.s32 %p17, %r28, 0;
@%p17 bra $L__BB0_19;
setp.ge.u32 %p18, %r30, %r31;
mov.f32 %f229, 0f00000000;
@%p18 bra $L__BB0_18;
add.s32 %r149, %r32, %r30;
mul.wide.u32 %rd17, %r149, 4;
mov.u64 %rd18, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_191105arrayE;
add.s64 %rd19, %rd18, %rd17;
ld.shared.f32 %f229, [%rd19];
$L__BB0_18:
mov.b32 %r150, %f229;
mov.u32 %r151, 31;
mov.u32 %r152, 16;
mov.u32 %r153, -1;
shfl.sync.bfly.b32 %r154|%p19, %r150, %r152, %r151, %r153;
mov.b32 %f141, %r154;
add.f32 %f142, %f229, %f141;
mov.b32 %r155, %f142;
mov.u32 %r156, 8;
shfl.sync.bfly.b32 %r157|%p20, %r155, %r156, %r151, %r153;
mov.b32 %f143, %r157;
add.f32 %f144, %f142, %f143;
mov.b32 %r158, %f144;
mov.u32 %r159, 4;
shfl.sync.bfly.b32 %r160|%p21, %r158, %r159, %r151, %r153;
mov.b32 %f145, %r160;
add.f32 %f146, %f144, %f145;
mov.b32 %r161, %f146;
mov.u32 %r162, 2;
shfl.sync.bfly.b32 %r163|%p22, %r161, %r162, %r151, %r153;
mov.b32 %f147, %r163;
add.f32 %f148, %f146, %f147;
mov.b32 %r164, %f148;
mov.u32 %r165, 1;
shfl.sync.bfly.b32 %r166|%p23, %r164, %r165, %r151, %r153;
mov.b32 %f149, %r166;
add.f32 %f230, %f148, %f149;
$L__BB0_19:
bar.sync 0;
mul.wide.s32 %rd20, %r29, 4;
mov.u64 %rd21, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_117f2bf5_191105arrayE;
add.s64 %rd4, %rd21, %rd20;
setp.eq.s32 %p24, %r6, 0;
@%p24 bra $L__BB0_20;
bra.uni $L__BB0_21;
$L__BB0_20:
setp.eq.s32 %p25, %r30, 0;
add.f32 %f150, %f230, 0f00000000;
selp.f32 %f151, %f150, 0f00000000, %p25;
st.shared.f32 [%rd4], %f151;
$L__BB0_21:
bar.sync 0;
ld.shared.f32 %f33, [%rd4];
bar.sync 0;
@%p6 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
shl.b32 %r33, %r6, 3;
or.b32 %r167, %r33, 7;
setp.ge.s32 %p27, %r167, %r10;
@%p27 bra $L__BB0_24;
add.s32 %r168, %r33, 7;
div.s32 %r169, %r168, %r9;
mov.u32 %r34, %ctaid.x;
shr.s32 %r170, %r34, 31;
shr.u32 %r171, %r170, 27;
add.s32 %r172, %r34, %r171;
and.b32 %r173, %r172, -32;
sub.s32 %r174, %r34, %r173;
mul.lo.s32 %r35, %r8, %r174;
add.s32 %r175, %r169, %r35;
setp.lt.s32 %p28, %r175, %r64;
@%p28 bra $L__BB0_106;
bra.uni $L__BB0_24;
$L__BB0_106:
add.s32 %r569, %r27, %r33;
div.s32 %r570, %r569, %r9;
add.s32 %r571, %r570, %r35;
mul.wide.s32 %rd57, %r571, 4;
add.s64 %rd58, %rd2, %rd57;
add.s64 %rd59, %rd3, %rd57;
div.rn.f32 %f196, %f227, %f33;
ld.global.f32 %f197, [%rd58];
ld.global.f32 %f198, [%rd59];
fma.rn.f32 %f188, %f197, %f196, %f198;
add.s32 %r572, %r569, 1;
div.s32 %r573, %r572, %r9;
add.s32 %r574, %r573, %r35;
mul.wide.s32 %rd60, %r574, 4;
add.s64 %rd61, %rd2, %rd60;
add.s64 %rd62, %rd3, %rd60;
div.rn.f32 %f199, %f226, %f33;
ld.global.f32 %f200, [%rd61];
ld.global.f32 %f201, [%rd62];
fma.rn.f32 %f189, %f200, %f199, %f201;
// begin inline asm
{ cvt.rn.f16.f32 %rs39, %f188;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs40, %f189;}
// end inline asm
mov.b32 %r565, {%rs39, %rs40};
add.s32 %r575, %r569, 2;
div.s32 %r576, %r575, %r9;
add.s32 %r577, %r576, %r35;
mul.wide.s32 %rd63, %r577, 4;
add.s64 %rd64, %rd2, %rd63;
add.s64 %rd65, %rd3, %rd63;
div.rn.f32 %f202, %f225, %f33;
ld.global.f32 %f203, [%rd64];
ld.global.f32 %f204, [%rd65];
fma.rn.f32 %f190, %f203, %f202, %f204;
add.s32 %r578, %r569, 3;
div.s32 %r579, %r578, %r9;
add.s32 %r580, %r579, %r35;
mul.wide.s32 %rd66, %r580, 4;
add.s64 %rd67, %rd2, %rd66;
add.s64 %rd68, %rd3, %rd66;
div.rn.f32 %f205, %f224, %f33;
ld.global.f32 %f206, [%rd67];
ld.global.f32 %f207, [%rd68];
fma.rn.f32 %f191, %f206, %f205, %f207;
// begin inline asm
{ cvt.rn.f16.f32 %rs42, %f191;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs41, %f190;}
// end inline asm
mov.b32 %r566, {%rs41, %rs42};
add.s32 %r581, %r569, 4;
div.s32 %r582, %r581, %r9;
add.s32 %r583, %r582, %r35;
mul.wide.s32 %rd69, %r583, 4;
add.s64 %rd70, %rd2, %rd69;
add.s64 %rd71, %rd3, %rd69;
div.rn.f32 %f208, %f223, %f33;
ld.global.f32 %f209, [%rd70];
ld.global.f32 %f210, [%rd71];
fma.rn.f32 %f192, %f209, %f208, %f210;
add.s32 %r584, %r569, 5;
div.s32 %r585, %r584, %r9;
add.s32 %r586, %r585, %r35;
mul.wide.s32 %rd72, %r586, 4;
add.s64 %rd73, %rd2, %rd72;
add.s64 %rd74, %rd3, %rd72;
div.rn.f32 %f211, %f222, %f33;
ld.global.f32 %f212, [%rd73];
ld.global.f32 %f213, [%rd74];
fma.rn.f32 %f193, %f212, %f211, %f213;
// begin inline asm
{ cvt.rn.f16.f32 %rs44, %f193;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs43, %f192;}
// end inline asm
mov.b32 %r567, {%rs43, %rs44};
add.s32 %r587, %r569, 6;
div.s32 %r588, %r587, %r9;
add.s32 %r589, %r588, %r35;
mul.wide.s32 %rd75, %r589, 4;
add.s64 %rd76, %rd2, %rd75;
add.s64 %rd77, %rd3, %rd75;
div.rn.f32 %f214, %f221, %f33;
ld.global.f32 %f215, [%rd76];
ld.global.f32 %f216, [%rd77];
fma.rn.f32 %f194, %f215, %f214, %f216;
add.s32 %r590, %r569, 7;
div.s32 %r591, %r590, %r9;
add.s32 %r592, %r591, %r35;
mul.wide.s32 %rd78, %r592, 4;
add.s64 %rd79, %rd2, %rd78;
add.s64 %rd80, %rd3, %rd78;
div.rn.f32 %f217, %f220, %f33;
ld.global.f32 %f218, [%rd79];
ld.global.f32 %f219, [%rd80];
fma.rn.f32 %f195, %f218, %f217, %f219;
// begin inline asm
{ cvt.rn.f16.f32 %rs46, %f195;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs45, %f194;}
// end inline asm
mov.b32 %r568, {%rs45, %rs46};
mad.lo.s32 %r593, %r10, %r34, %r33;
mul.wide.s32 %rd81, %r593, 2;
add.s64 %rd56, %rd6, %rd81;
// begin inline asm
st.global.cs.v4.s32 [%rd56], {%r565,%r566,%r567,%r568};
// end inline asm
bra.uni $L__BB0_107;
$L__BB0_24:
shl.b32 %r36, %r6, 3;
or.b32 %r176, %r36, 7;
setp.ge.s32 %p29, %r176, %r10;
@%p29 bra $L__BB0_27;
add.s32 %r177, %r36, 7;
div.s32 %r178, %r177, %r9;
mov.u32 %r179, %ctaid.x;
shr.s32 %r180, %r179, 31;
shr.u32 %r181, %r180, 27;
add.s32 %r182, %r179, %r181;
and.b32 %r183, %r182, -32;
sub.s32 %r184, %r179, %r183;
mul.lo.s32 %r37, %r8, %r184;
add.s32 %r185, %r178, %r37;
setp.ge.s32 %p30, %r185, %r64;
@%p30 bra $L__BB0_27;
add.s32 %r186, %r27, %r36;
div.s32 %r187, %r186, %r9;
add.s32 %r188, %r187, %r37;
mul.wide.s32 %rd22, %r188, 4;
add.s64 %rd23, %rd2, %rd22;
ld.global.f32 %f235, [%rd23];
$L__BB0_27:
add.s32 %r189, %r36, 7;
setp.ge.s32 %p31, %r189, %r10;
mov.f32 %f232, 0f00000000;
@%p31 bra $L__BB0_29;
div.s32 %r191, %r189, %r9;
mov.u32 %r192, %ctaid.x;
shr.s32 %r193, %r192, 31;
shr.u32 %r194, %r193, 27;
add.s32 %r195, %r192, %r194;
and.b32 %r196, %r195, -32;
sub.s32 %r197, %r192, %r196;
mad.lo.s32 %r198, %r8, %r197, %r191;
setp.lt.s32 %p32, %r198, %r64;
selp.f32 %f232, %f235, 0f00000000, %p32;
$L__BB0_29:
@%p31 bra $L__BB0_32;
div.s32 %r201, %r189, %r9;
mov.u32 %r202, %ctaid.x;
shr.s32 %r203, %r202, 31;
shr.u32 %r204, %r203, 27;
add.s32 %r205, %r202, %r204;
and.b32 %r206, %r205, -32;
sub.s32 %r207, %r202, %r206;
mul.lo.s32 %r38, %r8, %r207;
add.s32 %r208, %r201, %r38;
setp.ge.s32 %p34, %r208, %r64;
@%p34 bra $L__BB0_32;
add.s32 %r209, %r27, %r36;
div.s32 %r210, %r209, %r9;
add.s32 %r211, %r210, %r38;
mul.wide.s32 %rd24, %r211, 4;
add.s64 %rd25, %rd3, %rd24;
ld.global.f32 %f237, [%rd25];
$L__BB0_32:
mov.f32 %f234, 0f00000000;
@%p31 bra $L__BB0_34;
div.s32 %r214, %r189, %r9;
mov.u32 %r215, %ctaid.x;
shr.s32 %r216, %r215, 31;
shr.u32 %r217, %r216, 27;
add.s32 %r218, %r215, %r217;
and.b32 %r219, %r218, -32;
sub.s32 %r220, %r215, %r219;
mad.lo.s32 %r221, %r8, %r220, %r214;
setp.lt.s32 %p36, %r221, %r64;
selp.f32 %f234, %f237, 0f00000000, %p36;
$L__BB0_34:
div.rn.f32 %f159, %f227, %f33;
fma.rn.f32 %f158, %f232, %f159, %f234;
// begin inline asm
{ cvt.rn.f16.f32 %rs31, %f158;}
// end inline asm
@%p31 bra $L__BB0_37;
div.s32 %r224, %r189, %r9;
mov.u32 %r225, %ctaid.x;
shr.s32 %r226, %r225, 31;
shr.u32 %r227, %r226, 27;
add.s32 %r228, %r225, %r227;
and.b32 %r229, %r228, -32;
sub.s32 %r230, %r225, %r229;
mul.lo.s32 %r39, %r8, %r230;
add.s32 %r231, %r224, %r39;
setp.ge.s32 %p38, %r231, %r64;
@%p38 bra $L__BB0_37;
add.s32 %r232, %r27, %r36;
add.s32 %r233, %r232, 1;
div.s32 %r234, %r233, %r9;
add.s32 %r235, %r234, %r39;
mul.wide.s32 %rd26, %r235, 4;
add.s64 %rd27, %rd2, %rd26;
ld.global.f32 %f235, [%rd27];
$L__BB0_37:
mov.f32 %f236, 0f00000000;
@%p31 bra $L__BB0_39;
div.s32 %r238, %r189, %r9;
mov.u32 %r239, %ctaid.x;
shr.s32 %r240, %r239, 31;
shr.u32 %r241, %r240, 27;
add.s32 %r242, %r239, %r241;
and.b32 %r243, %r242, -32;
sub.s32 %r244, %r239, %r243;
mad.lo.s32 %r245, %r8, %r244, %r238;
setp.lt.s32 %p40, %r245, %r64;
selp.f32 %f236, %f235, 0f00000000, %p40;
$L__BB0_39:
@%p31 bra $L__BB0_42;
div.s32 %r248, %r189, %r9;
mov.u32 %r249, %ctaid.x;
shr.s32 %r250, %r249, 31;
shr.u32 %r251, %r250, 27;
add.s32 %r252, %r249, %r251;
and.b32 %r253, %r252, -32;
sub.s32 %r254, %r249, %r253;
mul.lo.s32 %r40, %r8, %r254;
add.s32 %r255, %r248, %r40;
setp.ge.s32 %p42, %r255, %r64;
@%p42 bra $L__BB0_42;
add.s32 %r256, %r27, %r36;
add.s32 %r257, %r256, 1;
div.s32 %r258, %r257, %r9;
add.s32 %r259, %r258, %r40;
mul.wide.s32 %rd28, %r259, 4;
add.s64 %rd29, %rd3, %rd28;
ld.global.f32 %f237, [%rd29];
$L__BB0_42:
mov.f32 %f238, 0f00000000;
@%p31 bra $L__BB0_44;
div.s32 %r262, %r189, %r9;
mov.u32 %r263, %ctaid.x;
shr.s32 %r264, %r263, 31;
shr.u32 %r265, %r264, 27;
add.s32 %r266, %r263, %r265;
and.b32 %r267, %r266, -32;
sub.s32 %r268, %r263, %r267;
mad.lo.s32 %r269, %r8, %r268, %r262;
setp.lt.s32 %p44, %r269, %r64;
selp.f32 %f238, %f237, 0f00000000, %p44;
$L__BB0_44:
div.rn.f32 %f163, %f226, %f33;
fma.rn.f32 %f162, %f236, %f163, %f238;
// begin inline asm
{ cvt.rn.f16.f32 %rs32, %f162;}
// end inline asm
@%p31 bra $L__BB0_47;
div.s32 %r272, %r189, %r9;
mov.u32 %r273, %ctaid.x;
shr.s32 %r274, %r273, 31;
shr.u32 %r275, %r274, 27;
add.s32 %r276, %r273, %r275;
and.b32 %r277, %r276, -32;
sub.s32 %r278, %r273, %r277;
mul.lo.s32 %r41, %r8, %r278;
add.s32 %r279, %r272, %r41;
setp.ge.s32 %p46, %r279, %r64;
@%p46 bra $L__BB0_47;
add.s32 %r280, %r27, %r36;
add.s32 %r281, %r280, 2;
div.s32 %r282, %r281, %r9;
add.s32 %r283, %r282, %r41;
mul.wide.s32 %rd30, %r283, 4;
add.s64 %rd31, %rd2, %rd30;
ld.global.f32 %f235, [%rd31];
$L__BB0_47:
mov.f32 %f240, 0f00000000;
@%p31 bra $L__BB0_49;
div.s32 %r286, %r189, %r9;
mov.u32 %r287, %ctaid.x;
shr.s32 %r288, %r287, 31;
shr.u32 %r289, %r288, 27;
add.s32 %r290, %r287, %r289;
and.b32 %r291, %r290, -32;
sub.s32 %r292, %r287, %r291;
mad.lo.s32 %r293, %r8, %r292, %r286;
setp.lt.s32 %p48, %r293, %r64;
selp.f32 %f240, %f235, 0f00000000, %p48;
$L__BB0_49:
@%p31 bra $L__BB0_52;
div.s32 %r296, %r189, %r9;
mov.u32 %r297, %ctaid.x;
shr.s32 %r298, %r297, 31;
shr.u32 %r299, %r298, 27;
add.s32 %r300, %r297, %r299;
and.b32 %r301, %r300, -32;
sub.s32 %r302, %r297, %r301;
mul.lo.s32 %r42, %r8, %r302;
add.s32 %r303, %r296, %r42;
setp.ge.s32 %p50, %r303, %r64;
@%p50 bra $L__BB0_52;
add.s32 %r304, %r27, %r36;
add.s32 %r305, %r304, 2;
div.s32 %r306, %r305, %r9;
add.s32 %r307, %r306, %r42;
mul.wide.s32 %rd32, %r307, 4;
add.s64 %rd33, %rd3, %rd32;
ld.global.f32 %f237, [%rd33];
$L__BB0_52:
mov.f32 %f242, 0f00000000;
@%p31 bra $L__BB0_54;
div.s32 %r310, %r189, %r9;
mov.u32 %r311, %ctaid.x;
shr.s32 %r312, %r311, 31;
shr.u32 %r313, %r312, 27;
add.s32 %r314, %r311, %r313;
and.b32 %r315, %r314, -32;
sub.s32 %r316, %r311, %r315;
mad.lo.s32 %r317, %r8, %r316, %r310;
setp.lt.s32 %p52, %r317, %r64;
selp.f32 %f242, %f237, 0f00000000, %p52;
$L__BB0_54:
div.rn.f32 %f167, %f225, %f33;
fma.rn.f32 %f166, %f240, %f167, %f242;
// begin inline asm
{ cvt.rn.f16.f32 %rs33, %f166;}
// end inline asm
@%p31 bra $L__BB0_57;
div.s32 %r320, %r189, %r9;
mov.u32 %r321, %ctaid.x;
shr.s32 %r322, %r321, 31;
shr.u32 %r323, %r322, 27;
add.s32 %r324, %r321, %r323;
and.b32 %r325, %r324, -32;
sub.s32 %r326, %r321, %r325;
mul.lo.s32 %r43, %r8, %r326;
add.s32 %r327, %r320, %r43;
setp.ge.s32 %p54, %r327, %r64;
@%p54 bra $L__BB0_57;
add.s32 %r328, %r27, %r36;
add.s32 %r329, %r328, 3;
div.s32 %r330, %r329, %r9;
add.s32 %r331, %r330, %r43;
mul.wide.s32 %rd34, %r331, 4;
add.s64 %rd35, %rd2, %rd34;
ld.global.f32 %f235, [%rd35];
$L__BB0_57:
mov.f32 %f244, 0f00000000;
@%p31 bra $L__BB0_59;
div.s32 %r334, %r189, %r9;
mov.u32 %r335, %ctaid.x;
shr.s32 %r336, %r335, 31;
shr.u32 %r337, %r336, 27;
add.s32 %r338, %r335, %r337;
and.b32 %r339, %r338, -32;
sub.s32 %r340, %r335, %r339;
mad.lo.s32 %r341, %r8, %r340, %r334;
setp.lt.s32 %p56, %r341, %r64;
selp.f32 %f244, %f235, 0f00000000, %p56;
$L__BB0_59:
@%p31 bra $L__BB0_62;
div.s32 %r344, %r189, %r9;
mov.u32 %r345, %ctaid.x;
shr.s32 %r346, %r345, 31;
shr.u32 %r347, %r346, 27;
add.s32 %r348, %r345, %r347;
and.b32 %r349, %r348, -32;
sub.s32 %r350, %r345, %r349;
mul.lo.s32 %r44, %r8, %r350;
add.s32 %r351, %r344, %r44;
setp.ge.s32 %p58, %r351, %r64;
@%p58 bra $L__BB0_62;
add.s32 %r352, %r27, %r36;
add.s32 %r353, %r352, 3;
div.s32 %r354, %r353, %r9;
add.s32 %r355, %r354, %r44;
mul.wide.s32 %rd36, %r355, 4;
add.s64 %rd37, %rd3, %rd36;
ld.global.f32 %f237, [%rd37];
$L__BB0_62:
mov.f32 %f246, 0f00000000;
@%p31 bra $L__BB0_64;
div.s32 %r358, %r189, %r9;
mov.u32 %r359, %ctaid.x;
shr.s32 %r360, %r359, 31;
shr.u32 %r361, %r360, 27;
add.s32 %r362, %r359, %r361;
and.b32 %r363, %r362, -32;
sub.s32 %r364, %r359, %r363;
mad.lo.s32 %r365, %r8, %r364, %r358;
setp.lt.s32 %p60, %r365, %r64;
selp.f32 %f246, %f237, 0f00000000, %p60;
$L__BB0_64:
div.rn.f32 %f171, %f224, %f33;
fma.rn.f32 %f170, %f244, %f171, %f246;
// begin inline asm
{ cvt.rn.f16.f32 %rs34, %f170;}
// end inline asm
@%p31 bra $L__BB0_67;
div.s32 %r368, %r189, %r9;
mov.u32 %r369, %ctaid.x;
shr.s32 %r370, %r369, 31;
shr.u32 %r371, %r370, 27;
add.s32 %r372, %r369, %r371;
and.b32 %r373, %r372, -32;
sub.s32 %r374, %r369, %r373;
mul.lo.s32 %r45, %r8, %r374;
add.s32 %r375, %r368, %r45;
setp.ge.s32 %p62, %r375, %r64;
@%p62 bra $L__BB0_67;
add.s32 %r376, %r27, %r36;
add.s32 %r377, %r376, 4;
div.s32 %r378, %r377, %r9;
add.s32 %r379, %r378, %r45;
mul.wide.s32 %rd38, %r379, 4;
add.s64 %rd39, %rd2, %rd38;
ld.global.f32 %f235, [%rd39];
$L__BB0_67:
mov.f32 %f248, 0f00000000;
@%p31 bra $L__BB0_69;
div.s32 %r382, %r189, %r9;
mov.u32 %r383, %ctaid.x;
shr.s32 %r384, %r383, 31;
shr.u32 %r385, %r384, 27;
add.s32 %r386, %r383, %r385;
and.b32 %r387, %r386, -32;
sub.s32 %r388, %r383, %r387;
mad.lo.s32 %r389, %r8, %r388, %r382;
setp.lt.s32 %p64, %r389, %r64;
selp.f32 %f248, %f235, 0f00000000, %p64;
$L__BB0_69:
@%p31 bra $L__BB0_72;
div.s32 %r392, %r189, %r9;
mov.u32 %r393, %ctaid.x;
shr.s32 %r394, %r393, 31;
shr.u32 %r395, %r394, 27;
add.s32 %r396, %r393, %r395;
and.b32 %r397, %r396, -32;
sub.s32 %r398, %r393, %r397;
mul.lo.s32 %r46, %r8, %r398;
add.s32 %r399, %r392, %r46;
setp.ge.s32 %p66, %r399, %r64;
@%p66 bra $L__BB0_72;
add.s32 %r400, %r27, %r36;
add.s32 %r401, %r400, 4;
div.s32 %r402, %r401, %r9;
add.s32 %r403, %r402, %r46;
mul.wide.s32 %rd40, %r403, 4;
add.s64 %rd41, %rd3, %rd40;
ld.global.f32 %f237, [%rd41];
$L__BB0_72:
mov.f32 %f250, 0f00000000;
@%p31 bra $L__BB0_74;
div.s32 %r406, %r189, %r9;
mov.u32 %r407, %ctaid.x;
shr.s32 %r408, %r407, 31;
shr.u32 %r409, %r408, 27;
add.s32 %r410, %r407, %r409;
and.b32 %r411, %r410, -32;
sub.s32 %r412, %r407, %r411;
mad.lo.s32 %r413, %r8, %r412, %r406;
setp.lt.s32 %p68, %r413, %r64;
selp.f32 %f250, %f237, 0f00000000, %p68;
$L__BB0_74:
div.rn.f32 %f175, %f223, %f33;
fma.rn.f32 %f174, %f248, %f175, %f250;
// begin inline asm
{ cvt.rn.f16.f32 %rs35, %f174;}
// end inline asm
@%p31 bra $L__BB0_77;
div.s32 %r416, %r189, %r9;
mov.u32 %r417, %ctaid.x;
shr.s32 %r418, %r417, 31;
shr.u32 %r419, %r418, 27;
add.s32 %r420, %r417, %r419;
and.b32 %r421, %r420, -32;
sub.s32 %r422, %r417, %r421;
mul.lo.s32 %r47, %r8, %r422;
add.s32 %r423, %r416, %r47;
setp.ge.s32 %p70, %r423, %r64;
@%p70 bra $L__BB0_77;
add.s32 %r424, %r27, %r36;
add.s32 %r425, %r424, 5;
div.s32 %r426, %r425, %r9;
add.s32 %r427, %r426, %r47;
mul.wide.s32 %rd42, %r427, 4;
add.s64 %rd43, %rd2, %rd42;
ld.global.f32 %f235, [%rd43];
$L__BB0_77:
mov.f32 %f252, 0f00000000;
@%p31 bra $L__BB0_79;
div.s32 %r430, %r189, %r9;
mov.u32 %r431, %ctaid.x;
shr.s32 %r432, %r431, 31;
shr.u32 %r433, %r432, 27;
add.s32 %r434, %r431, %r433;
and.b32 %r435, %r434, -32;
sub.s32 %r436, %r431, %r435;
mad.lo.s32 %r437, %r8, %r436, %r430;
setp.lt.s32 %p72, %r437, %r64;
selp.f32 %f252, %f235, 0f00000000, %p72;
$L__BB0_79:
@%p31 bra $L__BB0_82;
div.s32 %r440, %r189, %r9;
mov.u32 %r441, %ctaid.x;
shr.s32 %r442, %r441, 31;
shr.u32 %r443, %r442, 27;
add.s32 %r444, %r441, %r443;
and.b32 %r445, %r444, -32;
sub.s32 %r446, %r441, %r445;
mul.lo.s32 %r48, %r8, %r446;
add.s32 %r447, %r440, %r48;
setp.ge.s32 %p74, %r447, %r64;
@%p74 bra $L__BB0_82;
add.s32 %r448, %r27, %r36;
add.s32 %r449, %r448, 5;
div.s32 %r450, %r449, %r9;
add.s32 %r451, %r450, %r48;
mul.wide.s32 %rd44, %r451, 4;
add.s64 %rd45, %rd3, %rd44;
ld.global.f32 %f237, [%rd45];
$L__BB0_82:
mov.f32 %f254, 0f00000000;
@%p31 bra $L__BB0_84;
div.s32 %r454, %r189, %r9;
mov.u32 %r455, %ctaid.x;
shr.s32 %r456, %r455, 31;
shr.u32 %r457, %r456, 27;
add.s32 %r458, %r455, %r457;
and.b32 %r459, %r458, -32;
sub.s32 %r460, %r455, %r459;
mad.lo.s32 %r461, %r8, %r460, %r454;
setp.lt.s32 %p76, %r461, %r64;
selp.f32 %f254, %f237, 0f00000000, %p76;
$L__BB0_84:
div.rn.f32 %f179, %f222, %f33;
fma.rn.f32 %f178, %f252, %f179, %f254;
// begin inline asm
{ cvt.rn.f16.f32 %rs36, %f178;}
// end inline asm
@%p31 bra $L__BB0_87;
div.s32 %r464, %r189, %r9;
mov.u32 %r465, %ctaid.x;
shr.s32 %r466, %r465, 31;
shr.u32 %r467, %r466, 27;
add.s32 %r468, %r465, %r467;
and.b32 %r469, %r468, -32;
sub.s32 %r470, %r465, %r469;
mul.lo.s32 %r49, %r8, %r470;
add.s32 %r471, %r464, %r49;
setp.ge.s32 %p78, %r471, %r64;
@%p78 bra $L__BB0_87;
add.s32 %r472, %r27, %r36;
add.s32 %r473, %r472, 6;
div.s32 %r474, %r473, %r9;
add.s32 %r475, %r474, %r49;
mul.wide.s32 %rd46, %r475, 4;
add.s64 %rd47, %rd2, %rd46;
ld.global.f32 %f235, [%rd47];
$L__BB0_87:
mov.f32 %f256, 0f00000000;
@%p31 bra $L__BB0_89;
div.s32 %r478, %r189, %r9;
mov.u32 %r479, %ctaid.x;
shr.s32 %r480, %r479, 31;
shr.u32 %r481, %r480, 27;
add.s32 %r482, %r479, %r481;
and.b32 %r483, %r482, -32;
sub.s32 %r484, %r479, %r483;
mad.lo.s32 %r485, %r8, %r484, %r478;
setp.lt.s32 %p80, %r485, %r64;
selp.f32 %f256, %f235, 0f00000000, %p80;
$L__BB0_89:
@%p31 bra $L__BB0_92;
div.s32 %r488, %r189, %r9;
mov.u32 %r489, %ctaid.x;
shr.s32 %r490, %r489, 31;
shr.u32 %r491, %r490, 27;
add.s32 %r492, %r489, %r491;
and.b32 %r493, %r492, -32;
sub.s32 %r494, %r489, %r493;
mul.lo.s32 %r50, %r8, %r494;
add.s32 %r495, %r488, %r50;
setp.ge.s32 %p82, %r495, %r64;
@%p82 bra $L__BB0_92;
add.s32 %r496, %r27, %r36;
add.s32 %r497, %r496, 6;
div.s32 %r498, %r497, %r9;
add.s32 %r499, %r498, %r50;
mul.wide.s32 %rd48, %r499, 4;
add.s64 %rd49, %rd3, %rd48;
ld.global.f32 %f237, [%rd49];
$L__BB0_92:
mov.f32 %f258, 0f00000000;
@%p31 bra $L__BB0_94;
div.s32 %r502, %r189, %r9;
mov.u32 %r503, %ctaid.x;
shr.s32 %r504, %r503, 31;
shr.u32 %r505, %r504, 27;
add.s32 %r506, %r503, %r505;
and.b32 %r507, %r506, -32;
sub.s32 %r508, %r503, %r507;
mad.lo.s32 %r509, %r8, %r508, %r502;
setp.lt.s32 %p84, %r509, %r64;
selp.f32 %f258, %f237, 0f00000000, %p84;
$L__BB0_94:
div.rn.f32 %f183, %f221, %f33;
fma.rn.f32 %f182, %f256, %f183, %f258;
// begin inline asm
{ cvt.rn.f16.f32 %rs37, %f182;}
// end inline asm
@%p31 bra $L__BB0_97;
div.s32 %r512, %r189, %r9;
mov.u32 %r513, %ctaid.x;
shr.s32 %r514, %r513, 31;
shr.u32 %r515, %r514, 27;
add.s32 %r516, %r513, %r515;
and.b32 %r517, %r516, -32;
sub.s32 %r518, %r513, %r517;
mul.lo.s32 %r51, %r8, %r518;
add.s32 %r519, %r512, %r51;
setp.ge.s32 %p86, %r519, %r64;
@%p86 bra $L__BB0_97;
add.s32 %r520, %r36, %r27;
add.s32 %r521, %r520, 7;
div.s32 %r522, %r521, %r9;
add.s32 %r523, %r522, %r51;
mul.wide.s32 %rd50, %r523, 4;
add.s64 %rd51, %rd2, %rd50;
ld.global.f32 %f235, [%rd51];
$L__BB0_97:
mov.f32 %f260, 0f00000000;
@%p31 bra $L__BB0_99;
div.s32 %r526, %r189, %r9;
mov.u32 %r527, %ctaid.x;
shr.s32 %r528, %r527, 31;
shr.u32 %r529, %r528, 27;
add.s32 %r530, %r527, %r529;
and.b32 %r531, %r530, -32;
sub.s32 %r532, %r527, %r531;
mad.lo.s32 %r533, %r8, %r532, %r526;
setp.lt.s32 %p88, %r533, %r64;
selp.f32 %f260, %f235, 0f00000000, %p88;
$L__BB0_99:
@%p31 bra $L__BB0_102;
div.s32 %r536, %r189, %r9;
mov.u32 %r537, %ctaid.x;
shr.s32 %r538, %r537, 31;
shr.u32 %r539, %r538, 27;
add.s32 %r540, %r537, %r539;
and.b32 %r541, %r540, -32;
sub.s32 %r542, %r537, %r541;
mul.lo.s32 %r52, %r8, %r542;
add.s32 %r543, %r536, %r52;
setp.ge.s32 %p90, %r543, %r64;
@%p90 bra $L__BB0_102;
add.s32 %r544, %r36, %r27;
add.s32 %r545, %r544, 7;
div.s32 %r546, %r545, %r9;
add.s32 %r547, %r546, %r52;
mul.wide.s32 %rd52, %r547, 4;
add.s64 %rd53, %rd3, %rd52;
ld.global.f32 %f237, [%rd53];
$L__BB0_102:
mov.f32 %f262, 0f00000000;
@%p31 bra $L__BB0_104;
div.s32 %r550, %r189, %r9;
mov.u32 %r551, %ctaid.x;
shr.s32 %r552, %r551, 31;
shr.u32 %r553, %r552, 27;
add.s32 %r554, %r551, %r553;
and.b32 %r555, %r554, -32;
sub.s32 %r556, %r551, %r555;
mad.lo.s32 %r557, %r8, %r556, %r550;
setp.lt.s32 %p92, %r557, %r64;
selp.f32 %f262, %f237, 0f00000000, %p92;
$L__BB0_104:
div.rn.f32 %f187, %f220, %f33;
fma.rn.f32 %f186, %f260, %f187, %f262;
// begin inline asm
{ cvt.rn.f16.f32 %rs38, %f186;}
// end inline asm
or.pred %p95, %p2, %p31;
@%p95 bra $L__BB0_107;
mov.b32 %r562, {%rs37, %rs38};
mov.u32 %r563, %ctaid.x;
mad.lo.s32 %r564, %r10, %r563, %r36;
mul.wide.s32 %rd55, %r564, 2;
add.s64 %rd54, %rd6, %rd55;
mov.b32 %r559, {%rs31, %rs32};
mov.b32 %r560, {%rs33, %rs34};
mov.b32 %r561, {%rs35, %rs36};
// begin inline asm
st.global.cs.v4.s32 [%rd54], {%r559,%r560,%r561,%r562};
// end inline asm
$L__BB0_107:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_1[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_2[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_3[48]
)
{
.reg .pred %p<64>;
.reg .b16 %rs<47>;
.reg .f32 %f<213>;
.reg .b32 %r<285>;
.reg .b64 %rd<82>;
// demoted variable
.shared .align 4 .u32 _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r50, %r51}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0+8];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0+16];
ld.param.u64 %rd6, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_3];
ld.param.u64 %rd7, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_2];
ld.param.u64 %rd8, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0];
cvta.to.global.u64 %rd2, %rd8;
cvta.to.global.u64 %rd3, %rd7;
mov.u32 %r6, %tid.x;
setp.ne.s32 %p1, %r6, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r64, 0;
st.shared.u32 [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s], %r64;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd9, _ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r65, [%rd9], %r6;
ld.shared.u32 %r7, [_ZZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_1601111nvfuser_381ENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s];
add.s32 %r66, %r51, 31;
shr.s32 %r67, %r66, 31;
shr.u32 %r68, %r67, 27;
add.s32 %r69, %r66, %r68;
shr.s32 %r8, %r69, 5;
mul.lo.s32 %r9, %r53, %r52;
mul.lo.s32 %r10, %r9, %r8;
add.s32 %r70, %r10, 7;
shr.s32 %r71, %r70, 31;
shr.u32 %r72, %r71, 29;
add.s32 %r73, %r70, %r72;
shr.s32 %r11, %r73, 3;
setp.ge.s32 %p2, %r6, %r11;
@%p2 bra $L__BB0_4;
shl.b32 %r12, %r6, 3;
or.b32 %r74, %r12, 7;
div.s32 %r75, %r74, %r9;
mov.u32 %r13, %ctaid.x;
shr.s32 %r76, %r13, 31;
shr.u32 %r77, %r76, 27;
add.s32 %r78, %r13, %r77;
and.b32 %r79, %r78, -32;
sub.s32 %r80, %r13, %r79;
mul.lo.s32 %r14, %r8, %r80;
add.s32 %r81, %r75, %r14;
setp.lt.s32 %p3, %r81, %r51;
@%p3 bra $L__BB0_8;
bra.uni $L__BB0_4;
$L__BB0_8:
shr.s32 %r107, %r78, 5;
mad.lo.s32 %r108, %r51, %r107, %r14;
mad.lo.s32 %r109, %r9, %r108, %r12;
mul.wide.s32 %rd13, %r109, 2;
add.s64 %rd12, %rd1, %rd13;
// begin inline asm
ld.global.cs.v4.u32 {%r100,%r101,%r102,%r103}, [%rd12];
// end inline asm
mov.b32 {%rs23, %rs24}, %r100;
// begin inline asm
{ cvt.f32.f16 %f193, %rs23;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f192, %rs24;}
// end inline asm
mov.b32 {%rs25, %rs26}, %r101;
// begin inline asm
{ cvt.f32.f16 %f191, %rs25;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f190, %rs26;}
// end inline asm
mov.b32 {%rs27, %rs28}, %r102;
// begin inline asm
{ cvt.f32.f16 %f189, %rs27;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f188, %rs28;}
// end inline asm
mov.b32 {%rs29, %rs30}, %r103;
// begin inline asm
{ cvt.f32.f16 %f187, %rs29;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f186, %rs30;}
// end inline asm
bra.uni $L__BB0_9;
$L__BB0_4:
mov.f32 %f68, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs13, %f68;}
// end inline asm
mov.b32 %r281, {%rs13, %rs13};
mov.u32 %r282, %r281;
mov.u32 %r283, %r281;
mov.u32 %r284, %r281;
@%p2 bra $L__BB0_7;
shl.b32 %r16, %r6, 3;
or.b32 %r82, %r16, 7;
div.s32 %r83, %r82, %r9;
mov.u32 %r17, %ctaid.x;
shr.s32 %r84, %r17, 31;
shr.u32 %r85, %r84, 27;
add.s32 %r86, %r17, %r85;
and.b32 %r87, %r86, -32;
sub.s32 %r88, %r17, %r87;
mul.lo.s32 %r18, %r8, %r88;
add.s32 %r89, %r83, %r18;
setp.ge.s32 %p5, %r89, %r51;
mov.u32 %r282, %r281;
mov.u32 %r283, %r281;
mov.u32 %r284, %r281;
@%p5 bra $L__BB0_7;
shr.s32 %r97, %r86, 5;
mad.lo.s32 %r98, %r51, %r97, %r18;
mad.lo.s32 %r99, %r9, %r98, %r16;
mul.wide.s32 %rd11, %r99, 2;
add.s64 %rd10, %rd1, %rd11;
// begin inline asm
ld.global.cs.v4.u32 {%r284,%r283,%r282,%r281}, [%rd10];
// end inline asm
$L__BB0_7:
mov.b32 {%rs14, %rs15}, %r284;
// begin inline asm
{ cvt.f32.f16 %f193, %rs14;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f192, %rs15;}
// end inline asm
mov.b32 {%rs16, %rs17}, %r283;
// begin inline asm
{ cvt.f32.f16 %f191, %rs16;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f190, %rs17;}
// end inline asm
mov.b32 {%rs18, %rs19}, %r282;
// begin inline asm
{ cvt.f32.f16 %f189, %rs18;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f188, %rs19;}
// end inline asm
mov.b32 {%rs20, %rs21}, %r281;
// begin inline asm
{ cvt.f32.f16 %f187, %rs20;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f186, %rs21;}
// end inline asm
$L__BB0_9:
setp.lt.s32 %p6, %r6, %r11;
@%p6 bra $L__BB0_10;
bra.uni $L__BB0_11;
$L__BB0_10:
shl.b32 %r110, %r6, 3;
or.b32 %r111, %r110, 7;
setp.lt.s32 %p7, %r111, %r10;
@%p7 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.f32 %f94, %f193, 0f00000000;
add.f32 %f95, %f94, %f192;
add.f32 %f96, %f95, %f191;
add.f32 %f97, %f96, %f190;
add.f32 %f98, %f97, %f189;
add.f32 %f99, %f98, %f188;
add.f32 %f100, %f99, %f187;
add.f32 %f194, %f100, %f186;
bra.uni $L__BB0_13;
$L__BB0_11:
shl.b32 %r112, %r6, 3;
or.b32 %r113, %r112, 7;
setp.lt.s32 %p9, %r113, %r10;
and.pred %p10, %p9, %p6;
add.f32 %f86, %f193, 0f00000000;
add.f32 %f87, %f86, %f192;
add.f32 %f88, %f87, %f191;
add.f32 %f89, %f88, %f190;
add.f32 %f90, %f89, %f189;
add.f32 %f91, %f90, %f188;
add.f32 %f92, %f91, %f187;
add.f32 %f93, %f92, %f186;
selp.f32 %f194, %f93, 0f00000000, %p10;
$L__BB0_13:
mov.b32 %r114, %f194;
mov.u32 %r115, 31;
mov.u32 %r116, 16;
mov.u32 %r117, -1;
shfl.sync.bfly.b32 %r118|%p11, %r114, %r116, %r115, %r117;
mov.b32 %f101, %r118;
add.f32 %f102, %f194, %f101;
mov.b32 %r119, %f102;
mov.u32 %r120, 8;
shfl.sync.bfly.b32 %r121|%p12, %r119, %r120, %r115, %r117;
mov.b32 %f103, %r121;
add.f32 %f104, %f102, %f103;
mov.b32 %r122, %f104;
mov.u32 %r123, 4;
shfl.sync.bfly.b32 %r124|%p13, %r122, %r123, %r115, %r117;
mov.b32 %f105, %r124;
add.f32 %f106, %f104, %f105;
mov.b32 %r125, %f106;
mov.u32 %r126, 2;
shfl.sync.bfly.b32 %r127|%p14, %r125, %r126, %r115, %r117;
mov.b32 %f107, %r127;
add.f32 %f108, %f106, %f107;
mov.b32 %r128, %f108;
mov.u32 %r129, 1;
shfl.sync.bfly.b32 %r130|%p15, %r128, %r129, %r115, %r117;
mov.b32 %f109, %r130;
add.f32 %f196, %f108, %f109;
shl.b32 %r27, %r7, 2;
shr.u32 %r28, %r6, 5;
mov.u32 %r131, %tid.z;
mov.u32 %r132, %ntid.y;
mov.u32 %r133, %tid.y;
mad.lo.s32 %r29, %r131, %r132, %r133;
and.b32 %r30, %r6, 31;
setp.ne.s32 %p16, %r30, 0;
mov.u32 %r134, %ntid.x;
shr.u32 %r31, %r134, 5;
mul.lo.s32 %r32, %r29, %r31;
bar.sync 0;
@%p16 bra $L__BB0_15;
add.s32 %r135, %r32, %r28;
mul.wide.u32 %rd14, %r135, 4;
mov.u64 %rd15, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_160115arrayE;
add.s64 %rd16, %rd15, %rd14;
st.shared.f32 [%rd16], %f196;
$L__BB0_15:
bar.sync 0;
setp.ne.s32 %p17, %r28, 0;
@%p17 bra $L__BB0_19;
setp.ge.u32 %p18, %r30, %r31;
mov.f32 %f195, 0f00000000;
@%p18 bra $L__BB0_18;
add.s32 %r136, %r32, %r30;
mul.wide.u32 %rd17, %r136, 4;
mov.u64 %rd18, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_160115arrayE;
add.s64 %rd19, %rd18, %rd17;
ld.shared.f32 %f195, [%rd19];
$L__BB0_18:
mov.b32 %r137, %f195;
mov.u32 %r138, 31;
mov.u32 %r139, 16;
mov.u32 %r140, -1;
shfl.sync.bfly.b32 %r141|%p19, %r137, %r139, %r138, %r140;
mov.b32 %f111, %r141;
add.f32 %f112, %f195, %f111;
mov.b32 %r142, %f112;
mov.u32 %r143, 8;
shfl.sync.bfly.b32 %r144|%p20, %r142, %r143, %r138, %r140;
mov.b32 %f113, %r144;
add.f32 %f114, %f112, %f113;
mov.b32 %r145, %f114;
mov.u32 %r146, 4;
shfl.sync.bfly.b32 %r147|%p21, %r145, %r146, %r138, %r140;
mov.b32 %f115, %r147;
add.f32 %f116, %f114, %f115;
mov.b32 %r148, %f116;
mov.u32 %r149, 2;
shfl.sync.bfly.b32 %r150|%p22, %r148, %r149, %r138, %r140;
mov.b32 %f117, %r150;
add.f32 %f118, %f116, %f117;
mov.b32 %r151, %f118;
mov.u32 %r152, 1;
shfl.sync.bfly.b32 %r153|%p23, %r151, %r152, %r138, %r140;
mov.b32 %f119, %r153;
add.f32 %f196, %f118, %f119;
$L__BB0_19:
bar.sync 0;
mul.wide.s32 %rd20, %r29, 4;
mov.u64 %rd21, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_381_cu_3b4d9499_160115arrayE;
add.s64 %rd4, %rd21, %rd20;
setp.eq.s32 %p24, %r6, 0;
@%p24 bra $L__BB0_20;
bra.uni $L__BB0_21;
$L__BB0_20:
setp.eq.s32 %p25, %r30, 0;
add.f32 %f120, %f196, 0f00000000;
selp.f32 %f121, %f120, 0f00000000, %p25;
st.shared.f32 [%rd4], %f121;
$L__BB0_21:
bar.sync 0;
ld.shared.f32 %f33, [%rd4];
bar.sync 0;
@%p6 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
shl.b32 %r33, %r6, 3;
or.b32 %r154, %r33, 7;
setp.ge.s32 %p27, %r154, %r10;
@%p27 bra $L__BB0_24;
add.s32 %r155, %r33, 7;
div.s32 %r156, %r155, %r9;
mov.u32 %r34, %ctaid.x;
shr.s32 %r157, %r34, 31;
shr.u32 %r158, %r157, 27;
add.s32 %r159, %r34, %r158;
and.b32 %r160, %r159, -32;
sub.s32 %r161, %r34, %r160;
mul.lo.s32 %r35, %r8, %r161;
add.s32 %r162, %r156, %r35;
setp.lt.s32 %p28, %r162, %r51;
@%p28 bra $L__BB0_58;
bra.uni $L__BB0_24;
$L__BB0_58:
add.s32 %r256, %r27, %r33;
div.s32 %r257, %r256, %r9;
add.s32 %r258, %r257, %r35;
mul.wide.s32 %rd57, %r258, 4;
add.s64 %rd58, %rd2, %rd57;
add.s64 %rd59, %rd3, %rd57;
div.rn.f32 %f162, %f193, %f33;
ld.global.f32 %f163, [%rd58];
ld.global.f32 %f164, [%rd59];
fma.rn.f32 %f154, %f163, %f162, %f164;
add.s32 %r259, %r256, 1;
div.s32 %r260, %r259, %r9;
add.s32 %r261, %r260, %r35;
mul.wide.s32 %rd60, %r261, 4;
add.s64 %rd61, %rd2, %rd60;
add.s64 %rd62, %rd3, %rd60;
div.rn.f32 %f165, %f192, %f33;
ld.global.f32 %f166, [%rd61];
ld.global.f32 %f167, [%rd62];
fma.rn.f32 %f155, %f166, %f165, %f167;
// begin inline asm
{ cvt.rn.f16.f32 %rs39, %f154;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs40, %f155;}
// end inline asm
mov.b32 %r252, {%rs39, %rs40};
add.s32 %r262, %r256, 2;
div.s32 %r263, %r262, %r9;
add.s32 %r264, %r263, %r35;
mul.wide.s32 %rd63, %r264, 4;
add.s64 %rd64, %rd2, %rd63;
add.s64 %rd65, %rd3, %rd63;
div.rn.f32 %f168, %f191, %f33;
ld.global.f32 %f169, [%rd64];
ld.global.f32 %f170, [%rd65];
fma.rn.f32 %f156, %f169, %f168, %f170;
add.s32 %r265, %r256, 3;
div.s32 %r266, %r265, %r9;
add.s32 %r267, %r266, %r35;
mul.wide.s32 %rd66, %r267, 4;
add.s64 %rd67, %rd2, %rd66;
add.s64 %rd68, %rd3, %rd66;
div.rn.f32 %f171, %f190, %f33;
ld.global.f32 %f172, [%rd67];
ld.global.f32 %f173, [%rd68];
fma.rn.f32 %f157, %f172, %f171, %f173;
// begin inline asm
{ cvt.rn.f16.f32 %rs42, %f157;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs41, %f156;}
// end inline asm
mov.b32 %r253, {%rs41, %rs42};
add.s32 %r268, %r256, 4;
div.s32 %r269, %r268, %r9;
add.s32 %r270, %r269, %r35;
mul.wide.s32 %rd69, %r270, 4;
add.s64 %rd70, %rd2, %rd69;
add.s64 %rd71, %rd3, %rd69;
div.rn.f32 %f174, %f189, %f33;
ld.global.f32 %f175, [%rd70];
ld.global.f32 %f176, [%rd71];
fma.rn.f32 %f158, %f175, %f174, %f176;
add.s32 %r271, %r256, 5;
div.s32 %r272, %r271, %r9;
add.s32 %r273, %r272, %r35;
mul.wide.s32 %rd72, %r273, 4;
add.s64 %rd73, %rd2, %rd72;
add.s64 %rd74, %rd3, %rd72;
div.rn.f32 %f177, %f188, %f33;
ld.global.f32 %f178, [%rd73];
ld.global.f32 %f179, [%rd74];
fma.rn.f32 %f159, %f178, %f177, %f179;
// begin inline asm
{ cvt.rn.f16.f32 %rs44, %f159;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs43, %f158;}
// end inline asm
mov.b32 %r254, {%rs43, %rs44};
add.s32 %r274, %r256, 6;
div.s32 %r275, %r274, %r9;
add.s32 %r276, %r275, %r35;
mul.wide.s32 %rd75, %r276, 4;
add.s64 %rd76, %rd2, %rd75;
add.s64 %rd77, %rd3, %rd75;
div.rn.f32 %f180, %f187, %f33;
ld.global.f32 %f181, [%rd76];
ld.global.f32 %f182, [%rd77];
fma.rn.f32 %f160, %f181, %f180, %f182;
add.s32 %r277, %r256, 7;
div.s32 %r278, %r277, %r9;
add.s32 %r279, %r278, %r35;
mul.wide.s32 %rd78, %r279, 4;
add.s64 %rd79, %rd2, %rd78;
add.s64 %rd80, %rd3, %rd78;
div.rn.f32 %f183, %f186, %f33;
ld.global.f32 %f184, [%rd79];
ld.global.f32 %f185, [%rd80];
fma.rn.f32 %f161, %f184, %f183, %f185;
// begin inline asm
{ cvt.rn.f16.f32 %rs46, %f161;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs45, %f160;}
// end inline asm
mov.b32 %r255, {%rs45, %rs46};
mad.lo.s32 %r280, %r10, %r34, %r33;
mul.wide.s32 %rd81, %r280, 2;
add.s64 %rd56, %rd6, %rd81;
// begin inline asm
st.global.cs.v4.s32 [%rd56], {%r252,%r253,%r254,%r255};
// end inline asm
bra.uni $L__BB0_59;
$L__BB0_24:
shl.b32 %r36, %r6, 3;
or.b32 %r163, %r36, 7;
div.s32 %r164, %r163, %r9;
mov.u32 %r37, %ctaid.x;
shr.s32 %r165, %r37, 31;
shr.u32 %r166, %r165, 27;
add.s32 %r167, %r37, %r166;
and.b32 %r168, %r167, -32;
sub.s32 %r169, %r37, %r168;
mul.lo.s32 %r38, %r8, %r169;
add.s32 %r39, %r164, %r38;
setp.ge.s32 %p29, %r39, %r51;
@%p29 bra $L__BB0_26;
add.s32 %r170, %r27, %r36;
div.s32 %r171, %r170, %r9;
add.s32 %r172, %r171, %r38;
mul.wide.s32 %rd22, %r172, 4;
add.s64 %rd23, %rd2, %rd22;
ld.global.f32 %f197, [%rd23];
$L__BB0_26:
add.s32 %r173, %r36, 7;
setp.lt.s32 %p30, %r173, %r10;
selp.f32 %f201, %f197, 0f00000000, %p30;
@%p29 bra $L__BB0_28;
add.s32 %r174, %r27, %r36;
div.s32 %r175, %r174, %r9;
add.s32 %r176, %r175, %r38;
mul.wide.s32 %rd24, %r176, 4;
add.s64 %rd25, %rd3, %rd24;
ld.global.f32 %f198, [%rd25];
$L__BB0_28:
selp.f32 %f202, %f198, 0f00000000, %p30;
div.rn.f32 %f125, %f193, %f33;
fma.rn.f32 %f124, %f201, %f125, %f202;
// begin inline asm
{ cvt.rn.f16.f32 %rs31, %f124;}
// end inline asm
@%p29 bra $L__BB0_30;
add.s32 %r179, %r27, %r36;
add.s32 %r180, %r179, 1;
div.s32 %r181, %r180, %r9;
add.s32 %r182, %r181, %r38;
mul.wide.s32 %rd26, %r182, 4;
add.s64 %rd27, %rd2, %rd26;
ld.global.f32 %f126, [%rd27];
selp.f32 %f201, %f126, 0f00000000, %p30;
$L__BB0_30:
@%p29 bra $L__BB0_32;
add.s32 %r184, %r27, %r36;
add.s32 %r185, %r184, 1;
div.s32 %r186, %r185, %r9;
add.s32 %r187, %r186, %r38;
mul.wide.s32 %rd28, %r187, 4;
add.s64 %rd29, %rd3, %rd28;
ld.global.f32 %f127, [%rd29];
selp.f32 %f202, %f127, 0f00000000, %p30;
$L__BB0_32:
div.rn.f32 %f129, %f192, %f33;
fma.rn.f32 %f128, %f201, %f129, %f202;
// begin inline asm
{ cvt.rn.f16.f32 %rs32, %f128;}
// end inline asm
@%p29 bra $L__BB0_34;
add.s32 %r189, %r27, %r36;
add.s32 %r190, %r189, 2;
div.s32 %r191, %r190, %r9;
add.s32 %r192, %r191, %r38;
mul.wide.s32 %rd30, %r192, 4;
add.s64 %rd31, %rd2, %rd30;
ld.global.f32 %f130, [%rd31];
selp.f32 %f201, %f130, 0f00000000, %p30;
$L__BB0_34:
@%p29 bra $L__BB0_36;
add.s32 %r194, %r27, %r36;
add.s32 %r195, %r194, 2;
div.s32 %r196, %r195, %r9;
add.s32 %r197, %r196, %r38;
mul.wide.s32 %rd32, %r197, 4;
add.s64 %rd33, %rd3, %rd32;
ld.global.f32 %f131, [%rd33];
selp.f32 %f202, %f131, 0f00000000, %p30;
$L__BB0_36:
div.rn.f32 %f133, %f191, %f33;
fma.rn.f32 %f132, %f201, %f133, %f202;
// begin inline asm
{ cvt.rn.f16.f32 %rs33, %f132;}
// end inline asm
@%p29 bra $L__BB0_38;
add.s32 %r199, %r27, %r36;
add.s32 %r200, %r199, 3;
div.s32 %r201, %r200, %r9;
add.s32 %r202, %r201, %r38;
mul.wide.s32 %rd34, %r202, 4;
add.s64 %rd35, %rd2, %rd34;
ld.global.f32 %f134, [%rd35];
selp.f32 %f201, %f134, 0f00000000, %p30;
$L__BB0_38:
@%p29 bra $L__BB0_40;
add.s32 %r204, %r27, %r36;
add.s32 %r205, %r204, 3;
div.s32 %r206, %r205, %r9;
add.s32 %r207, %r206, %r38;
mul.wide.s32 %rd36, %r207, 4;
add.s64 %rd37, %rd3, %rd36;
ld.global.f32 %f135, [%rd37];
selp.f32 %f202, %f135, 0f00000000, %p30;
$L__BB0_40:
div.rn.f32 %f137, %f190, %f33;
fma.rn.f32 %f136, %f201, %f137, %f202;
// begin inline asm
{ cvt.rn.f16.f32 %rs34, %f136;}
// end inline asm
@%p29 bra $L__BB0_42;
add.s32 %r209, %r27, %r36;
add.s32 %r210, %r209, 4;
div.s32 %r211, %r210, %r9;
add.s32 %r212, %r211, %r38;
mul.wide.s32 %rd38, %r212, 4;
add.s64 %rd39, %rd2, %rd38;
ld.global.f32 %f138, [%rd39];
selp.f32 %f201, %f138, 0f00000000, %p30;
$L__BB0_42:
@%p29 bra $L__BB0_44;
add.s32 %r214, %r27, %r36;
add.s32 %r215, %r214, 4;
div.s32 %r216, %r215, %r9;
add.s32 %r217, %r216, %r38;
mul.wide.s32 %rd40, %r217, 4;
add.s64 %rd41, %rd3, %rd40;
ld.global.f32 %f139, [%rd41];
selp.f32 %f202, %f139, 0f00000000, %p30;
$L__BB0_44:
div.rn.f32 %f141, %f189, %f33;
fma.rn.f32 %f140, %f201, %f141, %f202;
// begin inline asm
{ cvt.rn.f16.f32 %rs35, %f140;}
// end inline asm
@%p29 bra $L__BB0_46;
add.s32 %r219, %r27, %r36;
add.s32 %r220, %r219, 5;
div.s32 %r221, %r220, %r9;
add.s32 %r222, %r221, %r38;
mul.wide.s32 %rd42, %r222, 4;
add.s64 %rd43, %rd2, %rd42;
ld.global.f32 %f142, [%rd43];
selp.f32 %f201, %f142, 0f00000000, %p30;
$L__BB0_46:
@%p29 bra $L__BB0_48;
add.s32 %r224, %r27, %r36;
add.s32 %r225, %r224, 5;
div.s32 %r226, %r225, %r9;
add.s32 %r227, %r226, %r38;
mul.wide.s32 %rd44, %r227, 4;
add.s64 %rd45, %rd3, %rd44;
ld.global.f32 %f143, [%rd45];
selp.f32 %f202, %f143, 0f00000000, %p30;
$L__BB0_48:
div.rn.f32 %f145, %f188, %f33;
fma.rn.f32 %f144, %f201, %f145, %f202;
// begin inline asm
{ cvt.rn.f16.f32 %rs36, %f144;}
// end inline asm
@%p29 bra $L__BB0_50;
add.s32 %r229, %r27, %r36;
add.s32 %r230, %r229, 6;
div.s32 %r231, %r230, %r9;
add.s32 %r232, %r231, %r38;
mul.wide.s32 %rd46, %r232, 4;
add.s64 %rd47, %rd2, %rd46;
ld.global.f32 %f146, [%rd47];
selp.f32 %f201, %f146, 0f00000000, %p30;
$L__BB0_50:
@%p29 bra $L__BB0_52;
add.s32 %r234, %r27, %r36;
add.s32 %r235, %r234, 6;
div.s32 %r236, %r235, %r9;
add.s32 %r237, %r236, %r38;
mul.wide.s32 %rd48, %r237, 4;
add.s64 %rd49, %rd3, %rd48;
ld.global.f32 %f147, [%rd49];
selp.f32 %f202, %f147, 0f00000000, %p30;
$L__BB0_52:
div.rn.f32 %f149, %f187, %f33;
fma.rn.f32 %f148, %f201, %f149, %f202;
// begin inline asm
{ cvt.rn.f16.f32 %rs37, %f148;}
// end inline asm
@%p29 bra $L__BB0_54;
add.s32 %r239, %r173, %r27;
div.s32 %r240, %r239, %r9;
add.s32 %r241, %r240, %r38;
mul.wide.s32 %rd50, %r241, 4;
add.s64 %rd51, %rd2, %rd50;
ld.global.f32 %f150, [%rd51];
selp.f32 %f201, %f150, 0f00000000, %p30;
$L__BB0_54:
@%p29 bra $L__BB0_56;
add.s32 %r243, %r173, %r27;
div.s32 %r244, %r243, %r9;
add.s32 %r245, %r244, %r38;
mul.wide.s32 %rd52, %r245, 4;
add.s64 %rd53, %rd3, %rd52;
ld.global.f32 %f151, [%rd53];
selp.f32 %f202, %f151, 0f00000000, %p30;
$L__BB0_56:
setp.ge.s32 %p61, %r173, %r10;
div.rn.f32 %f153, %f186, %f33;
fma.rn.f32 %f152, %f201, %f153, %f202;
// begin inline asm
{ cvt.rn.f16.f32 %rs38, %f152;}
// end inline asm
or.pred %p63, %p2, %p61;
@%p63 bra $L__BB0_59;
mov.b32 %r250, {%rs37, %rs38};
mad.lo.s32 %r251, %r10, %r37, %r36;
mul.wide.s32 %rd55, %r251, 2;
add.s64 %rd54, %rd6, %rd55;
mov.b32 %r247, {%rs31, %rs32};
mov.b32 %r248, {%rs33, %rs34};
mov.b32 %r249, {%rs35, %rs36};
// begin inline asm
st.global.cs.v4.s32 [%rd54], {%r247,%r248,%r249,%r250};
// end inline asm
$L__BB0_59:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -21,320 +21,320 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_1[16],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_2[16],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_3[48]
)
{
- .reg .pred %p<96>;
+ .reg .pred %p<64>;
.reg .b16 %rs<47>;
- .reg .f32 %f<263>;
- .reg .b32 %r<598>;
+ .reg .f32 %f<213>;
+ .reg .b32 %r<285>;
.reg .b64 %rd<82>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r63, %r64}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0+8];
- ld.param.v2.u32 {%r65, %r66}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0+16];
+ ld.param.v2.u32 {%r50, %r51}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0+8];
+ ld.param.v2.u32 {%r52, %r53}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0+16];
ld.param.u64 %rd6, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_3];
ld.param.u64 %rd7, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_2];
ld.param.u64 %rd8, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEE_param_0];
cvta.to.global.u64 %rd2, %rd8;
cvta.to.global.u64 %rd3, %rd7;
mov.u32 %r6, %tid.x;
setp.ne.s32 %p1, %r6, 0;
@%p1 bra $L__BB0_2;
- mov.u32 %r77, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s], %r77;
+ mov.u32 %r64, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s], %r64;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd9, _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r78, [%rd9], %r6;
+ atom.shared.min.s32 %r65, [%rd9], %r6;
ld.shared.u32 %r7, [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi4ELi4EEENS0_IfLi1ELi1EEES3_NS0_IS1_Li5ELi5EEEE14nvfuser_zero_s];
- add.s32 %r79, %r64, 31;
- shr.s32 %r80, %r79, 31;
- shr.u32 %r81, %r80, 27;
- add.s32 %r82, %r79, %r81;
- shr.s32 %r8, %r82, 5;
- mul.lo.s32 %r9, %r66, %r65;
+ add.s32 %r66, %r51, 31;
+ shr.s32 %r67, %r66, 31;
+ shr.u32 %r68, %r67, 27;
+ add.s32 %r69, %r66, %r68;
+ shr.s32 %r8, %r69, 5;
+ mul.lo.s32 %r9, %r53, %r52;
mul.lo.s32 %r10, %r9, %r8;
- add.s32 %r83, %r10, 7;
- shr.s32 %r84, %r83, 31;
- shr.u32 %r85, %r84, 29;
- add.s32 %r86, %r83, %r85;
- shr.s32 %r11, %r86, 3;
+ add.s32 %r70, %r10, 7;
+ shr.s32 %r71, %r70, 31;
+ shr.u32 %r72, %r71, 29;
+ add.s32 %r73, %r70, %r72;
+ shr.s32 %r11, %r73, 3;
setp.ge.s32 %p2, %r6, %r11;
@%p2 bra $L__BB0_4;
shl.b32 %r12, %r6, 3;
- or.b32 %r87, %r12, 7;
- div.s32 %r88, %r87, %r9;
+ or.b32 %r74, %r12, 7;
+ div.s32 %r75, %r74, %r9;
mov.u32 %r13, %ctaid.x;
- shr.s32 %r89, %r13, 31;
- shr.u32 %r90, %r89, 27;
- add.s32 %r91, %r13, %r90;
- and.b32 %r92, %r91, -32;
- sub.s32 %r93, %r13, %r92;
- mul.lo.s32 %r14, %r8, %r93;
- add.s32 %r94, %r88, %r14;
- setp.lt.s32 %p3, %r94, %r64;
+ shr.s32 %r76, %r13, 31;
+ shr.u32 %r77, %r76, 27;
+ add.s32 %r78, %r13, %r77;
+ and.b32 %r79, %r78, -32;
+ sub.s32 %r80, %r13, %r79;
+ mul.lo.s32 %r14, %r8, %r80;
+ add.s32 %r81, %r75, %r14;
+ setp.lt.s32 %p3, %r81, %r51;
@%p3 bra $L__BB0_8;
bra.uni $L__BB0_4;
$L__BB0_8:
- shr.s32 %r120, %r91, 5;
- mad.lo.s32 %r121, %r64, %r120, %r14;
- mad.lo.s32 %r122, %r9, %r121, %r12;
- mul.wide.s32 %rd13, %r122, 2;
+ shr.s32 %r107, %r78, 5;
+ mad.lo.s32 %r108, %r51, %r107, %r14;
+ mad.lo.s32 %r109, %r9, %r108, %r12;
+ mul.wide.s32 %rd13, %r109, 2;
add.s64 %rd12, %rd1, %rd13;
- ld.global.cs.v4.u32 {%r113,%r114,%r115,%r116}, [%rd12];
-
- mov.b32 {%rs23, %rs24}, %r113;
-
- { cvt.f32.f16 %f227, %rs23;}
-
-
-
- { cvt.f32.f16 %f226, %rs24;}
-
-
- mov.b32 {%rs25, %rs26}, %r114;
-
- { cvt.f32.f16 %f225, %rs25;}
-
-
-
- { cvt.f32.f16 %f224, %rs26;}
-
-
- mov.b32 {%rs27, %rs28}, %r115;
-
- { cvt.f32.f16 %f223, %rs27;}
-
-
-
- { cvt.f32.f16 %f222, %rs28;}
-
-
- mov.b32 {%rs29, %rs30}, %r116;
-
- { cvt.f32.f16 %f221, %rs29;}
-
-
-
- { cvt.f32.f16 %f220, %rs30;}
+ ld.global.cs.v4.u32 {%r100,%r101,%r102,%r103}, [%rd12];
+
+ mov.b32 {%rs23, %rs24}, %r100;
+
+ { cvt.f32.f16 %f193, %rs23;}
+
+
+
+ { cvt.f32.f16 %f192, %rs24;}
+
+
+ mov.b32 {%rs25, %rs26}, %r101;
+
+ { cvt.f32.f16 %f191, %rs25;}
+
+
+
+ { cvt.f32.f16 %f190, %rs26;}
+
+
+ mov.b32 {%rs27, %rs28}, %r102;
+
+ { cvt.f32.f16 %f189, %rs27;}
+
+
+
+ { cvt.f32.f16 %f188, %rs28;}
+
+
+ mov.b32 {%rs29, %rs30}, %r103;
+
+ { cvt.f32.f16 %f187, %rs29;}
+
+
+
+ { cvt.f32.f16 %f186, %rs30;}
bra.uni $L__BB0_9;
$L__BB0_4:
- mov.f32 %f98, 0f00000000;
-
- { cvt.rn.f16.f32 %rs13, %f98;}
-
-
- mov.b32 %r594, {%rs13, %rs13};
- mov.u32 %r595, %r594;
- mov.u32 %r596, %r594;
- mov.u32 %r597, %r594;
+ mov.f32 %f68, 0f00000000;
+
+ { cvt.rn.f16.f32 %rs13, %f68;}
+
+
+ mov.b32 %r281, {%rs13, %rs13};
+ mov.u32 %r282, %r281;
+ mov.u32 %r283, %r281;
+ mov.u32 %r284, %r281;
@%p2 bra $L__BB0_7;
shl.b32 %r16, %r6, 3;
- or.b32 %r95, %r16, 7;
- div.s32 %r96, %r95, %r9;
+ or.b32 %r82, %r16, 7;
+ div.s32 %r83, %r82, %r9;
mov.u32 %r17, %ctaid.x;
- shr.s32 %r97, %r17, 31;
- shr.u32 %r98, %r97, 27;
- add.s32 %r99, %r17, %r98;
- and.b32 %r100, %r99, -32;
- sub.s32 %r101, %r17, %r100;
- mul.lo.s32 %r18, %r8, %r101;
- add.s32 %r102, %r96, %r18;
- setp.ge.s32 %p5, %r102, %r64;
- mov.u32 %r595, %r594;
- mov.u32 %r596, %r594;
- mov.u32 %r597, %r594;
+ shr.s32 %r84, %r17, 31;
+ shr.u32 %r85, %r84, 27;
+ add.s32 %r86, %r17, %r85;
+ and.b32 %r87, %r86, -32;
+ sub.s32 %r88, %r17, %r87;
+ mul.lo.s32 %r18, %r8, %r88;
+ add.s32 %r89, %r83, %r18;
+ setp.ge.s32 %p5, %r89, %r51;
+ mov.u32 %r282, %r281;
+ mov.u32 %r283, %r281;
+ mov.u32 %r284, %r281;
@%p5 bra $L__BB0_7;
- shr.s32 %r110, %r99, 5;
- mad.lo.s32 %r111, %r64, %r110, %r18;
- mad.lo.s32 %r112, %r9, %r111, %r16;
- mul.wide.s32 %rd11, %r112, 2;
+ shr.s32 %r97, %r86, 5;
+ mad.lo.s32 %r98, %r51, %r97, %r18;
+ mad.lo.s32 %r99, %r9, %r98, %r16;
+ mul.wide.s32 %rd11, %r99, 2;
add.s64 %rd10, %rd1, %rd11;
- ld.global.cs.v4.u32 {%r597,%r596,%r595,%r594}, [%rd10];
+ ld.global.cs.v4.u32 {%r284,%r283,%r282,%r281}, [%rd10];
$L__BB0_7:
- mov.b32 {%rs14, %rs15}, %r597;
-
- { cvt.f32.f16 %f227, %rs14;}
-
-
-
- { cvt.f32.f16 %f226, %rs15;}
-
-
- mov.b32 {%rs16, %rs17}, %r596;
-
- { cvt.f32.f16 %f225, %rs16;}
-
-
-
- { cvt.f32.f16 %f224, %rs17;}
-
-
- mov.b32 {%rs18, %rs19}, %r595;
-
- { cvt.f32.f16 %f223, %rs18;}
-
-
-
- { cvt.f32.f16 %f222, %rs19;}
-
-
- mov.b32 {%rs20, %rs21}, %r594;
-
- { cvt.f32.f16 %f221, %rs20;}
-
-
-
- { cvt.f32.f16 %f220, %rs21;}
+ mov.b32 {%rs14, %rs15}, %r284;
+
+ { cvt.f32.f16 %f193, %rs14;}
+
+
+
+ { cvt.f32.f16 %f192, %rs15;}
+
+
+ mov.b32 {%rs16, %rs17}, %r283;
+
+ { cvt.f32.f16 %f191, %rs16;}
+
+
+
+ { cvt.f32.f16 %f190, %rs17;}
+
+
+ mov.b32 {%rs18, %rs19}, %r282;
+
+ { cvt.f32.f16 %f189, %rs18;}
+
+
+
+ { cvt.f32.f16 %f188, %rs19;}
+
+
+ mov.b32 {%rs20, %rs21}, %r281;
+
+ { cvt.f32.f16 %f187, %rs20;}
+
+
+
+ { cvt.f32.f16 %f186, %rs21;}
$L__BB0_9:
setp.lt.s32 %p6, %r6, %r11;
@%p6 bra $L__BB0_10;
bra.uni $L__BB0_11;
$L__BB0_10:
- shl.b32 %r123, %r6, 3;
- or.b32 %r124, %r123, 7;
- setp.lt.s32 %p7, %r124, %r10;
+ shl.b32 %r110, %r6, 3;
+ or.b32 %r111, %r110, 7;
+ setp.lt.s32 %p7, %r111, %r10;
@%p7 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.f32 %f124, %f227, 0f00000000;
- add.f32 %f125, %f124, %f226;
- add.f32 %f126, %f125, %f225;
- add.f32 %f127, %f126, %f224;
- add.f32 %f128, %f127, %f223;
- add.f32 %f129, %f128, %f222;
- add.f32 %f130, %f129, %f221;
- add.f32 %f228, %f130, %f220;
+ add.f32 %f94, %f193, 0f00000000;
+ add.f32 %f95, %f94, %f192;
+ add.f32 %f96, %f95, %f191;
+ add.f32 %f97, %f96, %f190;
+ add.f32 %f98, %f97, %f189;
+ add.f32 %f99, %f98, %f188;
+ add.f32 %f100, %f99, %f187;
+ add.f32 %f194, %f100, %f186;
bra.uni $L__BB0_13;
$L__BB0_11:
- shl.b32 %r125, %r6, 3;
- or.b32 %r126, %r125, 7;
- setp.lt.s32 %p9, %r126, %r10;
+ shl.b32 %r112, %r6, 3;
+ or.b32 %r113, %r112, 7;
+ setp.lt.s32 %p9, %r113, %r10;
and.pred %p10, %p9, %p6;
- add.f32 %f116, %f227, 0f00000000;
- add.f32 %f117, %f116, %f226;
- add.f32 %f118, %f117, %f225;
- add.f32 %f119, %f118, %f224;
- add.f32 %f120, %f119, %f223;
- add.f32 %f121, %f120, %f222;
- add.f32 %f122, %f121, %f221;
- add.f32 %f123, %f122, %f220;
- selp.f32 %f228, %f123, 0f00000000, %p10;
+ add.f32 %f86, %f193, 0f00000000;
+ add.f32 %f87, %f86, %f192;
+ add.f32 %f88, %f87, %f191;
+ add.f32 %f89, %f88, %f190;
+ add.f32 %f90, %f89, %f189;
+ add.f32 %f91, %f90, %f188;
+ add.f32 %f92, %f91, %f187;
+ add.f32 %f93, %f92, %f186;
+ selp.f32 %f194, %f93, 0f00000000, %p10;
$L__BB0_13:
- mov.b32 %r127, %f228;
- mov.u32 %r128, 31;
- mov.u32 %r129, 16;
- mov.u32 %r130, -1;
- shfl.sync.bfly.b32 %r131|%p11, %r127, %r129, %r128, %r130;
- mov.b32 %f131, %r131;
- add.f32 %f132, %f228, %f131;
- mov.b32 %r132, %f132;
- mov.u32 %r133, 8;
- shfl.sync.bfly.b32 %r134|%p12, %r132, %r133, %r128, %r130;
- mov.b32 %f133, %r134;
- add.f32 %f134, %f132, %f133;
- mov.b32 %r135, %f134;
- mov.u32 %r136, 4;
- shfl.sync.bfly.b32 %r137|%p13, %r135, %r136, %r128, %r130;
- mov.b32 %f135, %r137;
- add.f32 %f136, %f134, %f135;
- mov.b32 %r138, %f136;
- mov.u32 %r139, 2;
- shfl.sync.bfly.b32 %r140|%p14, %r138, %r139, %r128, %r130;
- mov.b32 %f137, %r140;
- add.f32 %f138, %f136, %f137;
- mov.b32 %r141, %f138;
- mov.u32 %r142, 1;
- shfl.sync.bfly.b32 %r143|%p15, %r141, %r142, %r128, %r130;
- mov.b32 %f139, %r143;
- add.f32 %f230, %f138, %f139;
+ mov.b32 %r114, %f194;
+ mov.u32 %r115, 31;
+ mov.u32 %r116, 16;
+ mov.u32 %r117, -1;
+ shfl.sync.bfly.b32 %r118|%p11, %r114, %r116, %r115, %r117;
+ mov.b32 %f101, %r118;
+ add.f32 %f102, %f194, %f101;
+ mov.b32 %r119, %f102;
+ mov.u32 %r120, 8;
+ shfl.sync.bfly.b32 %r121|%p12, %r119, %r120, %r115, %r117;
+ mov.b32 %f103, %r121;
+ add.f32 %f104, %f102, %f103;
+ mov.b32 %r122, %f104;
+ mov.u32 %r123, 4;
+ shfl.sync.bfly.b32 %r124|%p13, %r122, %r123, %r115, %r117;
+ mov.b32 %f105, %r124;
+ add.f32 %f106, %f104, %f105;
+ mov.b32 %r125, %f106;
+ mov.u32 %r126, 2;
+ shfl.sync.bfly.b32 %r127|%p14, %r125, %r126, %r115, %r117;
+ mov.b32 %f107, %r127;
+ add.f32 %f108, %f106, %f107;
+ mov.b32 %r128, %f108;
+ mov.u32 %r129, 1;
+ shfl.sync.bfly.b32 %r130|%p15, %r128, %r129, %r115, %r117;
+ mov.b32 %f109, %r130;
+ add.f32 %f196, %f108, %f109;
shl.b32 %r27, %r7, 2;
shr.u32 %r28, %r6, 5;
- mov.u32 %r144, %tid.z;
- mov.u32 %r145, %ntid.y;
- mov.u32 %r146, %tid.y;
- mad.lo.s32 %r29, %r144, %r145, %r146;
+ mov.u32 %r131, %tid.z;
+ mov.u32 %r132, %ntid.y;
+ mov.u32 %r133, %tid.y;
+ mad.lo.s32 %r29, %r131, %r132, %r133;
and.b32 %r30, %r6, 31;
setp.ne.s32 %p16, %r30, 0;
- mov.u32 %r147, %ntid.x;
- shr.u32 %r31, %r147, 5;
+ mov.u32 %r134, %ntid.x;
+ shr.u32 %r31, %r134, 5;
mul.lo.s32 %r32, %r29, %r31;
bar.sync 0;
@%p16 bra $L__BB0_15;
- add.s32 %r148, %r32, %r28;
- mul.wide.u32 %rd14, %r148, 4;
+ add.s32 %r135, %r32, %r28;
+ mul.wide.u32 %rd14, %r135, 4;
mov.u64 %rd15, _ZN11kernelscope6kernelE;
add.s64 %rd16, %rd15, %rd14;
- st.shared.f32 [%rd16], %f230;
+ st.shared.f32 [%rd16], %f196;
$L__BB0_15:
bar.sync 0;
setp.ne.s32 %p17, %r28, 0;
@%p17 bra $L__BB0_19;
setp.ge.u32 %p18, %r30, %r31;
- mov.f32 %f229, 0f00000000;
+ mov.f32 %f195, 0f00000000;
@%p18 bra $L__BB0_18;
- add.s32 %r149, %r32, %r30;
- mul.wide.u32 %rd17, %r149, 4;
+ add.s32 %r136, %r32, %r30;
+ mul.wide.u32 %rd17, %r136, 4;
mov.u64 %rd18, _ZN11kernelscope6kernelE;
add.s64 %rd19, %rd18, %rd17;
- ld.shared.f32 %f229, [%rd19];
+ ld.shared.f32 %f195, [%rd19];
$L__BB0_18:
- mov.b32 %r150, %f229;
- mov.u32 %r151, 31;
- mov.u32 %r152, 16;
- mov.u32 %r153, -1;
- shfl.sync.bfly.b32 %r154|%p19, %r150, %r152, %r151, %r153;
- mov.b32 %f141, %r154;
- add.f32 %f142, %f229, %f141;
- mov.b32 %r155, %f142;
- mov.u32 %r156, 8;
- shfl.sync.bfly.b32 %r157|%p20, %r155, %r156, %r151, %r153;
- mov.b32 %f143, %r157;
- add.f32 %f144, %f142, %f143;
- mov.b32 %r158, %f144;
- mov.u32 %r159, 4;
- shfl.sync.bfly.b32 %r160|%p21, %r158, %r159, %r151, %r153;
- mov.b32 %f145, %r160;
- add.f32 %f146, %f144, %f145;
- mov.b32 %r161, %f146;
- mov.u32 %r162, 2;
- shfl.sync.bfly.b32 %r163|%p22, %r161, %r162, %r151, %r153;
- mov.b32 %f147, %r163;
- add.f32 %f148, %f146, %f147;
- mov.b32 %r164, %f148;
- mov.u32 %r165, 1;
- shfl.sync.bfly.b32 %r166|%p23, %r164, %r165, %r151, %r153;
- mov.b32 %f149, %r166;
- add.f32 %f230, %f148, %f149;
+ mov.b32 %r137, %f195;
+ mov.u32 %r138, 31;
+ mov.u32 %r139, 16;
+ mov.u32 %r140, -1;
+ shfl.sync.bfly.b32 %r141|%p19, %r137, %r139, %r138, %r140;
+ mov.b32 %f111, %r141;
+ add.f32 %f112, %f195, %f111;
+ mov.b32 %r142, %f112;
+ mov.u32 %r143, 8;
+ shfl.sync.bfly.b32 %r144|%p20, %r142, %r143, %r138, %r140;
+ mov.b32 %f113, %r144;
+ add.f32 %f114, %f112, %f113;
+ mov.b32 %r145, %f114;
+ mov.u32 %r146, 4;
+ shfl.sync.bfly.b32 %r147|%p21, %r145, %r146, %r138, %r140;
+ mov.b32 %f115, %r147;
+ add.f32 %f116, %f114, %f115;
+ mov.b32 %r148, %f116;
+ mov.u32 %r149, 2;
+ shfl.sync.bfly.b32 %r150|%p22, %r148, %r149, %r138, %r140;
+ mov.b32 %f117, %r150;
+ add.f32 %f118, %f116, %f117;
+ mov.b32 %r151, %f118;
+ mov.u32 %r152, 1;
+ shfl.sync.bfly.b32 %r153|%p23, %r151, %r152, %r138, %r140;
+ mov.b32 %f119, %r153;
+ add.f32 %f196, %f118, %f119;
$L__BB0_19:
bar.sync 0;
mul.wide.s32 %rd20, %r29, 4;
mov.u64 %rd21, _ZN11kernelscope6kernelE;
@@ -343,843 +343,433 @@
@%p24 bra $L__BB0_20;
bra.uni $L__BB0_21;
$L__BB0_20:
setp.eq.s32 %p25, %r30, 0;
- add.f32 %f150, %f230, 0f00000000;
- selp.f32 %f151, %f150, 0f00000000, %p25;
- st.shared.f32 [%rd4], %f151;
+ add.f32 %f120, %f196, 0f00000000;
+ selp.f32 %f121, %f120, 0f00000000, %p25;
+ st.shared.f32 [%rd4], %f121;
$L__BB0_21:
bar.sync 0;
ld.shared.f32 %f33, [%rd4];
bar.sync 0;
@%p6 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
shl.b32 %r33, %r6, 3;
- or.b32 %r167, %r33, 7;
- setp.ge.s32 %p27, %r167, %r10;
+ or.b32 %r154, %r33, 7;
+ setp.ge.s32 %p27, %r154, %r10;
@%p27 bra $L__BB0_24;
- add.s32 %r168, %r33, 7;
- div.s32 %r169, %r168, %r9;
+ add.s32 %r155, %r33, 7;
+ div.s32 %r156, %r155, %r9;
mov.u32 %r34, %ctaid.x;
- shr.s32 %r170, %r34, 31;
- shr.u32 %r171, %r170, 27;
- add.s32 %r172, %r34, %r171;
- and.b32 %r173, %r172, -32;
- sub.s32 %r174, %r34, %r173;
- mul.lo.s32 %r35, %r8, %r174;
- add.s32 %r175, %r169, %r35;
- setp.lt.s32 %p28, %r175, %r64;
- @%p28 bra $L__BB0_106;
+ shr.s32 %r157, %r34, 31;
+ shr.u32 %r158, %r157, 27;
+ add.s32 %r159, %r34, %r158;
+ and.b32 %r160, %r159, -32;
+ sub.s32 %r161, %r34, %r160;
+ mul.lo.s32 %r35, %r8, %r161;
+ add.s32 %r162, %r156, %r35;
+ setp.lt.s32 %p28, %r162, %r51;
+ @%p28 bra $L__BB0_58;
bra.uni $L__BB0_24;
-$L__BB0_106:
- add.s32 %r569, %r27, %r33;
- div.s32 %r570, %r569, %r9;
- add.s32 %r571, %r570, %r35;
- mul.wide.s32 %rd57, %r571, 4;
+$L__BB0_58:
+ add.s32 %r256, %r27, %r33;
+ div.s32 %r257, %r256, %r9;
+ add.s32 %r258, %r257, %r35;
+ mul.wide.s32 %rd57, %r258, 4;
add.s64 %rd58, %rd2, %rd57;
add.s64 %rd59, %rd3, %rd57;
- div.rn.f32 %f196, %f227, %f33;
- ld.global.f32 %f197, [%rd58];
- ld.global.f32 %f198, [%rd59];
- fma.rn.f32 %f188, %f197, %f196, %f198;
- add.s32 %r572, %r569, 1;
- div.s32 %r573, %r572, %r9;
- add.s32 %r574, %r573, %r35;
- mul.wide.s32 %rd60, %r574, 4;
+ div.rn.f32 %f162, %f193, %f33;
+ ld.global.f32 %f163, [%rd58];
+ ld.global.f32 %f164, [%rd59];
+ fma.rn.f32 %f154, %f163, %f162, %f164;
+ add.s32 %r259, %r256, 1;
+ div.s32 %r260, %r259, %r9;
+ add.s32 %r261, %r260, %r35;
+ mul.wide.s32 %rd60, %r261, 4;
add.s64 %rd61, %rd2, %rd60;
add.s64 %rd62, %rd3, %rd60;
- div.rn.f32 %f199, %f226, %f33;
- ld.global.f32 %f200, [%rd61];
- ld.global.f32 %f201, [%rd62];
- fma.rn.f32 %f189, %f200, %f199, %f201;
-
- { cvt.rn.f16.f32 %rs39, %f188;}
-
-
-
- { cvt.rn.f16.f32 %rs40, %f189;}
-
-
- mov.b32 %r565, {%rs39, %rs40};
- add.s32 %r575, %r569, 2;
- div.s32 %r576, %r575, %r9;
- add.s32 %r577, %r576, %r35;
- mul.wide.s32 %rd63, %r577, 4;
+ div.rn.f32 %f165, %f192, %f33;
+ ld.global.f32 %f166, [%rd61];
+ ld.global.f32 %f167, [%rd62];
+ fma.rn.f32 %f155, %f166, %f165, %f167;
+
+ { cvt.rn.f16.f32 %rs39, %f154;}
+
+
+
+ { cvt.rn.f16.f32 %rs40, %f155;}
+
+
+ mov.b32 %r252, {%rs39, %rs40};
+ add.s32 %r262, %r256, 2;
+ div.s32 %r263, %r262, %r9;
+ add.s32 %r264, %r263, %r35;
+ mul.wide.s32 %rd63, %r264, 4;
add.s64 %rd64, %rd2, %rd63;
add.s64 %rd65, %rd3, %rd63;
- div.rn.f32 %f202, %f225, %f33;
- ld.global.f32 %f203, [%rd64];
- ld.global.f32 %f204, [%rd65];
- fma.rn.f32 %f190, %f203, %f202, %f204;
- add.s32 %r578, %r569, 3;
- div.s32 %r579, %r578, %r9;
- add.s32 %r580, %r579, %r35;
- mul.wide.s32 %rd66, %r580, 4;
+ div.rn.f32 %f168, %f191, %f33;
+ ld.global.f32 %f169, [%rd64];
+ ld.global.f32 %f170, [%rd65];
+ fma.rn.f32 %f156, %f169, %f168, %f170;
+ add.s32 %r265, %r256, 3;
+ div.s32 %r266, %r265, %r9;
+ add.s32 %r267, %r266, %r35;
+ mul.wide.s32 %rd66, %r267, 4;
add.s64 %rd67, %rd2, %rd66;
add.s64 %rd68, %rd3, %rd66;
- div.rn.f32 %f205, %f224, %f33;
- ld.global.f32 %f206, [%rd67];
- ld.global.f32 %f207, [%rd68];
- fma.rn.f32 %f191, %f206, %f205, %f207;
-
- { cvt.rn.f16.f32 %rs42, %f191;}
-
-
-
- { cvt.rn.f16.f32 %rs41, %f190;}
-
-
- mov.b32 %r566, {%rs41, %rs42};
- add.s32 %r581, %r569, 4;
- div.s32 %r582, %r581, %r9;
- add.s32 %r583, %r582, %r35;
- mul.wide.s32 %rd69, %r583, 4;
+ div.rn.f32 %f171, %f190, %f33;
+ ld.global.f32 %f172, [%rd67];
+ ld.global.f32 %f173, [%rd68];
+ fma.rn.f32 %f157, %f172, %f171, %f173;
+
+ { cvt.rn.f16.f32 %rs42, %f157;}
+
+
+
+ { cvt.rn.f16.f32 %rs41, %f156;}
+
+
+ mov.b32 %r253, {%rs41, %rs42};
+ add.s32 %r268, %r256, 4;
+ div.s32 %r269, %r268, %r9;
+ add.s32 %r270, %r269, %r35;
+ mul.wide.s32 %rd69, %r270, 4;
add.s64 %rd70, %rd2, %rd69;
add.s64 %rd71, %rd3, %rd69;
- div.rn.f32 %f208, %f223, %f33;
- ld.global.f32 %f209, [%rd70];
- ld.global.f32 %f210, [%rd71];
- fma.rn.f32 %f192, %f209, %f208, %f210;
- add.s32 %r584, %r569, 5;
- div.s32 %r585, %r584, %r9;
- add.s32 %r586, %r585, %r35;
- mul.wide.s32 %rd72, %r586, 4;
+ div.rn.f32 %f174, %f189, %f33;
+ ld.global.f32 %f175, [%rd70];
+ ld.global.f32 %f176, [%rd71];
+ fma.rn.f32 %f158, %f175, %f174, %f176;
+ add.s32 %r271, %r256, 5;
+ div.s32 %r272, %r271, %r9;
+ add.s32 %r273, %r272, %r35;
+ mul.wide.s32 %rd72, %r273, 4;
add.s64 %rd73, %rd2, %rd72;
add.s64 %rd74, %rd3, %rd72;
- div.rn.f32 %f211, %f222, %f33;
- ld.global.f32 %f212, [%rd73];
- ld.global.f32 %f213, [%rd74];
- fma.rn.f32 %f193, %f212, %f211, %f213;
-
- { cvt.rn.f16.f32 %rs44, %f193;}
-
-
-
- { cvt.rn.f16.f32 %rs43, %f192;}
-
-
- mov.b32 %r567, {%rs43, %rs44};
- add.s32 %r587, %r569, 6;
- div.s32 %r588, %r587, %r9;
- add.s32 %r589, %r588, %r35;
- mul.wide.s32 %rd75, %r589, 4;
+ div.rn.f32 %f177, %f188, %f33;
+ ld.global.f32 %f178, [%rd73];
+ ld.global.f32 %f179, [%rd74];
+ fma.rn.f32 %f159, %f178, %f177, %f179;
+
+ { cvt.rn.f16.f32 %rs44, %f159;}
+
+
+
+ { cvt.rn.f16.f32 %rs43, %f158;}
+
+
+ mov.b32 %r254, {%rs43, %rs44};
+ add.s32 %r274, %r256, 6;
+ div.s32 %r275, %r274, %r9;
+ add.s32 %r276, %r275, %r35;
+ mul.wide.s32 %rd75, %r276, 4;
add.s64 %rd76, %rd2, %rd75;
add.s64 %rd77, %rd3, %rd75;
- div.rn.f32 %f214, %f221, %f33;
- ld.global.f32 %f215, [%rd76];
- ld.global.f32 %f216, [%rd77];
- fma.rn.f32 %f194, %f215, %f214, %f216;
- add.s32 %r590, %r569, 7;
- div.s32 %r591, %r590, %r9;
- add.s32 %r592, %r591, %r35;
- mul.wide.s32 %rd78, %r592, 4;
+ div.rn.f32 %f180, %f187, %f33;
+ ld.global.f32 %f181, [%rd76];
+ ld.global.f32 %f182, [%rd77];
+ fma.rn.f32 %f160, %f181, %f180, %f182;
+ add.s32 %r277, %r256, 7;
+ div.s32 %r278, %r277, %r9;
+ add.s32 %r279, %r278, %r35;
+ mul.wide.s32 %rd78, %r279, 4;
add.s64 %rd79, %rd2, %rd78;
add.s64 %rd80, %rd3, %rd78;
- div.rn.f32 %f217, %f220, %f33;
- ld.global.f32 %f218, [%rd79];
- ld.global.f32 %f219, [%rd80];
- fma.rn.f32 %f195, %f218, %f217, %f219;
-
- { cvt.rn.f16.f32 %rs46, %f195;}
-
-
-
- { cvt.rn.f16.f32 %rs45, %f194;}
-
-
- mov.b32 %r568, {%rs45, %rs46};
- mad.lo.s32 %r593, %r10, %r34, %r33;
- mul.wide.s32 %rd81, %r593, 2;
+ div.rn.f32 %f183, %f186, %f33;
+ ld.global.f32 %f184, [%rd79];
+ ld.global.f32 %f185, [%rd80];
+ fma.rn.f32 %f161, %f184, %f183, %f185;
+
+ { cvt.rn.f16.f32 %rs46, %f161;}
+
+
+
+ { cvt.rn.f16.f32 %rs45, %f160;}
+
+
+ mov.b32 %r255, {%rs45, %rs46};
+ mad.lo.s32 %r280, %r10, %r34, %r33;
+ mul.wide.s32 %rd81, %r280, 2;
add.s64 %rd56, %rd6, %rd81;
- st.global.cs.v4.s32 [%rd56], {%r565,%r566,%r567,%r568};
-
- bra.uni $L__BB0_107;
+ st.global.cs.v4.s32 [%rd56], {%r252,%r253,%r254,%r255};
+
+ bra.uni $L__BB0_59;
$L__BB0_24:
shl.b32 %r36, %r6, 3;
- or.b32 %r176, %r36, 7;
- setp.ge.s32 %p29, %r176, %r10;
- @%p29 bra $L__BB0_27;
-
- add.s32 %r177, %r36, 7;
- div.s32 %r178, %r177, %r9;
- mov.u32 %r179, %ctaid.x;
- shr.s32 %r180, %r179, 31;
- shr.u32 %r181, %r180, 27;
- add.s32 %r182, %r179, %r181;
- and.b32 %r183, %r182, -32;
- sub.s32 %r184, %r179, %r183;
- mul.lo.s32 %r37, %r8, %r184;
- add.s32 %r185, %r178, %r37;
- setp.ge.s32 %p30, %r185, %r64;
- @%p30 bra $L__BB0_27;
-
- add.s32 %r186, %r27, %r36;
- div.s32 %r187, %r186, %r9;
- add.s32 %r188, %r187, %r37;
- mul.wide.s32 %rd22, %r188, 4;
+ or.b32 %r163, %r36, 7;
+ div.s32 %r164, %r163, %r9;
+ mov.u32 %r37, %ctaid.x;
+ shr.s32 %r165, %r37, 31;
+ shr.u32 %r166, %r165, 27;
+ add.s32 %r167, %r37, %r166;
+ and.b32 %r168, %r167, -32;
+ sub.s32 %r169, %r37, %r168;
+ mul.lo.s32 %r38, %r8, %r169;
+ add.s32 %r39, %r164, %r38;
+ setp.ge.s32 %p29, %r39, %r51;
+ @%p29 bra $L__BB0_26;
+
+ add.s32 %r170, %r27, %r36;
+ div.s32 %r171, %r170, %r9;
+ add.s32 %r172, %r171, %r38;
+ mul.wide.s32 %rd22, %r172, 4;
add.s64 %rd23, %rd2, %rd22;
- ld.global.f32 %f235, [%rd23];
-
-$L__BB0_27:
- add.s32 %r189, %r36, 7;
- setp.ge.s32 %p31, %r189, %r10;
- mov.f32 %f232, 0f00000000;
- @%p31 bra $L__BB0_29;
-
- div.s32 %r191, %r189, %r9;
- mov.u32 %r192, %ctaid.x;
- shr.s32 %r193, %r192, 31;
- shr.u32 %r194, %r193, 27;
- add.s32 %r195, %r192, %r194;
- and.b32 %r196, %r195, -32;
- sub.s32 %r197, %r192, %r196;
- mad.lo.s32 %r198, %r8, %r197, %r191;
- setp.lt.s32 %p32, %r198, %r64;
- selp.f32 %f232, %f235, 0f00000000, %p32;
-
-$L__BB0_29:
- @%p31 bra $L__BB0_32;
-
- div.s32 %r201, %r189, %r9;
- mov.u32 %r202, %ctaid.x;
- shr.s32 %r203, %r202, 31;
- shr.u32 %r204, %r203, 27;
- add.s32 %r205, %r202, %r204;
- and.b32 %r206, %r205, -32;
- sub.s32 %r207, %r202, %r206;
- mul.lo.s32 %r38, %r8, %r207;
- add.s32 %r208, %r201, %r38;
- setp.ge.s32 %p34, %r208, %r64;
- @%p34 bra $L__BB0_32;
+ ld.global.f32 %f197, [%rd23];
+
+$L__BB0_26:
+ add.s32 %r173, %r36, 7;
+ setp.lt.s32 %p30, %r173, %r10;
+ selp.f32 %f201, %f197, 0f00000000, %p30;
+ @%p29 bra $L__BB0_28;
+
+ add.s32 %r174, %r27, %r36;
+ div.s32 %r175, %r174, %r9;
+ add.s32 %r176, %r175, %r38;
+ mul.wide.s32 %rd24, %r176, 4;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.f32 %f198, [%rd25];
+
+$L__BB0_28:
+ selp.f32 %f202, %f198, 0f00000000, %p30;
+ div.rn.f32 %f125, %f193, %f33;
+ fma.rn.f32 %f124, %f201, %f125, %f202;
+
+ { cvt.rn.f16.f32 %rs31, %f124;}
+
+
+ @%p29 bra $L__BB0_30;
+
+ add.s32 %r179, %r27, %r36;
+ add.s32 %r180, %r179, 1;
+ div.s32 %r181, %r180, %r9;
+ add.s32 %r182, %r181, %r38;
+ mul.wide.s32 %rd26, %r182, 4;
+ add.s64 %rd27, %rd2, %rd26;
+ ld.global.f32 %f126, [%rd27];
+ selp.f32 %f201, %f126, 0f00000000, %p30;
+
+$L__BB0_30:
+ @%p29 bra $L__BB0_32;
+
+ add.s32 %r184, %r27, %r36;
+ add.s32 %r185, %r184, 1;
+ div.s32 %r186, %r185, %r9;
+ add.s32 %r187, %r186, %r38;
+ mul.wide.s32 %rd28, %r187, 4;
+ add.s64 %rd29, %rd3, %rd28;
+ ld.global.f32 %f127, [%rd29];
+ selp.f32 %f202, %f127, 0f00000000, %p30;
+
+$L__BB0_32:
+ div.rn.f32 %f129, %f192, %f33;
+ fma.rn.f32 %f128, %f201, %f129, %f202;
+
+ { cvt.rn.f16.f32 %rs32, %f128;}
+
+
+ @%p29 bra $L__BB0_34;
+
+ add.s32 %r189, %r27, %r36;
+ add.s32 %r190, %r189, 2;
+ div.s32 %r191, %r190, %r9;
+ add.s32 %r192, %r191, %r38;
+ mul.wide.s32 %rd30, %r192, 4;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.f32 %f130, [%rd31];
+ selp.f32 %f201, %f130, 0f00000000, %p30;
+
+$L__BB0_34:
+ @%p29 bra $L__BB0_36;
+
+ add.s32 %r194, %r27, %r36;
+ add.s32 %r195, %r194, 2;
+ div.s32 %r196, %r195, %r9;
+ add.s32 %r197, %r196, %r38;
+ mul.wide.s32 %rd32, %r197, 4;
+ add.s64 %rd33, %rd3, %rd32;
+ ld.global.f32 %f131, [%rd33];
+ selp.f32 %f202, %f131, 0f00000000, %p30;
+
+$L__BB0_36:
+ div.rn.f32 %f133, %f191, %f33;
+ fma.rn.f32 %f132, %f201, %f133, %f202;
+
+ { cvt.rn.f16.f32 %rs33, %f132;}
+
+
+ @%p29 bra $L__BB0_38;
+
+ add.s32 %r199, %r27, %r36;
+ add.s32 %r200, %r199, 3;
+ div.s32 %r201, %r200, %r9;
+ add.s32 %r202, %r201, %r38;
+ mul.wide.s32 %rd34, %r202, 4;
+ add.s64 %rd35, %rd2, %rd34;
+ ld.global.f32 %f134, [%rd35];
+ selp.f32 %f201, %f134, 0f00000000, %p30;
+
+$L__BB0_38:
+ @%p29 bra $L__BB0_40;
+
+ add.s32 %r204, %r27, %r36;
+ add.s32 %r205, %r204, 3;
+ div.s32 %r206, %r205, %r9;
+ add.s32 %r207, %r206, %r38;
+ mul.wide.s32 %rd36, %r207, 4;
+ add.s64 %rd37, %rd3, %rd36;
+ ld.global.f32 %f135, [%rd37];
+ selp.f32 %f202, %f135, 0f00000000, %p30;
+
+$L__BB0_40:
+ div.rn.f32 %f137, %f190, %f33;
+ fma.rn.f32 %f136, %f201, %f137, %f202;
+
+ { cvt.rn.f16.f32 %rs34, %f136;}
+
+
+ @%p29 bra $L__BB0_42;
add.s32 %r209, %r27, %r36;
- div.s32 %r210, %r209, %r9;
- add.s32 %r211, %r210, %r38;
- mul.wide.s32 %rd24, %r211, 4;
- add.s64 %rd25, %rd3, %rd24;
- ld.global.f32 %f237, [%rd25];
-
-$L__BB0_32:
- mov.f32 %f234, 0f00000000;
- @%p31 bra $L__BB0_34;
-
- div.s32 %r214, %r189, %r9;
- mov.u32 %r215, %ctaid.x;
- shr.s32 %r216, %r215, 31;
- shr.u32 %r217, %r216, 27;
- add.s32 %r218, %r215, %r217;
- and.b32 %r219, %r218, -32;
- sub.s32 %r220, %r215, %r219;
- mad.lo.s32 %r221, %r8, %r220, %r214;
- setp.lt.s32 %p36, %r221, %r64;
- selp.f32 %f234, %f237, 0f00000000, %p36;
-
-$L__BB0_34:
- div.rn.f32 %f159, %f227, %f33;
- fma.rn.f32 %f158, %f232, %f159, %f234;
-
- { cvt.rn.f16.f32 %rs31, %f158;}
-
-
- @%p31 bra $L__BB0_37;
-
- div.s32 %r224, %r189, %r9;
- mov.u32 %r225, %ctaid.x;
- shr.s32 %r226, %r225, 31;
- shr.u32 %r227, %r226, 27;
- add.s32 %r228, %r225, %r227;
- and.b32 %r229, %r228, -32;
- sub.s32 %r230, %r225, %r229;
- mul.lo.s32 %r39, %r8, %r230;
- add.s32 %r231, %r224, %r39;
- setp.ge.s32 %p38, %r231, %r64;
- @%p38 bra $L__BB0_37;
-
- add.s32 %r232, %r27, %r36;
- add.s32 %r233, %r232, 1;
- div.s32 %r234, %r233, %r9;
- add.s32 %r235, %r234, %r39;
- mul.wide.s32 %rd26, %r235, 4;
- add.s64 %rd27, %rd2, %rd26;
- ld.global.f32 %f235, [%rd27];
-
-$L__BB0_37:
- mov.f32 %f236, 0f00000000;
- @%p31 bra $L__BB0_39;
-
- div.s32 %r238, %r189, %r9;
- mov.u32 %r239, %ctaid.x;
- shr.s32 %r240, %r239, 31;
- shr.u32 %r241, %r240, 27;
- add.s32 %r242, %r239, %r241;
- and.b32 %r243, %r242, -32;
- sub.s32 %r244, %r239, %r243;
- mad.lo.s32 %r245, %r8, %r244, %r238;
- setp.lt.s32 %p40, %r245, %r64;
- selp.f32 %f236, %f235, 0f00000000, %p40;
-
-$L__BB0_39:
- @%p31 bra $L__BB0_42;
-
- div.s32 %r248, %r189, %r9;
- mov.u32 %r249, %ctaid.x;
- shr.s32 %r250, %r249, 31;
- shr.u32 %r251, %r250, 27;
- add.s32 %r252, %r249, %r251;
- and.b32 %r253, %r252, -32;
- sub.s32 %r254, %r249, %r253;
- mul.lo.s32 %r40, %r8, %r254;
- add.s32 %r255, %r248, %r40;
- setp.ge.s32 %p42, %r255, %r64;
- @%p42 bra $L__BB0_42;
-
- add.s32 %r256, %r27, %r36;
- add.s32 %r257, %r256, 1;
- div.s32 %r258, %r257, %r9;
- add.s32 %r259, %r258, %r40;
- mul.wide.s32 %rd28, %r259, 4;
- add.s64 %rd29, %rd3, %rd28;
- ld.global.f32 %f237, [%rd29];
+ add.s32 %r210, %r209, 4;
+ div.s32 %r211, %r210, %r9;
+ add.s32 %r212, %r211, %r38;
+ mul.wide.s32 %rd38, %r212, 4;
+ add.s64 %rd39, %rd2, %rd38;
+ ld.global.f32 %f138, [%rd39];
+ selp.f32 %f201, %f138, 0f00000000, %p30;
$L__BB0_42:
- mov.f32 %f238, 0f00000000;
- @%p31 bra $L__BB0_44;
-
- div.s32 %r262, %r189, %r9;
- mov.u32 %r263, %ctaid.x;
- shr.s32 %r264, %r263, 31;
- shr.u32 %r265, %r264, 27;
- add.s32 %r266, %r263, %r265;
- and.b32 %r267, %r266, -32;
- sub.s32 %r268, %r263, %r267;
- mad.lo.s32 %r269, %r8, %r268, %r262;
- setp.lt.s32 %p44, %r269, %r64;
- selp.f32 %f238, %f237, 0f00000000, %p44;
+ @%p29 bra $L__BB0_44;
+
+ add.s32 %r214, %r27, %r36;
+ add.s32 %r215, %r214, 4;
+ div.s32 %r216, %r215, %r9;
+ add.s32 %r217, %r216, %r38;
+ mul.wide.s32 %rd40, %r217, 4;
+ add.s64 %rd41, %rd3, %rd40;
+ ld.global.f32 %f139, [%rd41];
+ selp.f32 %f202, %f139, 0f00000000, %p30;
$L__BB0_44:
- div.rn.f32 %f163, %f226, %f33;
- fma.rn.f32 %f162, %f236, %f163, %f238;
-
- { cvt.rn.f16.f32 %rs32, %f162;}
-
-
- @%p31 bra $L__BB0_47;
-
- div.s32 %r272, %r189, %r9;
- mov.u32 %r273, %ctaid.x;
- shr.s32 %r274, %r273, 31;
- shr.u32 %r275, %r274, 27;
- add.s32 %r276, %r273, %r275;
- and.b32 %r277, %r276, -32;
- sub.s32 %r278, %r273, %r277;
- mul.lo.s32 %r41, %r8, %r278;
- add.s32 %r279, %r272, %r41;
- setp.ge.s32 %p46, %r279, %r64;
- @%p46 bra $L__BB0_47;
-
- add.s32 %r280, %r27, %r36;
- add.s32 %r281, %r280, 2;
- div.s32 %r282, %r281, %r9;
- add.s32 %r283, %r282, %r41;
- mul.wide.s32 %rd30, %r283, 4;
- add.s64 %rd31, %rd2, %rd30;
- ld.global.f32 %f235, [%rd31];
-
-$L__BB0_47:
- mov.f32 %f240, 0f00000000;
- @%p31 bra $L__BB0_49;
-
- div.s32 %r286, %r189, %r9;
- mov.u32 %r287, %ctaid.x;
- shr.s32 %r288, %r287, 31;
- shr.u32 %r289, %r288, 27;
- add.s32 %r290, %r287, %r289;
- and.b32 %r291, %r290, -32;
- sub.s32 %r292, %r287, %r291;
- mad.lo.s32 %r293, %r8, %r292, %r286;
- setp.lt.s32 %p48, %r293, %r64;
- selp.f32 %f240, %f235, 0f00000000, %p48;
-
-$L__BB0_49:
- @%p31 bra $L__BB0_52;
-
- div.s32 %r296, %r189, %r9;
- mov.u32 %r297, %ctaid.x;
- shr.s32 %r298, %r297, 31;
- shr.u32 %r299, %r298, 27;
- add.s32 %r300, %r297, %r299;
- and.b32 %r301, %r300, -32;
- sub.s32 %r302, %r297, %r301;
- mul.lo.s32 %r42, %r8, %r302;
- add.s32 %r303, %r296, %r42;
- setp.ge.s32 %p50, %r303, %r64;
- @%p50 bra $L__BB0_52;
-
- add.s32 %r304, %r27, %r36;
- add.s32 %r305, %r304, 2;
- div.s32 %r306, %r305, %r9;
- add.s32 %r307, %r306, %r42;
- mul.wide.s32 %rd32, %r307, 4;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.f32 %f237, [%rd33];
+ div.rn.f32 %f141, %f189, %f33;
+ fma.rn.f32 %f140, %f201, %f141, %f202;
+
+ { cvt.rn.f16.f32 %rs35, %f140;}
+
+
+ @%p29 bra $L__BB0_46;
+
+ add.s32 %r219, %r27, %r36;
+ add.s32 %r220, %r219, 5;
+ div.s32 %r221, %r220, %r9;
+ add.s32 %r222, %r221, %r38;
+ mul.wide.s32 %rd42, %r222, 4;
+ add.s64 %rd43, %rd2, %rd42;
+ ld.global.f32 %f142, [%rd43];
+ selp.f32 %f201, %f142, 0f00000000, %p30;
+
+$L__BB0_46:
+ @%p29 bra $L__BB0_48;
+
+ add.s32 %r224, %r27, %r36;
+ add.s32 %r225, %r224, 5;
+ div.s32 %r226, %r225, %r9;
+ add.s32 %r227, %r226, %r38;
+ mul.wide.s32 %rd44, %r227, 4;
+ add.s64 %rd45, %rd3, %rd44;
+ ld.global.f32 %f143, [%rd45];
+ selp.f32 %f202, %f143, 0f00000000, %p30;
+
+$L__BB0_48:
+ div.rn.f32 %f145, %f188, %f33;
+ fma.rn.f32 %f144, %f201, %f145, %f202;
+
+ { cvt.rn.f16.f32 %rs36, %f144;}
+
+
+ @%p29 bra $L__BB0_50;
+
+ add.s32 %r229, %r27, %r36;
+ add.s32 %r230, %r229, 6;
+ div.s32 %r231, %r230, %r9;
+ add.s32 %r232, %r231, %r38;
+ mul.wide.s32 %rd46, %r232, 4;
+ add.s64 %rd47, %rd2, %rd46;
+ ld.global.f32 %f146, [%rd47];
+ selp.f32 %f201, %f146, 0f00000000, %p30;
+
+$L__BB0_50:
+ @%p29 bra $L__BB0_52;
+
+ add.s32 %r234, %r27, %r36;
+ add.s32 %r235, %r234, 6;
+ div.s32 %r236, %r235, %r9;
+ add.s32 %r237, %r236, %r38;
+ mul.wide.s32 %rd48, %r237, 4;
+ add.s64 %rd49, %rd3, %rd48;
+ ld.global.f32 %f147, [%rd49];
+ selp.f32 %f202, %f147, 0f00000000, %p30;
$L__BB0_52:
- mov.f32 %f242, 0f00000000;
- @%p31 bra $L__BB0_54;
-
- div.s32 %r310, %r189, %r9;
- mov.u32 %r311, %ctaid.x;
- shr.s32 %r312, %r311, 31;
- shr.u32 %r313, %r312, 27;
- add.s32 %r314, %r311, %r313;
- and.b32 %r315, %r314, -32;
- sub.s32 %r316, %r311, %r315;
- mad.lo.s32 %r317, %r8, %r316, %r310;
- setp.lt.s32 %p52, %r317, %r64;
- selp.f32 %f242, %f237, 0f00000000, %p52;
+ div.rn.f32 %f149, %f187, %f33;
+ fma.rn.f32 %f148, %f201, %f149, %f202;
+
+ { cvt.rn.f16.f32 %rs37, %f148;}
+
+
+ @%p29 bra $L__BB0_54;
+
+ add.s32 %r239, %r173, %r27;
+ div.s32 %r240, %r239, %r9;
+ add.s32 %r241, %r240, %r38;
+ mul.wide.s32 %rd50, %r241, 4;
+ add.s64 %rd51, %rd2, %rd50;
+ ld.global.f32 %f150, [%rd51];
+ selp.f32 %f201, %f150, 0f00000000, %p30;
$L__BB0_54:
- div.rn.f32 %f167, %f225, %f33;
- fma.rn.f32 %f166, %f240, %f167, %f242;
-
- { cvt.rn.f16.f32 %rs33, %f166;}
-
-
- @%p31 bra $L__BB0_57;
-
- div.s32 %r320, %r189, %r9;
- mov.u32 %r321, %ctaid.x;
- shr.s32 %r322, %r321, 31;
- shr.u32 %r323, %r322, 27;
- add.s32 %r324, %r321, %r323;
- and.b32 %r325, %r324, -32;
- sub.s32 %r326, %r321, %r325;
- mul.lo.s32 %r43, %r8, %r326;
- add.s32 %r327, %r320, %r43;
- setp.ge.s32 %p54, %r327, %r64;
- @%p54 bra $L__BB0_57;
-
- add.s32 %r328, %r27, %r36;
- add.s32 %r329, %r328, 3;
- div.s32 %r330, %r329, %r9;
- add.s32 %r331, %r330, %r43;
- mul.wide.s32 %rd34, %r331, 4;
- add.s64 %rd35, %rd2, %rd34;
- ld.global.f32 %f235, [%rd35];
-
-$L__BB0_57:
- mov.f32 %f244, 0f00000000;
- @%p31 bra $L__BB0_59;
-
- div.s32 %r334, %r189, %r9;
- mov.u32 %r335, %ctaid.x;
- shr.s32 %r336, %r335, 31;
- shr.u32 %r337, %r336, 27;
- add.s32 %r338, %r335, %r337;
- and.b32 %r339, %r338, -32;
- sub.s32 %r340, %r335, %r339;
- mad.lo.s32 %r341, %r8, %r340, %r334;
- setp.lt.s32 %p56, %r341, %r64;
- selp.f32 %f244, %f235, 0f00000000, %p56;
+ @%p29 bra $L__BB0_56;
+
+ add.s32 %r243, %r173, %r27;
+ div.s32 %r244, %r243, %r9;
+ add.s32 %r245, %r244, %r38;
+ mul.wide.s32 %rd52, %r245, 4;
+ add.s64 %rd53, %rd3, %rd52;
+ ld.global.f32 %f151, [%rd53];
+ selp.f32 %f202, %f151, 0f00000000, %p30;
+
+$L__BB0_56:
+ setp.ge.s32 %p61, %r173, %r10;
+ div.rn.f32 %f153, %f186, %f33;
+ fma.rn.f32 %f152, %f201, %f153, %f202;
+
+ { cvt.rn.f16.f32 %rs38, %f152;}
+
+
+ or.pred %p63, %p2, %p61;
+ @%p63 bra $L__BB0_59;
+
+ mov.b32 %r250, {%rs37, %rs38};
+ mad.lo.s32 %r251, %r10, %r37, %r36;
+ mul.wide.s32 %rd55, %r251, 2;
+ add.s64 %rd54, %rd6, %rd55;
+ mov.b32 %r247, {%rs31, %rs32};
+ mov.b32 %r248, {%rs33, %rs34};
+ mov.b32 %r249, {%rs35, %rs36};
+
+ st.global.cs.v4.s32 [%rd54], {%r247,%r248,%r249,%r250};
+
$L__BB0_59:
- @%p31 bra $L__BB0_62;
-
- div.s32 %r344, %r189, %r9;
- mov.u32 %r345, %ctaid.x;
- shr.s32 %r346, %r345, 31;
- shr.u32 %r347, %r346, 27;
- add.s32 %r348, %r345, %r347;
- and.b32 %r349, %r348, -32;
- sub.s32 %r350, %r345, %r349;
- mul.lo.s32 %r44, %r8, %r350;
- add.s32 %r351, %r344, %r44;
- setp.ge.s32 %p58, %r351, %r64;
- @%p58 bra $L__BB0_62;
-
- add.s32 %r352, %r27, %r36;
- add.s32 %r353, %r352, 3;
- div.s32 %r354, %r353, %r9;
- add.s32 %r355, %r354, %r44;
- mul.wide.s32 %rd36, %r355, 4;
- add.s64 %rd37, %rd3, %rd36;
- ld.global.f32 %f237, [%rd37];
-
-$L__BB0_62:
- mov.f32 %f246, 0f00000000;
- @%p31 bra $L__BB0_64;
-
- div.s32 %r358, %r189, %r9;
- mov.u32 %r359, %ctaid.x;
- shr.s32 %r360, %r359, 31;
- shr.u32 %r361, %r360, 27;
- add.s32 %r362, %r359, %r361;
- and.b32 %r363, %r362, -32;
- sub.s32 %r364, %r359, %r363;
- mad.lo.s32 %r365, %r8, %r364, %r358;
- setp.lt.s32 %p60, %r365, %r64;
- selp.f32 %f246, %f237, 0f00000000, %p60;
-
-$L__BB0_64:
- div.rn.f32 %f171, %f224, %f33;
- fma.rn.f32 %f170, %f244, %f171, %f246;
-
- { cvt.rn.f16.f32 %rs34, %f170;}
-
-
- @%p31 bra $L__BB0_67;
-
- div.s32 %r368, %r189, %r9;
- mov.u32 %r369, %ctaid.x;
- shr.s32 %r370, %r369, 31;
- shr.u32 %r371, %r370, 27;
- add.s32 %r372, %r369, %r371;
- and.b32 %r373, %r372, -32;
- sub.s32 %r374, %r369, %r373;
- mul.lo.s32 %r45, %r8, %r374;
- add.s32 %r375, %r368, %r45;
- setp.ge.s32 %p62, %r375, %r64;
- @%p62 bra $L__BB0_67;
-
- add.s32 %r376, %r27, %r36;
- add.s32 %r377, %r376, 4;
- div.s32 %r378, %r377, %r9;
- add.s32 %r379, %r378, %r45;
- mul.wide.s32 %rd38, %r379, 4;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.f32 %f235, [%rd39];
-
-$L__BB0_67:
- mov.f32 %f248, 0f00000000;
- @%p31 bra $L__BB0_69;
-
- div.s32 %r382, %r189, %r9;
- mov.u32 %r383, %ctaid.x;
- shr.s32 %r384, %r383, 31;
- shr.u32 %r385, %r384, 27;
- add.s32 %r386, %r383, %r385;
- and.b32 %r387, %r386, -32;
- sub.s32 %r388, %r383, %r387;
- mad.lo.s32 %r389, %r8, %r388, %r382;
- setp.lt.s32 %p64, %r389, %r64;
- selp.f32 %f248, %f235, 0f00000000, %p64;
-
-$L__BB0_69:
- @%p31 bra $L__BB0_72;
-
- div.s32 %r392, %r189, %r9;
- mov.u32 %r393, %ctaid.x;
- shr.s32 %r394, %r393, 31;
- shr.u32 %r395, %r394, 27;
- add.s32 %r396, %r393, %r395;
- and.b32 %r397, %r396, -32;
- sub.s32 %r398, %r393, %r397;
- mul.lo.s32 %r46, %r8, %r398;
- add.s32 %r399, %r392, %r46;
- setp.ge.s32 %p66, %r399, %r64;
- @%p66 bra $L__BB0_72;
-
- add.s32 %r400, %r27, %r36;
- add.s32 %r401, %r400, 4;
- div.s32 %r402, %r401, %r9;
- add.s32 %r403, %r402, %r46;
- mul.wide.s32 %rd40, %r403, 4;
- add.s64 %rd41, %rd3, %rd40;
- ld.global.f32 %f237, [%rd41];
-
-$L__BB0_72:
- mov.f32 %f250, 0f00000000;
- @%p31 bra $L__BB0_74;
-
- div.s32 %r406, %r189, %r9;
- mov.u32 %r407, %ctaid.x;
- shr.s32 %r408, %r407, 31;
- shr.u32 %r409, %r408, 27;
- add.s32 %r410, %r407, %r409;
- and.b32 %r411, %r410, -32;
- sub.s32 %r412, %r407, %r411;
- mad.lo.s32 %r413, %r8, %r412, %r406;
- setp.lt.s32 %p68, %r413, %r64;
- selp.f32 %f250, %f237, 0f00000000, %p68;
-
-$L__BB0_74:
- div.rn.f32 %f175, %f223, %f33;
- fma.rn.f32 %f174, %f248, %f175, %f250;
-
- { cvt.rn.f16.f32 %rs35, %f174;}
-
-
- @%p31 bra $L__BB0_77;
-
- div.s32 %r416, %r189, %r9;
- mov.u32 %r417, %ctaid.x;
- shr.s32 %r418, %r417, 31;
- shr.u32 %r419, %r418, 27;
- add.s32 %r420, %r417, %r419;
- and.b32 %r421, %r420, -32;
- sub.s32 %r422, %r417, %r421;
- mul.lo.s32 %r47, %r8, %r422;
- add.s32 %r423, %r416, %r47;
- setp.ge.s32 %p70, %r423, %r64;
- @%p70 bra $L__BB0_77;
-
- add.s32 %r424, %r27, %r36;
- add.s32 %r425, %r424, 5;
- div.s32 %r426, %r425, %r9;
- add.s32 %r427, %r426, %r47;
- mul.wide.s32 %rd42, %r427, 4;
- add.s64 %rd43, %rd2, %rd42;
- ld.global.f32 %f235, [%rd43];
-
-$L__BB0_77:
- mov.f32 %f252, 0f00000000;
- @%p31 bra $L__BB0_79;
-
- div.s32 %r430, %r189, %r9;
- mov.u32 %r431, %ctaid.x;
- shr.s32 %r432, %r431, 31;
- shr.u32 %r433, %r432, 27;
- add.s32 %r434, %r431, %r433;
- and.b32 %r435, %r434, -32;
- sub.s32 %r436, %r431, %r435;
- mad.lo.s32 %r437, %r8, %r436, %r430;
- setp.lt.s32 %p72, %r437, %r64;
- selp.f32 %f252, %f235, 0f00000000, %p72;
-
-$L__BB0_79:
- @%p31 bra $L__BB0_82;
-
- div.s32 %r440, %r189, %r9;
- mov.u32 %r441, %ctaid.x;
- shr.s32 %r442, %r441, 31;
- shr.u32 %r443, %r442, 27;
- add.s32 %r444, %r441, %r443;
- and.b32 %r445, %r444, -32;
- sub.s32 %r446, %r441, %r445;
- mul.lo.s32 %r48, %r8, %r446;
- add.s32 %r447, %r440, %r48;
- setp.ge.s32 %p74, %r447, %r64;
- @%p74 bra $L__BB0_82;
-
- add.s32 %r448, %r27, %r36;
- add.s32 %r449, %r448, 5;
- div.s32 %r450, %r449, %r9;
- add.s32 %r451, %r450, %r48;
- mul.wide.s32 %rd44, %r451, 4;
- add.s64 %rd45, %rd3, %rd44;
- ld.global.f32 %f237, [%rd45];
-
-$L__BB0_82:
- mov.f32 %f254, 0f00000000;
- @%p31 bra $L__BB0_84;
-
- div.s32 %r454, %r189, %r9;
- mov.u32 %r455, %ctaid.x;
- shr.s32 %r456, %r455, 31;
- shr.u32 %r457, %r456, 27;
- add.s32 %r458, %r455, %r457;
- and.b32 %r459, %r458, -32;
- sub.s32 %r460, %r455, %r459;
- mad.lo.s32 %r461, %r8, %r460, %r454;
- setp.lt.s32 %p76, %r461, %r64;
- selp.f32 %f254, %f237, 0f00000000, %p76;
-
-$L__BB0_84:
- div.rn.f32 %f179, %f222, %f33;
- fma.rn.f32 %f178, %f252, %f179, %f254;
-
- { cvt.rn.f16.f32 %rs36, %f178;}
-
-
- @%p31 bra $L__BB0_87;
-
- div.s32 %r464, %r189, %r9;
- mov.u32 %r465, %ctaid.x;
- shr.s32 %r466, %r465, 31;
- shr.u32 %r467, %r466, 27;
- add.s32 %r468, %r465, %r467;
- and.b32 %r469, %r468, -32;
- sub.s32 %r470, %r465, %r469;
- mul.lo.s32 %r49, %r8, %r470;
- add.s32 %r471, %r464, %r49;
- setp.ge.s32 %p78, %r471, %r64;
- @%p78 bra $L__BB0_87;
-
- add.s32 %r472, %r27, %r36;
- add.s32 %r473, %r472, 6;
- div.s32 %r474, %r473, %r9;
- add.s32 %r475, %r474, %r49;
- mul.wide.s32 %rd46, %r475, 4;
- add.s64 %rd47, %rd2, %rd46;
- ld.global.f32 %f235, [%rd47];
-
-$L__BB0_87:
- mov.f32 %f256, 0f00000000;
- @%p31 bra $L__BB0_89;
-
- div.s32 %r478, %r189, %r9;
- mov.u32 %r479, %ctaid.x;
- shr.s32 %r480, %r479, 31;
- shr.u32 %r481, %r480, 27;
- add.s32 %r482, %r479, %r481;
- and.b32 %r483, %r482, -32;
- sub.s32 %r484, %r479, %r483;
- mad.lo.s32 %r485, %r8, %r484, %r478;
- setp.lt.s32 %p80, %r485, %r64;
- selp.f32 %f256, %f235, 0f00000000, %p80;
-
-$L__BB0_89:
- @%p31 bra $L__BB0_92;
-
- div.s32 %r488, %r189, %r9;
- mov.u32 %r489, %ctaid.x;
- shr.s32 %r490, %r489, 31;
- shr.u32 %r491, %r490, 27;
- add.s32 %r492, %r489, %r491;
- and.b32 %r493, %r492, -32;
- sub.s32 %r494, %r489, %r493;
- mul.lo.s32 %r50, %r8, %r494;
- add.s32 %r495, %r488, %r50;
- setp.ge.s32 %p82, %r495, %r64;
- @%p82 bra $L__BB0_92;
-
- add.s32 %r496, %r27, %r36;
- add.s32 %r497, %r496, 6;
- div.s32 %r498, %r497, %r9;
- add.s32 %r499, %r498, %r50;
- mul.wide.s32 %rd48, %r499, 4;
- add.s64 %rd49, %rd3, %rd48;
- ld.global.f32 %f237, [%rd49];
-
-$L__BB0_92:
- mov.f32 %f258, 0f00000000;
- @%p31 bra $L__BB0_94;
-
- div.s32 %r502, %r189, %r9;
- mov.u32 %r503, %ctaid.x;
- shr.s32 %r504, %r503, 31;
- shr.u32 %r505, %r504, 27;
- add.s32 %r506, %r503, %r505;
- and.b32 %r507, %r506, -32;
- sub.s32 %r508, %r503, %r507;
- mad.lo.s32 %r509, %r8, %r508, %r502;
- setp.lt.s32 %p84, %r509, %r64;
- selp.f32 %f258, %f237, 0f00000000, %p84;
-
-$L__BB0_94:
- div.rn.f32 %f183, %f221, %f33;
- fma.rn.f32 %f182, %f256, %f183, %f258;
-
- { cvt.rn.f16.f32 %rs37, %f182;}
-
-
- @%p31 bra $L__BB0_97;
-
- div.s32 %r512, %r189, %r9;
- mov.u32 %r513, %ctaid.x;
- shr.s32 %r514, %r513, 31;
- shr.u32 %r515, %r514, 27;
- add.s32 %r516, %r513, %r515;
- and.b32 %r517, %r516, -32;
- sub.s32 %r518, %r513, %r517;
- mul.lo.s32 %r51, %r8, %r518;
- add.s32 %r519, %r512, %r51;
- setp.ge.s32 %p86, %r519, %r64;
- @%p86 bra $L__BB0_97;
-
- add.s32 %r520, %r36, %r27;
- add.s32 %r521, %r520, 7;
- div.s32 %r522, %r521, %r9;
- add.s32 %r523, %r522, %r51;
- mul.wide.s32 %rd50, %r523, 4;
- add.s64 %rd51, %rd2, %rd50;
- ld.global.f32 %f235, [%rd51];
-
-$L__BB0_97:
- mov.f32 %f260, 0f00000000;
- @%p31 bra $L__BB0_99;
-
- div.s32 %r526, %r189, %r9;
- mov.u32 %r527, %ctaid.x;
- shr.s32 %r528, %r527, 31;
- shr.u32 %r529, %r528, 27;
- add.s32 %r530, %r527, %r529;
- and.b32 %r531, %r530, -32;
- sub.s32 %r532, %r527, %r531;
- mad.lo.s32 %r533, %r8, %r532, %r526;
- setp.lt.s32 %p88, %r533, %r64;
- selp.f32 %f260, %f235, 0f00000000, %p88;
-
-$L__BB0_99:
- @%p31 bra $L__BB0_102;
-
- div.s32 %r536, %r189, %r9;
- mov.u32 %r537, %ctaid.x;
- shr.s32 %r538, %r537, 31;
- shr.u32 %r539, %r538, 27;
- add.s32 %r540, %r537, %r539;
- and.b32 %r541, %r540, -32;
- sub.s32 %r542, %r537, %r541;
- mul.lo.s32 %r52, %r8, %r542;
- add.s32 %r543, %r536, %r52;
- setp.ge.s32 %p90, %r543, %r64;
- @%p90 bra $L__BB0_102;
-
- add.s32 %r544, %r36, %r27;
- add.s32 %r545, %r544, 7;
- div.s32 %r546, %r545, %r9;
- add.s32 %r547, %r546, %r52;
- mul.wide.s32 %rd52, %r547, 4;
- add.s64 %rd53, %rd3, %rd52;
- ld.global.f32 %f237, [%rd53];
-
-$L__BB0_102:
- mov.f32 %f262, 0f00000000;
- @%p31 bra $L__BB0_104;
-
- div.s32 %r550, %r189, %r9;
- mov.u32 %r551, %ctaid.x;
- shr.s32 %r552, %r551, 31;
- shr.u32 %r553, %r552, 27;
- add.s32 %r554, %r551, %r553;
- and.b32 %r555, %r554, -32;
- sub.s32 %r556, %r551, %r555;
- mad.lo.s32 %r557, %r8, %r556, %r550;
- setp.lt.s32 %p92, %r557, %r64;
- selp.f32 %f262, %f237, 0f00000000, %p92;
-
-$L__BB0_104:
- div.rn.f32 %f187, %f220, %f33;
- fma.rn.f32 %f186, %f260, %f187, %f262;
-
- { cvt.rn.f16.f32 %rs38, %f186;}
-
-
- or.pred %p95, %p2, %p31;
- @%p95 bra $L__BB0_107;
-
- mov.b32 %r562, {%rs37, %rs38};
- mov.u32 %r563, %ctaid.x;
- mad.lo.s32 %r564, %r10, %r563, %r36;
- mul.wide.s32 %rd55, %r564, 2;
- add.s64 %rd54, %rd6, %rd55;
- mov.b32 %r559, {%rs31, %rs32};
- mov.b32 %r560, {%rs33, %rs34};
- mov.b32 %r561, {%rs35, %rs36};
-
- st.global.cs.v4.s32 [%rd54], {%r559,%r560,%r561,%r562};
-
-
-$L__BB0_107:
ret;
}
11: GpuViewTest.FusionMismatchingReshape
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 18
gmem: 27
static smem: 0
stack frame: 32
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 2, 2> T4, Tensor<float, 2, 2> T5) {
Array<float, 1, 1> T1;
T1[0] = 0;
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 30)) {
T1[0]
= sinf(T0[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]);
}
Array<float, 1, 1> T2;
T2[0]
= T1[0];
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 30)) {
T4[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= cosf(T2[0]);
}
Array<float, 1, 1> T3;
T3[0]
= T1[0];
if (((((nvfuser_index_t)threadIdx.x) < 30) && (((5 * ((((nvfuser_index_t)threadIdx.x) / 5) % 3)) + (((nvfuser_index_t)threadIdx.x) % 5)) < 15))) {
T5[(((15 * (((nvfuser_index_t)threadIdx.x) / 15)) + (5 * ((((nvfuser_index_t)threadIdx.x) / 5) % 3))) + (((nvfuser_index_t)threadIdx.x) % 5))]
= expf(T3[0]);
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 2, 2> T4, Tensor<float, 2, 2> T5) {
Array<float, 1, 1> T1;
T1[0] = 0;
if ((((nvfuser_index_t)threadIdx.x) < 30)) {
T1[0]
= sinf(T0[((nvfuser_index_t)threadIdx.x)]);
}
Array<float, 1, 1> T2;
T2[0]
= T1[0];
if ((((nvfuser_index_t)threadIdx.x) < 30)) {
T4[((nvfuser_index_t)threadIdx.x)]
= cosf(T2[0]);
}
Array<float, 1, 1> T3;
T3[0]
= T1[0];
if (((((nvfuser_index_t)threadIdx.x) < 30) && (((5 * ((((nvfuser_index_t)threadIdx.x) / 5) % 3)) + (((nvfuser_index_t)threadIdx.x) % 5)) < 15))) {
T5[(((15 * (((nvfuser_index_t)threadIdx.x) / 15)) + (5 * ((((nvfuser_index_t)threadIdx.x) / 5) % 3))) + (((nvfuser_index_t)threadIdx.x) % 5))]
= expf(T3[0]);
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,17 +1,17 @@
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 2, 2> T4, Tensor<float, 2, 2> T5) {
Array<float, 1, 1> T1;
T1[0] = 0;
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 30)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 30)) {
T1[0]
- = sinf(T0[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]);
+ = sinf(T0[((nvfuser_index_t)threadIdx.x)]);
}
Array<float, 1, 1> T2;
T2[0]
= T1[0];
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 30)) {
- T4[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ if ((((nvfuser_index_t)threadIdx.x) < 30)) {
+ T4[((nvfuser_index_t)threadIdx.x)]
= cosf(T2[0]);
}
Array<float, 1, 1> T3;
T3[0]
= T1[0];
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_191103std14__numeric_typeIvE5valueE = 1;
.global .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_1911011nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_1911011nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_1911011nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_1[24],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_1911011nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_2[24]
)
{
.local .align 4 .b8 __local_depot0[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<24>;
.reg .f32 %f<93>;
.reg .b32 %r<160>;
.reg .f64 %fd<5>;
.reg .b64 %rd<54>;
mov.u64 %SPL, __local_depot0;
ld.param.u64 %rd17, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_1911011nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_2];
ld.param.u64 %rd16, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_1911011nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_1];
ld.param.u64 %rd15, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_9e2704e6_1911011nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_0];
add.u64 %rd1, %SPL, 0;
mov.u32 %r68, %ctaid.x;
shl.b32 %r69, %r68, 7;
mov.u32 %r1, %tid.x;
add.s32 %r2, %r69, %r1;
setp.gt.s32 %p1, %r2, 29;
add.s64 %rd2, %rd1, 24;
mov.f32 %f89, 0f00000000;
@%p1 bra $L__BB0_13;
cvta.to.global.u64 %rd19, %rd15;
mul.wide.s32 %rd20, %r2, 4;
add.s64 %rd21, %rd19, %rd20;
ld.global.f32 %f1, [%rd21];
mul.f32 %f28, %f1, 0f3F22F983;
cvt.rni.s32.f32 %r155, %f28;
cvt.rn.f32.s32 %f29, %r155;
mov.f32 %f30, 0fBFC90FDA;
fma.rn.f32 %f31, %f29, %f30, %f1;
mov.f32 %f32, 0fB3A22168;
fma.rn.f32 %f33, %f29, %f32, %f31;
mov.f32 %f34, 0fA7C234C5;
fma.rn.f32 %f86, %f29, %f34, %f33;
abs.f32 %f3, %f1;
setp.ltu.f32 %p2, %f3, 0f47CE4780;
@%p2 bra $L__BB0_9;
setp.eq.f32 %p3, %f3, 0f7F800000;
@%p3 bra $L__BB0_8;
bra.uni $L__BB0_3;
$L__BB0_8:
mov.f32 %f37, 0f00000000;
mul.rn.f32 %f86, %f1, %f37;
mov.u32 %r155, 0;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.b32 %r4, %f1;
shr.u32 %r71, %r4, 23;
and.b32 %r72, %r71, 255;
add.s32 %r5, %r72, -128;
shl.b32 %r73, %r4, 8;
or.b32 %r6, %r73, -2147483648;
shr.u32 %r7, %r5, 5;
mov.u64 %rd50, 0;
mov.u32 %r152, 0;
mov.u64 %rd49, __cudart_i2opi_f;
mov.u64 %rd48, %rd1;
$L__BB0_4:
.pragma "nounroll";
ld.global.nc.u32 %r74, [%rd49];
mad.wide.u32 %rd24, %r74, %r6, %rd50;
shr.u64 %rd50, %rd24, 32;
st.local.u32 [%rd48], %rd24;
add.s64 %rd49, %rd49, 4;
add.s64 %rd48, %rd48, 4;
add.s32 %r152, %r152, 1;
setp.ne.s32 %p4, %r152, 6;
@%p4 bra $L__BB0_4;
st.local.u32 [%rd2], %rd50;
mov.u32 %r75, 4;
sub.s32 %r10, %r75, %r7;
mov.u32 %r76, 6;
sub.s32 %r77, %r76, %r7;
mul.wide.s32 %rd25, %r77, 4;
add.s64 %rd26, %rd1, %rd25;
ld.local.u32 %r153, [%rd26];
ld.local.u32 %r154, [%rd26+-4];
and.b32 %r13, %r5, 31;
setp.eq.s32 %p5, %r13, 0;
@%p5 bra $L__BB0_7;
mov.u32 %r78, 32;
sub.s32 %r79, %r78, %r13;
shr.u32 %r80, %r154, %r79;
shl.b32 %r81, %r153, %r13;
add.s32 %r153, %r80, %r81;
mul.wide.s32 %rd27, %r10, 4;
add.s64 %rd28, %rd1, %rd27;
ld.local.u32 %r82, [%rd28];
shr.u32 %r83, %r82, %r79;
shl.b32 %r84, %r154, %r13;
add.s32 %r154, %r83, %r84;
$L__BB0_7:
and.b32 %r85, %r4, -2147483648;
shr.u32 %r86, %r154, 30;
shl.b32 %r87, %r153, 2;
or.b32 %r88, %r86, %r87;
shr.u32 %r89, %r88, 31;
shr.u32 %r90, %r153, 30;
add.s32 %r91, %r89, %r90;
neg.s32 %r92, %r91;
setp.eq.s32 %p6, %r85, 0;
selp.b32 %r155, %r91, %r92, %p6;
setp.ne.s32 %p7, %r89, 0;
xor.b32 %r93, %r85, -2147483648;
selp.b32 %r94, %r93, %r85, %p7;
selp.b32 %r95, -1, 0, %p7;
xor.b32 %r96, %r88, %r95;
shl.b32 %r97, %r154, 2;
xor.b32 %r98, %r97, %r95;
cvt.u64.u32 %rd29, %r96;
cvt.u64.u32 %rd30, %r98;
bfi.b64 %rd31, %rd29, %rd30, 32, 32;
cvt.rn.f64.s64 %fd1, %rd31;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f35, %fd2;
setp.eq.s32 %p8, %r94, 0;
neg.f32 %f36, %f35;
selp.f32 %f86, %f35, %f36, %p8;
$L__BB0_9:
and.b32 %r20, %r155, 1;
setp.eq.s32 %p9, %r20, 0;
selp.f32 %f7, %f86, 0f3F800000, %p9;
mul.rn.f32 %f8, %f86, %f86;
mov.f32 %f87, 0fB94D4153;
@%p9 bra $L__BB0_11;
mov.f32 %f39, 0fBAB607ED;
mov.f32 %f40, 0f37CBAC00;
fma.rn.f32 %f87, %f40, %f8, %f39;
$L__BB0_11:
selp.f32 %f41, 0f3C0885E4, 0f3D2AAABB, %p9;
fma.rn.f32 %f42, %f87, %f8, %f41;
selp.f32 %f43, 0fBE2AAAA8, 0fBEFFFFFF, %p9;
fma.rn.f32 %f44, %f42, %f8, %f43;
mov.f32 %f45, 0f00000000;
fma.rn.f32 %f46, %f8, %f7, %f45;
fma.rn.f32 %f89, %f44, %f46, %f7;
and.b32 %r100, %r155, 2;
setp.eq.s32 %p11, %r100, 0;
@%p11 bra $L__BB0_13;
mov.f32 %f48, 0fBF800000;
fma.rn.f32 %f89, %f89, %f48, %f45;
$L__BB0_13:
@%p1 bra $L__BB0_27;
mul.f32 %f49, %f89, 0f3F22F983;
cvt.rni.s32.f32 %r159, %f49;
cvt.rn.f32.s32 %f50, %r159;
mov.f32 %f51, 0fBFC90FDA;
fma.rn.f32 %f52, %f50, %f51, %f89;
mov.f32 %f53, 0fB3A22168;
fma.rn.f32 %f54, %f50, %f53, %f52;
mov.f32 %f55, 0fA7C234C5;
fma.rn.f32 %f90, %f50, %f55, %f54;
abs.f32 %f16, %f89;
setp.ltu.f32 %p13, %f16, 0f47CE4780;
@%p13 bra $L__BB0_22;
setp.eq.f32 %p14, %f16, 0f7F800000;
@%p14 bra $L__BB0_21;
bra.uni $L__BB0_16;
$L__BB0_21:
mov.f32 %f58, 0f00000000;
mul.rn.f32 %f90, %f89, %f58;
mov.u32 %r159, 0;
bra.uni $L__BB0_22;
$L__BB0_16:
mov.b32 %r22, %f89;
shr.u32 %r102, %r22, 23;
and.b32 %r103, %r102, 255;
add.s32 %r23, %r103, -128;
shl.b32 %r104, %r22, 8;
or.b32 %r24, %r104, -2147483648;
shr.u32 %r25, %r23, 5;
mov.u64 %rd53, 0;
mov.u32 %r156, 0;
mov.u64 %rd52, __cudart_i2opi_f;
mov.u64 %rd51, %rd1;
$L__BB0_17:
.pragma "nounroll";
ld.global.nc.u32 %r105, [%rd52];
mad.wide.u32 %rd34, %r105, %r24, %rd53;
shr.u64 %rd53, %rd34, 32;
st.local.u32 [%rd51], %rd34;
add.s64 %rd52, %rd52, 4;
add.s64 %rd51, %rd51, 4;
add.s32 %r156, %r156, 1;
setp.ne.s32 %p15, %r156, 6;
@%p15 bra $L__BB0_17;
st.local.u32 [%rd2], %rd53;
mov.u32 %r106, 4;
sub.s32 %r28, %r106, %r25;
mov.u32 %r107, 6;
sub.s32 %r108, %r107, %r25;
mul.wide.s32 %rd35, %r108, 4;
add.s64 %rd36, %rd1, %rd35;
ld.local.u32 %r157, [%rd36];
ld.local.u32 %r158, [%rd36+-4];
and.b32 %r31, %r23, 31;
setp.eq.s32 %p16, %r31, 0;
@%p16 bra $L__BB0_20;
mov.u32 %r109, 32;
sub.s32 %r110, %r109, %r31;
shr.u32 %r111, %r158, %r110;
shl.b32 %r112, %r157, %r31;
add.s32 %r157, %r111, %r112;
mul.wide.s32 %rd37, %r28, 4;
add.s64 %rd38, %rd1, %rd37;
ld.local.u32 %r113, [%rd38];
shr.u32 %r114, %r113, %r110;
shl.b32 %r115, %r158, %r31;
add.s32 %r158, %r114, %r115;
$L__BB0_20:
and.b32 %r116, %r22, -2147483648;
shr.u32 %r117, %r158, 30;
shl.b32 %r118, %r157, 2;
or.b32 %r119, %r117, %r118;
shr.u32 %r120, %r119, 31;
shr.u32 %r121, %r157, 30;
add.s32 %r122, %r120, %r121;
neg.s32 %r123, %r122;
setp.eq.s32 %p17, %r116, 0;
selp.b32 %r159, %r122, %r123, %p17;
setp.ne.s32 %p18, %r120, 0;
xor.b32 %r124, %r116, -2147483648;
selp.b32 %r125, %r124, %r116, %p18;
selp.b32 %r126, -1, 0, %p18;
xor.b32 %r127, %r119, %r126;
shl.b32 %r128, %r158, 2;
xor.b32 %r129, %r128, %r126;
cvt.u64.u32 %rd39, %r127;
cvt.u64.u32 %rd40, %r129;
bfi.b64 %rd41, %rd39, %rd40, 32, 32;
cvt.rn.f64.s64 %fd3, %rd41;
mul.f64 %fd4, %fd3, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f56, %fd4;
setp.eq.s32 %p19, %r125, 0;
neg.f32 %f57, %f56;
selp.f32 %f90, %f56, %f57, %p19;
$L__BB0_22:
add.s32 %r38, %r159, 1;
and.b32 %r39, %r38, 1;
setp.eq.s32 %p20, %r39, 0;
selp.f32 %f20, %f90, 0f3F800000, %p20;
mul.rn.f32 %f21, %f90, %f90;
mov.f32 %f91, 0fB94D4153;
@%p20 bra $L__BB0_24;
mov.f32 %f60, 0fBAB607ED;
mov.f32 %f61, 0f37CBAC00;
fma.rn.f32 %f91, %f61, %f21, %f60;
$L__BB0_24:
selp.f32 %f62, 0f3C0885E4, 0f3D2AAABB, %p20;
fma.rn.f32 %f63, %f91, %f21, %f62;
selp.f32 %f64, 0fBE2AAAA8, 0fBEFFFFFF, %p20;
fma.rn.f32 %f65, %f63, %f21, %f64;
mov.f32 %f66, 0f00000000;
fma.rn.f32 %f67, %f21, %f20, %f66;
fma.rn.f32 %f92, %f65, %f67, %f20;
and.b32 %r131, %r38, 2;
setp.eq.s32 %p22, %r131, 0;
@%p22 bra $L__BB0_26;
mov.f32 %f69, 0fBF800000;
fma.rn.f32 %f92, %f92, %f69, %f66;
$L__BB0_26:
cvta.to.global.u64 %rd42, %rd16;
mul.wide.s32 %rd43, %r2, 4;
add.s64 %rd44, %rd42, %rd43;
st.global.f32 [%rd44], %f92;
$L__BB0_27:
setp.gt.s32 %p23, %r1, 29;
@%p23 bra $L__BB0_29;
mul.hi.s32 %r132, %r1, 1717986919;
shr.u32 %r133, %r132, 31;
shr.s32 %r134, %r132, 1;
add.s32 %r135, %r134, %r133;
mul.hi.s32 %r136, %r135, 1431655766;
shr.u32 %r137, %r136, 31;
add.s32 %r138, %r136, %r137;
mul.lo.s32 %r139, %r138, 3;
sub.s32 %r140, %r135, %r139;
mov.f32 %f70, 0f3F000000;
mov.f32 %f71, 0f3BBB989D;
fma.rn.f32 %f72, %f89, %f71, %f70;
cvt.sat.f32.f32 %f73, %f72;
mov.f32 %f74, 0f4B400001;
mov.f32 %f75, 0f437C0000;
fma.rm.f32 %f76, %f73, %f75, %f74;
add.f32 %f77, %f76, 0fCB40007F;
neg.f32 %f78, %f77;
mov.f32 %f79, 0f3FB8AA3B;
fma.rn.f32 %f80, %f89, %f79, %f78;
mov.f32 %f81, 0f32A57060;
fma.rn.f32 %f82, %f89, %f81, %f80;
mov.b32 %r141, %f76;
shl.b32 %r142, %r141, 23;
mov.b32 %f83, %r142;
ex2.approx.ftz.f32 %f84, %f82;
mul.f32 %f85, %f84, %f83;
mul.hi.s32 %r143, %r1, -2004318071;
add.s32 %r144, %r143, %r1;
shr.u32 %r145, %r144, 31;
shr.s32 %r146, %r144, 3;
add.s32 %r147, %r146, %r145;
mul.lo.s32 %r148, %r135, 5;
sub.s32 %r149, %r1, %r148;
mad.lo.s32 %r150, %r147, 15, %r149;
mad.lo.s32 %r151, %r140, 5, %r150;
cvta.to.global.u64 %rd45, %rd17;
mul.wide.s32 %rd46, %r151, 4;
add.s64 %rd47, %rd45, %rd46;
st.global.f32 [%rd47], %f85;
$L__BB0_29:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_160113std14__numeric_typeIvE5valueE = 1;
.global .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_1601111nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_1601111nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_1601111nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_1[24],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_1601111nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_2[24]
)
{
.local .align 4 .b8 __local_depot0[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<24>;
.reg .f32 %f<93>;
.reg .b32 %r<157>;
.reg .f64 %fd<5>;
.reg .b64 %rd<54>;
mov.u64 %SPL, __local_depot0;
ld.param.u64 %rd17, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_1601111nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_2];
ld.param.u64 %rd16, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_1601111nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_1];
ld.param.u64 %rd15, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_382_cu_3b4d9499_1601111nvfuser_382ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_0];
add.u64 %rd1, %SPL, 0;
mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 29;
add.s64 %rd2, %rd1, 24;
mov.f32 %f89, 0f00000000;
@%p1 bra $L__BB0_13;
cvta.to.global.u64 %rd19, %rd15;
mul.wide.s32 %rd20, %r1, 4;
add.s64 %rd21, %rd19, %rd20;
ld.global.f32 %f1, [%rd21];
mul.f32 %f28, %f1, 0f3F22F983;
cvt.rni.s32.f32 %r152, %f28;
cvt.rn.f32.s32 %f29, %r152;
mov.f32 %f30, 0fBFC90FDA;
fma.rn.f32 %f31, %f29, %f30, %f1;
mov.f32 %f32, 0fB3A22168;
fma.rn.f32 %f33, %f29, %f32, %f31;
mov.f32 %f34, 0fA7C234C5;
fma.rn.f32 %f86, %f29, %f34, %f33;
abs.f32 %f3, %f1;
setp.ltu.f32 %p2, %f3, 0f47CE4780;
@%p2 bra $L__BB0_9;
setp.eq.f32 %p3, %f3, 0f7F800000;
@%p3 bra $L__BB0_8;
bra.uni $L__BB0_3;
$L__BB0_8:
mov.f32 %f37, 0f00000000;
mul.rn.f32 %f86, %f1, %f37;
mov.u32 %r152, 0;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.b32 %r3, %f1;
shr.u32 %r68, %r3, 23;
and.b32 %r69, %r68, 255;
add.s32 %r4, %r69, -128;
shl.b32 %r70, %r3, 8;
or.b32 %r5, %r70, -2147483648;
shr.u32 %r6, %r4, 5;
mov.u64 %rd50, 0;
mov.u32 %r149, 0;
mov.u64 %rd49, __cudart_i2opi_f;
mov.u64 %rd48, %rd1;
$L__BB0_4:
.pragma "nounroll";
ld.global.nc.u32 %r71, [%rd49];
mad.wide.u32 %rd24, %r71, %r5, %rd50;
shr.u64 %rd50, %rd24, 32;
st.local.u32 [%rd48], %rd24;
add.s64 %rd49, %rd49, 4;
add.s64 %rd48, %rd48, 4;
add.s32 %r149, %r149, 1;
setp.ne.s32 %p4, %r149, 6;
@%p4 bra $L__BB0_4;
st.local.u32 [%rd2], %rd50;
mov.u32 %r72, 4;
sub.s32 %r9, %r72, %r6;
mov.u32 %r73, 6;
sub.s32 %r74, %r73, %r6;
mul.wide.s32 %rd25, %r74, 4;
add.s64 %rd26, %rd1, %rd25;
ld.local.u32 %r150, [%rd26];
ld.local.u32 %r151, [%rd26+-4];
and.b32 %r12, %r4, 31;
setp.eq.s32 %p5, %r12, 0;
@%p5 bra $L__BB0_7;
mov.u32 %r75, 32;
sub.s32 %r76, %r75, %r12;
shr.u32 %r77, %r151, %r76;
shl.b32 %r78, %r150, %r12;
add.s32 %r150, %r77, %r78;
mul.wide.s32 %rd27, %r9, 4;
add.s64 %rd28, %rd1, %rd27;
ld.local.u32 %r79, [%rd28];
shr.u32 %r80, %r79, %r76;
shl.b32 %r81, %r151, %r12;
add.s32 %r151, %r80, %r81;
$L__BB0_7:
and.b32 %r82, %r3, -2147483648;
shr.u32 %r83, %r151, 30;
shl.b32 %r84, %r150, 2;
or.b32 %r85, %r83, %r84;
shr.u32 %r86, %r85, 31;
shr.u32 %r87, %r150, 30;
add.s32 %r88, %r86, %r87;
neg.s32 %r89, %r88;
setp.eq.s32 %p6, %r82, 0;
selp.b32 %r152, %r88, %r89, %p6;
setp.ne.s32 %p7, %r86, 0;
xor.b32 %r90, %r82, -2147483648;
selp.b32 %r91, %r90, %r82, %p7;
selp.b32 %r92, -1, 0, %p7;
xor.b32 %r93, %r85, %r92;
shl.b32 %r94, %r151, 2;
xor.b32 %r95, %r94, %r92;
cvt.u64.u32 %rd29, %r93;
cvt.u64.u32 %rd30, %r95;
bfi.b64 %rd31, %rd29, %rd30, 32, 32;
cvt.rn.f64.s64 %fd1, %rd31;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f35, %fd2;
setp.eq.s32 %p8, %r91, 0;
neg.f32 %f36, %f35;
selp.f32 %f86, %f35, %f36, %p8;
$L__BB0_9:
and.b32 %r19, %r152, 1;
setp.eq.s32 %p9, %r19, 0;
selp.f32 %f7, %f86, 0f3F800000, %p9;
mul.rn.f32 %f8, %f86, %f86;
mov.f32 %f87, 0fB94D4153;
@%p9 bra $L__BB0_11;
mov.f32 %f39, 0fBAB607ED;
mov.f32 %f40, 0f37CBAC00;
fma.rn.f32 %f87, %f40, %f8, %f39;
$L__BB0_11:
selp.f32 %f41, 0f3C0885E4, 0f3D2AAABB, %p9;
fma.rn.f32 %f42, %f87, %f8, %f41;
selp.f32 %f43, 0fBE2AAAA8, 0fBEFFFFFF, %p9;
fma.rn.f32 %f44, %f42, %f8, %f43;
mov.f32 %f45, 0f00000000;
fma.rn.f32 %f46, %f8, %f7, %f45;
fma.rn.f32 %f89, %f44, %f46, %f7;
and.b32 %r97, %r152, 2;
setp.eq.s32 %p11, %r97, 0;
@%p11 bra $L__BB0_13;
mov.f32 %f48, 0fBF800000;
fma.rn.f32 %f89, %f89, %f48, %f45;
$L__BB0_13:
@%p1 bra $L__BB0_27;
mul.f32 %f49, %f89, 0f3F22F983;
cvt.rni.s32.f32 %r156, %f49;
cvt.rn.f32.s32 %f50, %r156;
mov.f32 %f51, 0fBFC90FDA;
fma.rn.f32 %f52, %f50, %f51, %f89;
mov.f32 %f53, 0fB3A22168;
fma.rn.f32 %f54, %f50, %f53, %f52;
mov.f32 %f55, 0fA7C234C5;
fma.rn.f32 %f90, %f50, %f55, %f54;
abs.f32 %f16, %f89;
setp.ltu.f32 %p13, %f16, 0f47CE4780;
@%p13 bra $L__BB0_22;
setp.eq.f32 %p14, %f16, 0f7F800000;
@%p14 bra $L__BB0_21;
bra.uni $L__BB0_16;
$L__BB0_21:
mov.f32 %f58, 0f00000000;
mul.rn.f32 %f90, %f89, %f58;
mov.u32 %r156, 0;
bra.uni $L__BB0_22;
$L__BB0_16:
mov.b32 %r21, %f89;
shr.u32 %r99, %r21, 23;
and.b32 %r100, %r99, 255;
add.s32 %r22, %r100, -128;
shl.b32 %r101, %r21, 8;
or.b32 %r23, %r101, -2147483648;
shr.u32 %r24, %r22, 5;
mov.u64 %rd53, 0;
mov.u32 %r153, 0;
mov.u64 %rd52, __cudart_i2opi_f;
mov.u64 %rd51, %rd1;
$L__BB0_17:
.pragma "nounroll";
ld.global.nc.u32 %r102, [%rd52];
mad.wide.u32 %rd34, %r102, %r23, %rd53;
shr.u64 %rd53, %rd34, 32;
st.local.u32 [%rd51], %rd34;
add.s64 %rd52, %rd52, 4;
add.s64 %rd51, %rd51, 4;
add.s32 %r153, %r153, 1;
setp.ne.s32 %p15, %r153, 6;
@%p15 bra $L__BB0_17;
st.local.u32 [%rd2], %rd53;
mov.u32 %r103, 4;
sub.s32 %r27, %r103, %r24;
mov.u32 %r104, 6;
sub.s32 %r105, %r104, %r24;
mul.wide.s32 %rd35, %r105, 4;
add.s64 %rd36, %rd1, %rd35;
ld.local.u32 %r154, [%rd36];
ld.local.u32 %r155, [%rd36+-4];
and.b32 %r30, %r22, 31;
setp.eq.s32 %p16, %r30, 0;
@%p16 bra $L__BB0_20;
mov.u32 %r106, 32;
sub.s32 %r107, %r106, %r30;
shr.u32 %r108, %r155, %r107;
shl.b32 %r109, %r154, %r30;
add.s32 %r154, %r108, %r109;
mul.wide.s32 %rd37, %r27, 4;
add.s64 %rd38, %rd1, %rd37;
ld.local.u32 %r110, [%rd38];
shr.u32 %r111, %r110, %r107;
shl.b32 %r112, %r155, %r30;
add.s32 %r155, %r111, %r112;
$L__BB0_20:
and.b32 %r113, %r21, -2147483648;
shr.u32 %r114, %r155, 30;
shl.b32 %r115, %r154, 2;
or.b32 %r116, %r114, %r115;
shr.u32 %r117, %r116, 31;
shr.u32 %r118, %r154, 30;
add.s32 %r119, %r117, %r118;
neg.s32 %r120, %r119;
setp.eq.s32 %p17, %r113, 0;
selp.b32 %r156, %r119, %r120, %p17;
setp.ne.s32 %p18, %r117, 0;
xor.b32 %r121, %r113, -2147483648;
selp.b32 %r122, %r121, %r113, %p18;
selp.b32 %r123, -1, 0, %p18;
xor.b32 %r124, %r116, %r123;
shl.b32 %r125, %r155, 2;
xor.b32 %r126, %r125, %r123;
cvt.u64.u32 %rd39, %r124;
cvt.u64.u32 %rd40, %r126;
bfi.b64 %rd41, %rd39, %rd40, 32, 32;
cvt.rn.f64.s64 %fd3, %rd41;
mul.f64 %fd4, %fd3, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f56, %fd4;
setp.eq.s32 %p19, %r122, 0;
neg.f32 %f57, %f56;
selp.f32 %f90, %f56, %f57, %p19;
$L__BB0_22:
add.s32 %r37, %r156, 1;
and.b32 %r38, %r37, 1;
setp.eq.s32 %p20, %r38, 0;
selp.f32 %f20, %f90, 0f3F800000, %p20;
mul.rn.f32 %f21, %f90, %f90;
mov.f32 %f91, 0fB94D4153;
@%p20 bra $L__BB0_24;
mov.f32 %f60, 0fBAB607ED;
mov.f32 %f61, 0f37CBAC00;
fma.rn.f32 %f91, %f61, %f21, %f60;
$L__BB0_24:
selp.f32 %f62, 0f3C0885E4, 0f3D2AAABB, %p20;
fma.rn.f32 %f63, %f91, %f21, %f62;
selp.f32 %f64, 0fBE2AAAA8, 0fBEFFFFFF, %p20;
fma.rn.f32 %f65, %f63, %f21, %f64;
mov.f32 %f66, 0f00000000;
fma.rn.f32 %f67, %f21, %f20, %f66;
fma.rn.f32 %f92, %f65, %f67, %f20;
and.b32 %r128, %r37, 2;
setp.eq.s32 %p22, %r128, 0;
@%p22 bra $L__BB0_26;
mov.f32 %f69, 0fBF800000;
fma.rn.f32 %f92, %f92, %f69, %f66;
$L__BB0_26:
cvta.to.global.u64 %rd42, %rd16;
mul.wide.s32 %rd43, %r1, 4;
add.s64 %rd44, %rd42, %rd43;
st.global.f32 [%rd44], %f92;
$L__BB0_27:
@%p1 bra $L__BB0_29;
mul.hi.s32 %r129, %r1, 1717986919;
shr.u32 %r130, %r129, 31;
shr.s32 %r131, %r129, 1;
add.s32 %r132, %r131, %r130;
mul.hi.s32 %r133, %r132, 1431655766;
shr.u32 %r134, %r133, 31;
add.s32 %r135, %r133, %r134;
mul.lo.s32 %r136, %r135, 3;
sub.s32 %r137, %r132, %r136;
mov.f32 %f70, 0f3F000000;
mov.f32 %f71, 0f3BBB989D;
fma.rn.f32 %f72, %f89, %f71, %f70;
cvt.sat.f32.f32 %f73, %f72;
mov.f32 %f74, 0f4B400001;
mov.f32 %f75, 0f437C0000;
fma.rm.f32 %f76, %f73, %f75, %f74;
add.f32 %f77, %f76, 0fCB40007F;
neg.f32 %f78, %f77;
mov.f32 %f79, 0f3FB8AA3B;
fma.rn.f32 %f80, %f89, %f79, %f78;
mov.f32 %f81, 0f32A57060;
fma.rn.f32 %f82, %f89, %f81, %f80;
mov.b32 %r138, %f76;
shl.b32 %r139, %r138, 23;
mov.b32 %f83, %r139;
ex2.approx.ftz.f32 %f84, %f82;
mul.f32 %f85, %f84, %f83;
mul.hi.s32 %r140, %r1, -2004318071;
add.s32 %r141, %r140, %r1;
shr.u32 %r142, %r141, 31;
shr.s32 %r143, %r141, 3;
add.s32 %r144, %r143, %r142;
mul.lo.s32 %r145, %r132, 5;
sub.s32 %r146, %r1, %r145;
mad.lo.s32 %r147, %r144, 15, %r146;
mad.lo.s32 %r148, %r137, 5, %r147;
cvta.to.global.u64 %rd45, %rd17;
mul.wide.s32 %rd46, %r148, 4;
add.s64 %rd47, %rd45, %rd46;
st.global.f32 [%rd47], %f85;
$L__BB0_29:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -24,36 +24,33 @@
.local .align 4 .b8 __local_depot0[28];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<24>;
.reg .f32 %f<93>;
- .reg .b32 %r<160>;
+ .reg .b32 %r<157>;
.reg .f64 %fd<5>;
.reg .b64 %rd<54>;
mov.u64 %SPL, __local_depot0;
ld.param.u64 %rd17, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_2];
ld.param.u64 %rd16, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_1];
ld.param.u64 %rd15, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEES2__param_0];
add.u64 %rd1, %SPL, 0;
- mov.u32 %r68, %ctaid.x;
- shl.b32 %r69, %r68, 7;
mov.u32 %r1, %tid.x;
- add.s32 %r2, %r69, %r1;
- setp.gt.s32 %p1, %r2, 29;
+ setp.gt.s32 %p1, %r1, 29;
add.s64 %rd2, %rd1, 24;
mov.f32 %f89, 0f00000000;
@%p1 bra $L__BB0_13;
cvta.to.global.u64 %rd19, %rd15;
- mul.wide.s32 %rd20, %r2, 4;
+ mul.wide.s32 %rd20, %r1, 4;
add.s64 %rd21, %rd19, %rd20;
ld.global.f32 %f1, [%rd21];
mul.f32 %f28, %f1, 0f3F22F983;
- cvt.rni.s32.f32 %r155, %f28;
- cvt.rn.f32.s32 %f29, %r155;
+ cvt.rni.s32.f32 %r152, %f28;
+ cvt.rn.f32.s32 %f29, %r152;
mov.f32 %f30, 0fBFC90FDA;
fma.rn.f32 %f31, %f29, %f30, %f1;
mov.f32 %f32, 0fB3A22168;
fma.rn.f32 %f33, %f29, %f32, %f31;
mov.f32 %f34, 0fA7C234C5;
@@ -67,94 +64,94 @@
bra.uni $L__BB0_3;
$L__BB0_8:
mov.f32 %f37, 0f00000000;
mul.rn.f32 %f86, %f1, %f37;
- mov.u32 %r155, 0;
+ mov.u32 %r152, 0;
bra.uni $L__BB0_9;
$L__BB0_3:
- mov.b32 %r4, %f1;
- shr.u32 %r71, %r4, 23;
- and.b32 %r72, %r71, 255;
- add.s32 %r5, %r72, -128;
- shl.b32 %r73, %r4, 8;
- or.b32 %r6, %r73, -2147483648;
- shr.u32 %r7, %r5, 5;
+ mov.b32 %r3, %f1;
+ shr.u32 %r68, %r3, 23;
+ and.b32 %r69, %r68, 255;
+ add.s32 %r4, %r69, -128;
+ shl.b32 %r70, %r3, 8;
+ or.b32 %r5, %r70, -2147483648;
+ shr.u32 %r6, %r4, 5;
mov.u64 %rd50, 0;
- mov.u32 %r152, 0;
+ mov.u32 %r149, 0;
mov.u64 %rd49, __cudart_i2opi_f;
mov.u64 %rd48, %rd1;
$L__BB0_4:
.pragma "nounroll";
- ld.global.nc.u32 %r74, [%rd49];
- mad.wide.u32 %rd24, %r74, %r6, %rd50;
+ ld.global.nc.u32 %r71, [%rd49];
+ mad.wide.u32 %rd24, %r71, %r5, %rd50;
shr.u64 %rd50, %rd24, 32;
st.local.u32 [%rd48], %rd24;
add.s64 %rd49, %rd49, 4;
add.s64 %rd48, %rd48, 4;
- add.s32 %r152, %r152, 1;
- setp.ne.s32 %p4, %r152, 6;
+ add.s32 %r149, %r149, 1;
+ setp.ne.s32 %p4, %r149, 6;
@%p4 bra $L__BB0_4;
st.local.u32 [%rd2], %rd50;
- mov.u32 %r75, 4;
- sub.s32 %r10, %r75, %r7;
- mov.u32 %r76, 6;
- sub.s32 %r77, %r76, %r7;
- mul.wide.s32 %rd25, %r77, 4;
+ mov.u32 %r72, 4;
+ sub.s32 %r9, %r72, %r6;
+ mov.u32 %r73, 6;
+ sub.s32 %r74, %r73, %r6;
+ mul.wide.s32 %rd25, %r74, 4;
add.s64 %rd26, %rd1, %rd25;
- ld.local.u32 %r153, [%rd26];
- ld.local.u32 %r154, [%rd26+-4];
- and.b32 %r13, %r5, 31;
- setp.eq.s32 %p5, %r13, 0;
+ ld.local.u32 %r150, [%rd26];
+ ld.local.u32 %r151, [%rd26+-4];
+ and.b32 %r12, %r4, 31;
+ setp.eq.s32 %p5, %r12, 0;
@%p5 bra $L__BB0_7;
- mov.u32 %r78, 32;
- sub.s32 %r79, %r78, %r13;
- shr.u32 %r80, %r154, %r79;
- shl.b32 %r81, %r153, %r13;
- add.s32 %r153, %r80, %r81;
- mul.wide.s32 %rd27, %r10, 4;
+ mov.u32 %r75, 32;
+ sub.s32 %r76, %r75, %r12;
+ shr.u32 %r77, %r151, %r76;
+ shl.b32 %r78, %r150, %r12;
+ add.s32 %r150, %r77, %r78;
+ mul.wide.s32 %rd27, %r9, 4;
add.s64 %rd28, %rd1, %rd27;
- ld.local.u32 %r82, [%rd28];
- shr.u32 %r83, %r82, %r79;
- shl.b32 %r84, %r154, %r13;
- add.s32 %r154, %r83, %r84;
+ ld.local.u32 %r79, [%rd28];
+ shr.u32 %r80, %r79, %r76;
+ shl.b32 %r81, %r151, %r12;
+ add.s32 %r151, %r80, %r81;
$L__BB0_7:
- and.b32 %r85, %r4, -2147483648;
- shr.u32 %r86, %r154, 30;
- shl.b32 %r87, %r153, 2;
- or.b32 %r88, %r86, %r87;
- shr.u32 %r89, %r88, 31;
- shr.u32 %r90, %r153, 30;
- add.s32 %r91, %r89, %r90;
- neg.s32 %r92, %r91;
- setp.eq.s32 %p6, %r85, 0;
- selp.b32 %r155, %r91, %r92, %p6;
- setp.ne.s32 %p7, %r89, 0;
- xor.b32 %r93, %r85, -2147483648;
- selp.b32 %r94, %r93, %r85, %p7;
- selp.b32 %r95, -1, 0, %p7;
- xor.b32 %r96, %r88, %r95;
- shl.b32 %r97, %r154, 2;
- xor.b32 %r98, %r97, %r95;
- cvt.u64.u32 %rd29, %r96;
- cvt.u64.u32 %rd30, %r98;
+ and.b32 %r82, %r3, -2147483648;
+ shr.u32 %r83, %r151, 30;
+ shl.b32 %r84, %r150, 2;
+ or.b32 %r85, %r83, %r84;
+ shr.u32 %r86, %r85, 31;
+ shr.u32 %r87, %r150, 30;
+ add.s32 %r88, %r86, %r87;
+ neg.s32 %r89, %r88;
+ setp.eq.s32 %p6, %r82, 0;
+ selp.b32 %r152, %r88, %r89, %p6;
+ setp.ne.s32 %p7, %r86, 0;
+ xor.b32 %r90, %r82, -2147483648;
+ selp.b32 %r91, %r90, %r82, %p7;
+ selp.b32 %r92, -1, 0, %p7;
+ xor.b32 %r93, %r85, %r92;
+ shl.b32 %r94, %r151, 2;
+ xor.b32 %r95, %r94, %r92;
+ cvt.u64.u32 %rd29, %r93;
+ cvt.u64.u32 %rd30, %r95;
bfi.b64 %rd31, %rd29, %rd30, 32, 32;
cvt.rn.f64.s64 %fd1, %rd31;
mul.f64 %fd2, %fd1, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f35, %fd2;
- setp.eq.s32 %p8, %r94, 0;
+ setp.eq.s32 %p8, %r91, 0;
neg.f32 %f36, %f35;
selp.f32 %f86, %f35, %f36, %p8;
$L__BB0_9:
- and.b32 %r20, %r155, 1;
- setp.eq.s32 %p9, %r20, 0;
+ and.b32 %r19, %r152, 1;
+ setp.eq.s32 %p9, %r19, 0;
selp.f32 %f7, %f86, 0f3F800000, %p9;
mul.rn.f32 %f8, %f86, %f86;
mov.f32 %f87, 0fB94D4153;
@%p9 bra $L__BB0_11;
@@ -168,23 +165,23 @@
selp.f32 %f43, 0fBE2AAAA8, 0fBEFFFFFF, %p9;
fma.rn.f32 %f44, %f42, %f8, %f43;
mov.f32 %f45, 0f00000000;
fma.rn.f32 %f46, %f8, %f7, %f45;
fma.rn.f32 %f89, %f44, %f46, %f7;
- and.b32 %r100, %r155, 2;
- setp.eq.s32 %p11, %r100, 0;
+ and.b32 %r97, %r152, 2;
+ setp.eq.s32 %p11, %r97, 0;
@%p11 bra $L__BB0_13;
mov.f32 %f48, 0fBF800000;
fma.rn.f32 %f89, %f89, %f48, %f45;
$L__BB0_13:
@%p1 bra $L__BB0_27;
mul.f32 %f49, %f89, 0f3F22F983;
- cvt.rni.s32.f32 %r159, %f49;
- cvt.rn.f32.s32 %f50, %r159;
+ cvt.rni.s32.f32 %r156, %f49;
+ cvt.rn.f32.s32 %f50, %r156;
mov.f32 %f51, 0fBFC90FDA;
fma.rn.f32 %f52, %f50, %f51, %f89;
mov.f32 %f53, 0fB3A22168;
fma.rn.f32 %f54, %f50, %f53, %f52;
mov.f32 %f55, 0fA7C234C5;
@@ -198,95 +195,95 @@
bra.uni $L__BB0_16;
$L__BB0_21:
mov.f32 %f58, 0f00000000;
mul.rn.f32 %f90, %f89, %f58;
- mov.u32 %r159, 0;
+ mov.u32 %r156, 0;
bra.uni $L__BB0_22;
$L__BB0_16:
- mov.b32 %r22, %f89;
- shr.u32 %r102, %r22, 23;
- and.b32 %r103, %r102, 255;
- add.s32 %r23, %r103, -128;
- shl.b32 %r104, %r22, 8;
- or.b32 %r24, %r104, -2147483648;
- shr.u32 %r25, %r23, 5;
+ mov.b32 %r21, %f89;
+ shr.u32 %r99, %r21, 23;
+ and.b32 %r100, %r99, 255;
+ add.s32 %r22, %r100, -128;
+ shl.b32 %r101, %r21, 8;
+ or.b32 %r23, %r101, -2147483648;
+ shr.u32 %r24, %r22, 5;
mov.u64 %rd53, 0;
- mov.u32 %r156, 0;
+ mov.u32 %r153, 0;
mov.u64 %rd52, __cudart_i2opi_f;
mov.u64 %rd51, %rd1;
$L__BB0_17:
.pragma "nounroll";
- ld.global.nc.u32 %r105, [%rd52];
- mad.wide.u32 %rd34, %r105, %r24, %rd53;
+ ld.global.nc.u32 %r102, [%rd52];
+ mad.wide.u32 %rd34, %r102, %r23, %rd53;
shr.u64 %rd53, %rd34, 32;
st.local.u32 [%rd51], %rd34;
add.s64 %rd52, %rd52, 4;
add.s64 %rd51, %rd51, 4;
- add.s32 %r156, %r156, 1;
- setp.ne.s32 %p15, %r156, 6;
+ add.s32 %r153, %r153, 1;
+ setp.ne.s32 %p15, %r153, 6;
@%p15 bra $L__BB0_17;
st.local.u32 [%rd2], %rd53;
- mov.u32 %r106, 4;
- sub.s32 %r28, %r106, %r25;
- mov.u32 %r107, 6;
- sub.s32 %r108, %r107, %r25;
- mul.wide.s32 %rd35, %r108, 4;
+ mov.u32 %r103, 4;
+ sub.s32 %r27, %r103, %r24;
+ mov.u32 %r104, 6;
+ sub.s32 %r105, %r104, %r24;
+ mul.wide.s32 %rd35, %r105, 4;
add.s64 %rd36, %rd1, %rd35;
- ld.local.u32 %r157, [%rd36];
- ld.local.u32 %r158, [%rd36+-4];
- and.b32 %r31, %r23, 31;
- setp.eq.s32 %p16, %r31, 0;
+ ld.local.u32 %r154, [%rd36];
+ ld.local.u32 %r155, [%rd36+-4];
+ and.b32 %r30, %r22, 31;
+ setp.eq.s32 %p16, %r30, 0;
@%p16 bra $L__BB0_20;
- mov.u32 %r109, 32;
- sub.s32 %r110, %r109, %r31;
- shr.u32 %r111, %r158, %r110;
- shl.b32 %r112, %r157, %r31;
- add.s32 %r157, %r111, %r112;
- mul.wide.s32 %rd37, %r28, 4;
+ mov.u32 %r106, 32;
+ sub.s32 %r107, %r106, %r30;
+ shr.u32 %r108, %r155, %r107;
+ shl.b32 %r109, %r154, %r30;
+ add.s32 %r154, %r108, %r109;
+ mul.wide.s32 %rd37, %r27, 4;
add.s64 %rd38, %rd1, %rd37;
- ld.local.u32 %r113, [%rd38];
- shr.u32 %r114, %r113, %r110;
- shl.b32 %r115, %r158, %r31;
- add.s32 %r158, %r114, %r115;
+ ld.local.u32 %r110, [%rd38];
+ shr.u32 %r111, %r110, %r107;
+ shl.b32 %r112, %r155, %r30;
+ add.s32 %r155, %r111, %r112;
$L__BB0_20:
- and.b32 %r116, %r22, -2147483648;
- shr.u32 %r117, %r158, 30;
- shl.b32 %r118, %r157, 2;
- or.b32 %r119, %r117, %r118;
- shr.u32 %r120, %r119, 31;
- shr.u32 %r121, %r157, 30;
- add.s32 %r122, %r120, %r121;
- neg.s32 %r123, %r122;
- setp.eq.s32 %p17, %r116, 0;
- selp.b32 %r159, %r122, %r123, %p17;
- setp.ne.s32 %p18, %r120, 0;
- xor.b32 %r124, %r116, -2147483648;
- selp.b32 %r125, %r124, %r116, %p18;
- selp.b32 %r126, -1, 0, %p18;
- xor.b32 %r127, %r119, %r126;
- shl.b32 %r128, %r158, 2;
- xor.b32 %r129, %r128, %r126;
- cvt.u64.u32 %rd39, %r127;
- cvt.u64.u32 %rd40, %r129;
+ and.b32 %r113, %r21, -2147483648;
+ shr.u32 %r114, %r155, 30;
+ shl.b32 %r115, %r154, 2;
+ or.b32 %r116, %r114, %r115;
+ shr.u32 %r117, %r116, 31;
+ shr.u32 %r118, %r154, 30;
+ add.s32 %r119, %r117, %r118;
+ neg.s32 %r120, %r119;
+ setp.eq.s32 %p17, %r113, 0;
+ selp.b32 %r156, %r119, %r120, %p17;
+ setp.ne.s32 %p18, %r117, 0;
+ xor.b32 %r121, %r113, -2147483648;
+ selp.b32 %r122, %r121, %r113, %p18;
+ selp.b32 %r123, -1, 0, %p18;
+ xor.b32 %r124, %r116, %r123;
+ shl.b32 %r125, %r155, 2;
+ xor.b32 %r126, %r125, %r123;
+ cvt.u64.u32 %rd39, %r124;
+ cvt.u64.u32 %rd40, %r126;
bfi.b64 %rd41, %rd39, %rd40, 32, 32;
cvt.rn.f64.s64 %fd3, %rd41;
mul.f64 %fd4, %fd3, 0d3BF921FB54442D19;
cvt.rn.f32.f64 %f56, %fd4;
- setp.eq.s32 %p19, %r125, 0;
+ setp.eq.s32 %p19, %r122, 0;
neg.f32 %f57, %f56;
selp.f32 %f90, %f56, %f57, %p19;
$L__BB0_22:
- add.s32 %r38, %r159, 1;
- and.b32 %r39, %r38, 1;
- setp.eq.s32 %p20, %r39, 0;
+ add.s32 %r37, %r156, 1;
+ and.b32 %r38, %r37, 1;
+ setp.eq.s32 %p20, %r38, 0;
selp.f32 %f20, %f90, 0f3F800000, %p20;
mul.rn.f32 %f21, %f90, %f90;
mov.f32 %f91, 0fB94D4153;
@%p20 bra $L__BB0_24;
@@ -300,36 +297,35 @@
selp.f32 %f64, 0fBE2AAAA8, 0fBEFFFFFF, %p20;
fma.rn.f32 %f65, %f63, %f21, %f64;
mov.f32 %f66, 0f00000000;
fma.rn.f32 %f67, %f21, %f20, %f66;
fma.rn.f32 %f92, %f65, %f67, %f20;
- and.b32 %r131, %r38, 2;
- setp.eq.s32 %p22, %r131, 0;
+ and.b32 %r128, %r37, 2;
+ setp.eq.s32 %p22, %r128, 0;
@%p22 bra $L__BB0_26;
mov.f32 %f69, 0fBF800000;
fma.rn.f32 %f92, %f92, %f69, %f66;
$L__BB0_26:
cvta.to.global.u64 %rd42, %rd16;
- mul.wide.s32 %rd43, %r2, 4;
+ mul.wide.s32 %rd43, %r1, 4;
add.s64 %rd44, %rd42, %rd43;
st.global.f32 [%rd44], %f92;
$L__BB0_27:
- setp.gt.s32 %p23, %r1, 29;
- @%p23 bra $L__BB0_29;
-
- mul.hi.s32 %r132, %r1, 1717986919;
- shr.u32 %r133, %r132, 31;
- shr.s32 %r134, %r132, 1;
- add.s32 %r135, %r134, %r133;
- mul.hi.s32 %r136, %r135, 1431655766;
- shr.u32 %r137, %r136, 31;
- add.s32 %r138, %r136, %r137;
- mul.lo.s32 %r139, %r138, 3;
- sub.s32 %r140, %r135, %r139;
+ @%p1 bra $L__BB0_29;
+
+ mul.hi.s32 %r129, %r1, 1717986919;
+ shr.u32 %r130, %r129, 31;
+ shr.s32 %r131, %r129, 1;
+ add.s32 %r132, %r131, %r130;
+ mul.hi.s32 %r133, %r132, 1431655766;
+ shr.u32 %r134, %r133, 31;
+ add.s32 %r135, %r133, %r134;
+ mul.lo.s32 %r136, %r135, 3;
+ sub.s32 %r137, %r132, %r136;
mov.f32 %f70, 0f3F000000;
mov.f32 %f71, 0f3BBB989D;
fma.rn.f32 %f72, %f89, %f71, %f70;
cvt.sat.f32.f32 %f73, %f72;
mov.f32 %f74, 0f4B400001;
@@ -339,26 +335,26 @@
neg.f32 %f78, %f77;
mov.f32 %f79, 0f3FB8AA3B;
fma.rn.f32 %f80, %f89, %f79, %f78;
mov.f32 %f81, 0f32A57060;
fma.rn.f32 %f82, %f89, %f81, %f80;
- mov.b32 %r141, %f76;
- shl.b32 %r142, %r141, 23;
- mov.b32 %f83, %r142;
+ mov.b32 %r138, %f76;
+ shl.b32 %r139, %r138, 23;
+ mov.b32 %f83, %r139;
ex2.approx.ftz.f32 %f84, %f82;
mul.f32 %f85, %f84, %f83;
- mul.hi.s32 %r143, %r1, -2004318071;
- add.s32 %r144, %r143, %r1;
- shr.u32 %r145, %r144, 31;
- shr.s32 %r146, %r144, 3;
- add.s32 %r147, %r146, %r145;
- mul.lo.s32 %r148, %r135, 5;
- sub.s32 %r149, %r1, %r148;
- mad.lo.s32 %r150, %r147, 15, %r149;
- mad.lo.s32 %r151, %r140, 5, %r150;
+ mul.hi.s32 %r140, %r1, -2004318071;
+ add.s32 %r141, %r140, %r1;
+ shr.u32 %r142, %r141, 31;
+ shr.s32 %r143, %r141, 3;
+ add.s32 %r144, %r143, %r142;
+ mul.lo.s32 %r145, %r132, 5;
+ sub.s32 %r146, %r1, %r145;
+ mad.lo.s32 %r147, %r144, 15, %r146;
+ mad.lo.s32 %r148, %r137, 5, %r147;
cvta.to.global.u64 %rd45, %rd17;
- mul.wide.s32 %rd46, %r151, 4;
+ mul.wide.s32 %rd46, %r148, 4;
add.s64 %rd47, %rd45, %rd46;
st.global.f32 [%rd47], %f85;
$L__BB0_29:
ret;
12: GpuViewTest.ReplacedScalarInSplitOutput
Kernel 2
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 10
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__half, 1, 1> T1, Tensor<float, 2, 2> T4, Tensor<float, 2, 2> T7) {
if ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (32 * (ceilDiv(T1.logical_size[0LL], 32)))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (32 * T4.logical_size[1LL])))) {
Array<__half, 1, 1> T10;
T10[0] = 0;
T10[0]
= T1[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T4[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))];
Array<__half, 1, 1> T9;
T9[0]
= T10[0];
Array<float, 1, 1> T6;
T6[0]
= __half2float(T9[0]);
Array<float, 1, 1> T12;
T12[0]
= T11[0]
* T6[0];
T7[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
__global__ void nvfuser_N(Tensor<__half, 1, 1> T1, Tensor<float, 2, 2> T4, Tensor<float, 2, 2> T7) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (32 * T4.logical_size[1LL]))) {
Array<__half, 1, 1> T10;
T10[0] = 0;
T10[0]
= T1[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T11;
T11[0] = 0;
T11[0]
= T4[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))];
Array<__half, 1, 1> T9;
T9[0]
= T10[0];
Array<float, 1, 1> T6;
T6[0]
= __half2float(T9[0]);
Array<float, 1, 1> T12;
T12[0]
= T11[0]
* T6[0];
T7[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T12[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,7 +1,7 @@
__global__ void nvfuser_N(Tensor<__half, 1, 1> T1, Tensor<float, 2, 2> T4, Tensor<float, 2, 2> T7) {
- if ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (32 * (ceilDiv(T1.logical_size[0LL], 32)))) && ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (32 * T4.logical_size[1LL])))) {
+ if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < (32 * T4.logical_size[1LL]))) {
Array<__half, 1, 1> T10;
T10[0] = 0;
T10[0]
= T1[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T11;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_1911011nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_1911011nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_1911011nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_1[24],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_1911011nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_2[24]
)
{
.reg .pred %p<3>;
.reg .b16 %rs<3>;
.reg .f32 %f<5>;
.reg .b32 %r<31>;
.reg .b64 %rd<12>;
ld.param.v2.u32 {%r12, %r13}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_1911011nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_0+8];
ld.param.v2.u32 {%r14, %r15}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_1911011nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_1+8];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_1911011nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_1911011nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_9e2704e6_1911011nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_0];
mov.u32 %r22, %ctaid.x;
shl.b32 %r23, %r22, 7;
mov.u32 %r24, %tid.x;
add.s32 %r1, %r23, %r24;
add.s32 %r25, %r12, 31;
shr.s32 %r26, %r25, 31;
shr.u32 %r27, %r26, 27;
add.s32 %r28, %r25, %r27;
and.b32 %r29, %r28, -32;
setp.ge.s32 %p1, %r1, %r29;
@%p1 bra $L__BB0_3;
shl.b32 %r30, %r15, 5;
setp.ge.s32 %p2, %r1, %r30;
@%p2 bra $L__BB0_3;
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 2;
add.s64 %rd6, %rd4, %rd5;
ld.global.u16 %rs2, [%rd6];
cvta.to.global.u64 %rd7, %rd2;
mul.wide.s32 %rd8, %r1, 4;
add.s64 %rd9, %rd7, %rd8;
ld.global.f32 %f3, [%rd9];
// begin inline asm
{ cvt.f32.f16 %f2, %rs2;}
// end inline asm
mul.f32 %f4, %f3, %f2;
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd8;
st.global.f32 [%rd11], %f4;
$L__BB0_3:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_1601111nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_1601111nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_0[16],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_1601111nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_1[24],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_1601111nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_2[24]
)
{
.reg .pred %p<2>;
.reg .b16 %rs<3>;
.reg .f32 %f<5>;
.reg .b32 %r<26>;
.reg .b64 %rd<12>;
ld.param.v2.u32 {%r14, %r15}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_1601111nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_1+8];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_1601111nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_1601111nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_384_cu_3b4d9499_1601111nvfuser_384ENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_0];
mov.u32 %r22, %ctaid.x;
shl.b32 %r23, %r22, 7;
mov.u32 %r24, %tid.x;
add.s32 %r1, %r23, %r24;
shl.b32 %r25, %r15, 5;
setp.ge.s32 %p1, %r1, %r25;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd1;
mul.wide.s32 %rd7, %r1, 2;
add.s64 %rd8, %rd6, %rd7;
ld.global.u16 %rs2, [%rd8];
mul.wide.s32 %rd9, %r1, 4;
add.s64 %rd10, %rd4, %rd9;
ld.global.f32 %f3, [%rd10];
// begin inline asm
{ cvt.f32.f16 %f2, %rs2;}
// end inline asm
mul.f32 %f4, %f3, %f2;
add.s64 %rd11, %rd5, %rd9;
st.global.f32 [%rd11], %f4;
$L__BB0_2:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -18,56 +18,47 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_0[16],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_1[24],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_2[24]
)
{
- .reg .pred %p<3>;
+ .reg .pred %p<2>;
.reg .b16 %rs<3>;
.reg .f32 %f<5>;
- .reg .b32 %r<31>;
+ .reg .b32 %r<26>;
.reg .b64 %rd<12>;
- ld.param.v2.u32 {%r12, %r13}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_0+8];
ld.param.v2.u32 {%r14, %r15}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_1+8];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi1ELi1EEENS0_IfLi2ELi2EEES3__param_0];
mov.u32 %r22, %ctaid.x;
shl.b32 %r23, %r22, 7;
mov.u32 %r24, %tid.x;
add.s32 %r1, %r23, %r24;
- add.s32 %r25, %r12, 31;
- shr.s32 %r26, %r25, 31;
- shr.u32 %r27, %r26, 27;
- add.s32 %r28, %r25, %r27;
- and.b32 %r29, %r28, -32;
- setp.ge.s32 %p1, %r1, %r29;
- @%p1 bra $L__BB0_3;
+ shl.b32 %r25, %r15, 5;
+ setp.ge.s32 %p1, %r1, %r25;
+ @%p1 bra $L__BB0_2;
- shl.b32 %r30, %r15, 5;
- setp.ge.s32 %p2, %r1, %r30;
- @%p2 bra $L__BB0_3;
-
- cvta.to.global.u64 %rd4, %rd1;
- mul.wide.s32 %rd5, %r1, 2;
- add.s64 %rd6, %rd4, %rd5;
- ld.global.u16 %rs2, [%rd6];
- cvta.to.global.u64 %rd7, %rd2;
- mul.wide.s32 %rd8, %r1, 4;
- add.s64 %rd9, %rd7, %rd8;
- ld.global.f32 %f3, [%rd9];
+ cvta.to.global.u64 %rd4, %rd2;
+ cvta.to.global.u64 %rd5, %rd3;
+ cvta.to.global.u64 %rd6, %rd1;
+ mul.wide.s32 %rd7, %r1, 2;
+ add.s64 %rd8, %rd6, %rd7;
+ ld.global.u16 %rs2, [%rd8];
+ mul.wide.s32 %rd9, %r1, 4;
+ add.s64 %rd10, %rd4, %rd9;
+ ld.global.f32 %f3, [%rd10];
{ cvt.f32.f16 %f2, %rs2;}
mul.f32 %f4, %f3, %f2;
- cvta.to.global.u64 %rd10, %rd3;
- add.s64 %rd11, %rd10, %rd8;
+ add.s64 %rd11, %rd5, %rd9;
st.global.f32 [%rd11], %f4;
-$L__BB0_3:
+$L__BB0_2:
ret;
}
13: ReshapeReduction.FusionReshapeReduction/42
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 16
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T6) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) / 20))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) % 5)))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
= T0[((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) / 20))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T8[0]
+ T9[0];
Array<float, 1, 1> T10;
T10[0]
= T2[0];
T6[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T10[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T6) {
if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T1[((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 60) / 20))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) % 5)))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
= T0[((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 60) / 20))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T8[0]
+ T9[0];
Array<float, 1, 1> T10;
T10[0]
= T2[0];
T6[((nvfuser_index_t)threadIdx.x)]
= T10[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,21 +1,21 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T6) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
- = T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) / 20))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) % 5)))];
+ = T1[((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 60) / 20))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) % 5)))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
- = T0[((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) / 20))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) % 5)))];
+ = T0[((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 60) / 20))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T8[0]
+ T9[0];
Array<float, 1, 1> T10;
T10[0]
= T2[0];
- T6[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T6[((nvfuser_index_t)threadIdx.x)]
= T10[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_2[40]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
.reg .b32 %r<80>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r30, %r31}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_0+24];
ld.param.v2.u32 {%r32, %r33}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_0+32];
ld.param.v2.u32 {%r38, %r39}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_1+24];
ld.param.v2.u32 {%r40, %r41}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_465d07a7_1911011nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_0];
mov.u32 %r50, %ctaid.x;
shl.b32 %r51, %r50, 7;
mov.u32 %r52, %tid.x;
add.s32 %r1, %r51, %r52;
setp.gt.s32 %p1, %r1, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
mul.hi.s32 %r53, %r1, -2004318071;
add.s32 %r54, %r53, %r1;
shr.u32 %r55, %r54, 31;
shr.s32 %r56, %r54, 5;
add.s32 %r57, %r56, %r55;
mul.lo.s32 %r58, %r57, 60;
sub.s32 %r59, %r1, %r58;
mul.hi.s32 %r60, %r59, 1717986919;
shr.u32 %r61, %r60, 31;
shr.s32 %r62, %r60, 3;
add.s32 %r63, %r62, %r61;
mul.lo.s32 %r64, %r39, %r63;
mad.lo.s32 %r65, %r38, %r57, %r64;
mul.lo.s32 %r66, %r63, 20;
sub.s32 %r67, %r59, %r66;
mul.hi.s32 %r68, %r67, 1717986919;
shr.u32 %r69, %r68, 31;
shr.s32 %r70, %r68, 1;
add.s32 %r71, %r70, %r69;
mad.lo.s32 %r72, %r40, %r71, %r65;
mul.lo.s32 %r73, %r71, 5;
sub.s32 %r74, %r67, %r73;
mad.lo.s32 %r75, %r41, %r74, %r72;
mul.wide.s32 %rd7, %r75, 4;
add.s64 %rd8, %rd4, %rd7;
mul.lo.s32 %r76, %r31, %r63;
mad.lo.s32 %r77, %r30, %r57, %r76;
mad.lo.s32 %r78, %r71, %r32, %r77;
mad.lo.s32 %r79, %r74, %r33, %r78;
mul.wide.s32 %rd9, %r79, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd6, %rd11;
st.global.f32 [%rd12], %f3;
$L__BB0_2:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_2[40]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
.reg .b32 %r<77>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r30, %r31}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_0+24];
ld.param.v2.u32 {%r32, %r33}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_0+32];
ld.param.v2.u32 {%r38, %r39}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_1+24];
ld.param.v2.u32 {%r40, %r41}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_448_cu_3024a27b_1601111nvfuser_448ENS_6TensorIfLi4ELi4EEES1_S1__param_0];
mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
mul.hi.s32 %r50, %r1, -2004318071;
add.s32 %r51, %r50, %r1;
shr.u32 %r52, %r51, 31;
shr.s32 %r53, %r51, 5;
add.s32 %r54, %r53, %r52;
mul.lo.s32 %r55, %r54, 60;
sub.s32 %r56, %r1, %r55;
mul.hi.s32 %r57, %r56, 1717986919;
shr.u32 %r58, %r57, 31;
shr.s32 %r59, %r57, 3;
add.s32 %r60, %r59, %r58;
mul.lo.s32 %r61, %r39, %r60;
mad.lo.s32 %r62, %r38, %r54, %r61;
mul.lo.s32 %r63, %r60, 20;
sub.s32 %r64, %r56, %r63;
mul.hi.s32 %r65, %r64, 1717986919;
shr.u32 %r66, %r65, 31;
shr.s32 %r67, %r65, 1;
add.s32 %r68, %r67, %r66;
mad.lo.s32 %r69, %r40, %r68, %r62;
mul.lo.s32 %r70, %r68, 5;
sub.s32 %r71, %r64, %r70;
mad.lo.s32 %r72, %r41, %r71, %r69;
mul.wide.s32 %rd7, %r72, 4;
add.s64 %rd8, %rd4, %rd7;
mul.lo.s32 %r73, %r31, %r60;
mad.lo.s32 %r74, %r30, %r54, %r73;
mad.lo.s32 %r75, %r32, %r68, %r74;
mad.lo.s32 %r76, %r33, %r71, %r75;
mul.wide.s32 %rd9, %r76, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd6, %rd11;
st.global.f32 [%rd12], %f3;
$L__BB0_2:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,61 +20,58 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_2[40]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
- .reg .b32 %r<80>;
+ .reg .b32 %r<77>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r30, %r31}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_0+24];
ld.param.v2.u32 {%r32, %r33}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_0+32];
ld.param.v2.u32 {%r38, %r39}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_1+24];
ld.param.v2.u32 {%r40, %r41}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_1+32];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_0];
- mov.u32 %r50, %ctaid.x;
- shl.b32 %r51, %r50, 7;
- mov.u32 %r52, %tid.x;
- add.s32 %r1, %r51, %r52;
+ mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
- mul.hi.s32 %r53, %r1, -2004318071;
- add.s32 %r54, %r53, %r1;
- shr.u32 %r55, %r54, 31;
- shr.s32 %r56, %r54, 5;
- add.s32 %r57, %r56, %r55;
- mul.lo.s32 %r58, %r57, 60;
- sub.s32 %r59, %r1, %r58;
- mul.hi.s32 %r60, %r59, 1717986919;
- shr.u32 %r61, %r60, 31;
- shr.s32 %r62, %r60, 3;
- add.s32 %r63, %r62, %r61;
- mul.lo.s32 %r64, %r39, %r63;
- mad.lo.s32 %r65, %r38, %r57, %r64;
- mul.lo.s32 %r66, %r63, 20;
- sub.s32 %r67, %r59, %r66;
- mul.hi.s32 %r68, %r67, 1717986919;
- shr.u32 %r69, %r68, 31;
- shr.s32 %r70, %r68, 1;
- add.s32 %r71, %r70, %r69;
- mad.lo.s32 %r72, %r40, %r71, %r65;
- mul.lo.s32 %r73, %r71, 5;
- sub.s32 %r74, %r67, %r73;
- mad.lo.s32 %r75, %r41, %r74, %r72;
- mul.wide.s32 %rd7, %r75, 4;
+ mul.hi.s32 %r50, %r1, -2004318071;
+ add.s32 %r51, %r50, %r1;
+ shr.u32 %r52, %r51, 31;
+ shr.s32 %r53, %r51, 5;
+ add.s32 %r54, %r53, %r52;
+ mul.lo.s32 %r55, %r54, 60;
+ sub.s32 %r56, %r1, %r55;
+ mul.hi.s32 %r57, %r56, 1717986919;
+ shr.u32 %r58, %r57, 31;
+ shr.s32 %r59, %r57, 3;
+ add.s32 %r60, %r59, %r58;
+ mul.lo.s32 %r61, %r39, %r60;
+ mad.lo.s32 %r62, %r38, %r54, %r61;
+ mul.lo.s32 %r63, %r60, 20;
+ sub.s32 %r64, %r56, %r63;
+ mul.hi.s32 %r65, %r64, 1717986919;
+ shr.u32 %r66, %r65, 31;
+ shr.s32 %r67, %r65, 1;
+ add.s32 %r68, %r67, %r66;
+ mad.lo.s32 %r69, %r40, %r68, %r62;
+ mul.lo.s32 %r70, %r68, 5;
+ sub.s32 %r71, %r64, %r70;
+ mad.lo.s32 %r72, %r41, %r71, %r69;
+ mul.wide.s32 %rd7, %r72, 4;
add.s64 %rd8, %rd4, %rd7;
- mul.lo.s32 %r76, %r31, %r63;
- mad.lo.s32 %r77, %r30, %r57, %r76;
- mad.lo.s32 %r78, %r71, %r32, %r77;
- mad.lo.s32 %r79, %r74, %r33, %r78;
- mul.wide.s32 %rd9, %r79, 4;
+ mul.lo.s32 %r73, %r31, %r60;
+ mad.lo.s32 %r74, %r30, %r54, %r73;
+ mad.lo.s32 %r75, %r32, %r68, %r74;
+ mad.lo.s32 %r76, %r33, %r71, %r75;
+ mul.wide.s32 %rd9, %r76, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
mul.wide.s32 %rd11, %r1, 4;
14: ReshapeReduction.FusionReshapeReduction/43
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 0
gmem: 0
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 4, 4> T5) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0)) < 4))) {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0)) < 4))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
}
}
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T7[0], T8[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
T5[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T7[0];
}
}
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 4, 4> T5) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
}
}
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T7[0], T8[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
T5[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T7[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -3,25 +3,25 @@
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
- if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0)) < 4))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
- loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0))]);
+ loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
- if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0)) < 4))) {
- loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0))]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
+ loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_1911011nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_1911011nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_1911011nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1[40]
)
{
.reg .pred %p<16>;
.reg .f32 %f<52>;
.reg .b32 %r<52>;
.reg .b64 %rd<22>;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_1911011nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_1911011nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0];
mov.u32 %r1, %ntid.x;
mov.u32 %r28, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r28, %r3;
setp.gt.s32 %p1, %r4, 241;
@%p1 bra $L__BB0_2;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p2, %r5, 1;
@%p2 bra $L__BB0_6;
$L__BB0_2:
setp.lt.s32 %p3, %r4, 242;
mov.f32 %f46, 0f00000000;
mov.f32 %f47, 0f00000000;
mov.f32 %f48, 0f00000000;
mov.f32 %f49, 0f00000000;
@%p3 bra $L__BB0_3;
bra.uni $L__BB0_5;
$L__BB0_3:
mov.u32 %r6, %tid.x;
setp.gt.s32 %p4, %r6, 0;
@%p4 bra $L__BB0_5;
add.s32 %r33, %r4, %r6;
shl.b32 %r34, %r33, 2;
mul.wide.s32 %rd5, %r34, 4;
add.s64 %rd4, %rd1, %rd5;
// begin inline asm
ld.global.cs.v4.u32 {%r29,%r30,%r31,%r32}, [%rd4];
// end inline asm
mov.b32 %f23, %r29;
add.f32 %f49, %f23, 0f00000000;
mov.b32 %f48, %r30;
mov.b32 %f47, %r31;
mov.b32 %f46, %r32;
$L__BB0_5:
add.f32 %f24, %f49, %f48;
add.f32 %f25, %f24, %f47;
add.f32 %f50, %f25, %f46;
$L__BB0_7:
mov.u32 %r41, %tid.z;
mad.lo.s32 %r42, %r2, %r41, %r3;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r8, %r42, %r1, %r7;
mul.wide.u32 %rd8, %r8, 4;
mov.u64 %rd9, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_465d07a7_191105arrayE;
add.s64 %rd2, %rd9, %rd8;
st.shared.f32 [%rd2], %f50;
bar.sync 0;
clz.b32 %r43, %r1;
mov.u32 %r44, 31;
sub.s32 %r45, %r44, %r43;
mov.u32 %r46, 1;
shl.b32 %r51, %r46, %r45;
setp.ge.u32 %p5, %r7, %r51;
add.s32 %r47, %r51, %r7;
setp.ge.u32 %p6, %r47, %r1;
or.pred %p7, %p5, %p6;
@%p7 bra $L__BB0_9;
add.s32 %r48, %r8, %r51;
mul.wide.s32 %rd10, %r48, 4;
add.s64 %rd12, %rd9, %rd10;
ld.shared.f32 %f33, [%rd2];
ld.shared.f32 %f34, [%rd12];
add.f32 %f35, %f34, %f33;
st.shared.f32 [%rd2], %f35;
$L__BB0_9:
bar.sync 0;
setp.lt.s32 %p8, %r51, 4;
@%p8 bra $L__BB0_13;
$L__BB0_10:
shr.u32 %r11, %r51, 1;
setp.ge.u32 %p9, %r7, %r11;
@%p9 bra $L__BB0_12;
add.s32 %r49, %r11, %r8;
mul.wide.s32 %rd13, %r49, 4;
add.s64 %rd15, %rd9, %rd13;
ld.shared.f32 %f36, [%rd2];
ld.shared.f32 %f37, [%rd15];
add.f32 %f38, %f37, %f36;
st.shared.f32 [%rd2], %f38;
$L__BB0_12:
bar.sync 0;
setp.gt.u32 %p10, %r51, 7;
mov.u32 %r51, %r11;
@%p10 bra $L__BB0_10;
$L__BB0_13:
setp.ne.s32 %p11, %r7, 0;
mov.f32 %f51, 0f00000000;
@%p11 bra $L__BB0_16;
ld.shared.f32 %f40, [%rd2];
add.f32 %f51, %f40, 0f00000000;
setp.lt.u32 %p12, %r1, 2;
@%p12 bra $L__BB0_16;
add.s32 %r50, %r8, 1;
mul.wide.u32 %rd16, %r50, 4;
add.s64 %rd18, %rd9, %rd16;
ld.shared.f32 %f41, [%rd18];
add.f32 %f51, %f51, %f41;
$L__BB0_16:
bar.sync 0;
or.pred %p15, %p11, %p1;
@%p15 bra $L__BB0_18;
cvta.to.global.u64 %rd19, %rd3;
mul.wide.s32 %rd20, %r4, 4;
add.s64 %rd21, %rd19, %rd20;
st.global.f32 [%rd21], %f51;
$L__BB0_18:
ret;
$L__BB0_6:
add.s32 %r39, %r4, %r5;
shl.b32 %r40, %r39, 2;
mul.wide.s32 %rd7, %r40, 4;
add.s64 %rd6, %rd1, %rd7;
// begin inline asm
ld.global.cs.v4.u32 {%r35,%r36,%r37,%r38}, [%rd6];
// end inline asm
mov.b32 %f26, %r35;
add.f32 %f27, %f26, 0f00000000;
mov.b32 %f28, %r36;
add.f32 %f29, %f27, %f28;
mov.b32 %f30, %r37;
add.f32 %f31, %f29, %f30;
mov.b32 %f32, %r38;
add.f32 %f50, %f31, %f32;
bra.uni $L__BB0_7;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_1601111nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_1601111nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_1601111nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1[40]
)
{
.reg .pred %p<13>;
.reg .f32 %f<25>;
.reg .b32 %r<63>;
.reg .b64 %rd<20>;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_1601111nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_1601111nvfuser_449ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0];
mov.u32 %r1, %ntid.x;
mov.u32 %r46, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r46, %r3;
setp.gt.s32 %p1, %r4, 241;
mov.f32 %f23, 0f00000000;
@%p1 bra $L__BB0_2;
shl.b32 %r51, %r4, 2;
mul.wide.s32 %rd5, %r51, 4;
add.s64 %rd4, %rd2, %rd5;
// begin inline asm
ld.global.cs.v4.u32 {%r47,%r48,%r49,%r50}, [%rd4];
// end inline asm
mov.b32 %f7, %r47;
add.f32 %f8, %f7, 0f00000000;
mov.b32 %f9, %r48;
add.f32 %f10, %f8, %f9;
mov.b32 %f11, %r49;
add.f32 %f12, %f10, %f11;
mov.b32 %f13, %r50;
add.f32 %f23, %f12, %f13;
$L__BB0_2:
mov.u32 %r52, %tid.z;
mad.lo.s32 %r53, %r2, %r52, %r3;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r6, %r53, %r1, %r5;
mul.wide.u32 %rd6, %r6, 4;
mov.u64 %rd7, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_449_cu_3024a27b_160115arrayE;
add.s64 %rd1, %rd7, %rd6;
st.shared.f32 [%rd1], %f23;
bar.sync 0;
clz.b32 %r54, %r1;
mov.u32 %r55, 31;
sub.s32 %r56, %r55, %r54;
mov.u32 %r57, 1;
shl.b32 %r62, %r57, %r56;
setp.ge.u32 %p2, %r5, %r62;
add.s32 %r58, %r62, %r5;
setp.ge.u32 %p3, %r58, %r1;
or.pred %p4, %p2, %p3;
@%p4 bra $L__BB0_4;
add.s32 %r59, %r6, %r62;
mul.wide.s32 %rd8, %r59, 4;
add.s64 %rd10, %rd7, %rd8;
ld.shared.f32 %f14, [%rd1];
ld.shared.f32 %f15, [%rd10];
add.f32 %f16, %f15, %f14;
st.shared.f32 [%rd1], %f16;
$L__BB0_4:
bar.sync 0;
setp.lt.s32 %p5, %r62, 4;
@%p5 bra $L__BB0_8;
$L__BB0_5:
shr.u32 %r9, %r62, 1;
setp.ge.u32 %p6, %r5, %r9;
@%p6 bra $L__BB0_7;
add.s32 %r60, %r9, %r6;
mul.wide.s32 %rd11, %r60, 4;
add.s64 %rd13, %rd7, %rd11;
ld.shared.f32 %f17, [%rd1];
ld.shared.f32 %f18, [%rd13];
add.f32 %f19, %f18, %f17;
st.shared.f32 [%rd1], %f19;
$L__BB0_7:
bar.sync 0;
setp.gt.u32 %p7, %r62, 7;
mov.u32 %r62, %r9;
@%p7 bra $L__BB0_5;
$L__BB0_8:
setp.ne.s32 %p8, %r5, 0;
mov.f32 %f24, 0f00000000;
@%p8 bra $L__BB0_11;
ld.shared.f32 %f21, [%rd1];
add.f32 %f24, %f21, 0f00000000;
setp.lt.u32 %p9, %r1, 2;
@%p9 bra $L__BB0_11;
add.s32 %r61, %r6, 1;
mul.wide.u32 %rd14, %r61, 4;
add.s64 %rd16, %rd7, %rd14;
ld.shared.f32 %f22, [%rd16];
add.f32 %f24, %f24, %f22;
$L__BB0_11:
bar.sync 0;
or.pred %p12, %p8, %p1;
@%p12 bra $L__BB0_13;
cvta.to.global.u64 %rd17, %rd3;
mul.wide.s32 %rd18, %r4, 4;
add.s64 %rd19, %rd17, %rd18;
st.global.f32 [%rd19], %f24;
$L__BB0_13:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -18,160 +18,122 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0[48],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1[40]
)
{
- .reg .pred %p<16>;
- .reg .f32 %f<52>;
- .reg .b32 %r<52>;
- .reg .b64 %rd<22>;
+ .reg .pred %p<13>;
+ .reg .f32 %f<25>;
+ .reg .b32 %r<63>;
+ .reg .b64 %rd<20>;
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1];
- ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0];
+ ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0];
mov.u32 %r1, %ntid.x;
- mov.u32 %r28, %ctaid.x;
+ mov.u32 %r46, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
- mad.lo.s32 %r4, %r2, %r28, %r3;
+ mad.lo.s32 %r4, %r2, %r46, %r3;
setp.gt.s32 %p1, %r4, 241;
+ mov.f32 %f23, 0f00000000;
@%p1 bra $L__BB0_2;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p2, %r5, 1;
- @%p2 bra $L__BB0_6;
+ shl.b32 %r51, %r4, 2;
+ mul.wide.s32 %rd5, %r51, 4;
+ add.s64 %rd4, %rd2, %rd5;
+
+ ld.global.cs.v4.u32 {%r47,%r48,%r49,%r50}, [%rd4];
+
+ mov.b32 %f7, %r47;
+ add.f32 %f8, %f7, 0f00000000;
+ mov.b32 %f9, %r48;
+ add.f32 %f10, %f8, %f9;
+ mov.b32 %f11, %r49;
+ add.f32 %f12, %f10, %f11;
+ mov.b32 %f13, %r50;
+ add.f32 %f23, %f12, %f13;
$L__BB0_2:
- setp.lt.s32 %p3, %r4, 242;
- mov.f32 %f46, 0f00000000;
- mov.f32 %f47, 0f00000000;
- mov.f32 %f48, 0f00000000;
- mov.f32 %f49, 0f00000000;
- @%p3 bra $L__BB0_3;
- bra.uni $L__BB0_5;
+ mov.u32 %r52, %tid.z;
+ mad.lo.s32 %r53, %r2, %r52, %r3;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r6, %r53, %r1, %r5;
+ mul.wide.u32 %rd6, %r6, 4;
+ mov.u64 %rd7, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd7, %rd6;
+ st.shared.f32 [%rd1], %f23;
+ bar.sync 0;
+ clz.b32 %r54, %r1;
+ mov.u32 %r55, 31;
+ sub.s32 %r56, %r55, %r54;
+ mov.u32 %r57, 1;
+ shl.b32 %r62, %r57, %r56;
+ setp.ge.u32 %p2, %r5, %r62;
+ add.s32 %r58, %r62, %r5;
+ setp.ge.u32 %p3, %r58, %r1;
+ or.pred %p4, %p2, %p3;
+ @%p4 bra $L__BB0_4;
-$L__BB0_3:
- mov.u32 %r6, %tid.x;
- setp.gt.s32 %p4, %r6, 0;
- @%p4 bra $L__BB0_5;
+ add.s32 %r59, %r6, %r62;
+ mul.wide.s32 %rd8, %r59, 4;
+ add.s64 %rd10, %rd7, %rd8;
+ ld.shared.f32 %f14, [%rd1];
+ ld.shared.f32 %f15, [%rd10];
+ add.f32 %f16, %f15, %f14;
+ st.shared.f32 [%rd1], %f16;
- add.s32 %r33, %r4, %r6;
- shl.b32 %r34, %r33, 2;
- mul.wide.s32 %rd5, %r34, 4;
- add.s64 %rd4, %rd1, %rd5;
-
- ld.global.cs.v4.u32 {%r29,%r30,%r31,%r32}, [%rd4];
-
- mov.b32 %f23, %r29;
- add.f32 %f49, %f23, 0f00000000;
- mov.b32 %f48, %r30;
- mov.b32 %f47, %r31;
- mov.b32 %f46, %r32;
+$L__BB0_4:
+ bar.sync 0;
+ setp.lt.s32 %p5, %r62, 4;
+ @%p5 bra $L__BB0_8;
$L__BB0_5:
- add.f32 %f24, %f49, %f48;
- add.f32 %f25, %f24, %f47;
- add.f32 %f50, %f25, %f46;
+ shr.u32 %r9, %r62, 1;
+ setp.ge.u32 %p6, %r5, %r9;
+ @%p6 bra $L__BB0_7;
+
+ add.s32 %r60, %r9, %r6;
+ mul.wide.s32 %rd11, %r60, 4;
+ add.s64 %rd13, %rd7, %rd11;
+ ld.shared.f32 %f17, [%rd1];
+ ld.shared.f32 %f18, [%rd13];
+ add.f32 %f19, %f18, %f17;
+ st.shared.f32 [%rd1], %f19;
$L__BB0_7:
- mov.u32 %r41, %tid.z;
- mad.lo.s32 %r42, %r2, %r41, %r3;
- mov.u32 %r7, %tid.x;
- mad.lo.s32 %r8, %r42, %r1, %r7;
- mul.wide.u32 %rd8, %r8, 4;
- mov.u64 %rd9, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd9, %rd8;
- st.shared.f32 [%rd2], %f50;
bar.sync 0;
- clz.b32 %r43, %r1;
- mov.u32 %r44, 31;
- sub.s32 %r45, %r44, %r43;
- mov.u32 %r46, 1;
- shl.b32 %r51, %r46, %r45;
- setp.ge.u32 %p5, %r7, %r51;
- add.s32 %r47, %r51, %r7;
- setp.ge.u32 %p6, %r47, %r1;
- or.pred %p7, %p5, %p6;
- @%p7 bra $L__BB0_9;
+ setp.gt.u32 %p7, %r62, 7;
+ mov.u32 %r62, %r9;
+ @%p7 bra $L__BB0_5;
- add.s32 %r48, %r8, %r51;
- mul.wide.s32 %rd10, %r48, 4;
- add.s64 %rd12, %rd9, %rd10;
- ld.shared.f32 %f33, [%rd2];
- ld.shared.f32 %f34, [%rd12];
- add.f32 %f35, %f34, %f33;
- st.shared.f32 [%rd2], %f35;
+$L__BB0_8:
+ setp.ne.s32 %p8, %r5, 0;
+ mov.f32 %f24, 0f00000000;
+ @%p8 bra $L__BB0_11;
-$L__BB0_9:
+ ld.shared.f32 %f21, [%rd1];
+ add.f32 %f24, %f21, 0f00000000;
+ setp.lt.u32 %p9, %r1, 2;
+ @%p9 bra $L__BB0_11;
+
+ add.s32 %r61, %r6, 1;
+ mul.wide.u32 %rd14, %r61, 4;
+ add.s64 %rd16, %rd7, %rd14;
+ ld.shared.f32 %f22, [%rd16];
+ add.f32 %f24, %f24, %f22;
+
+$L__BB0_11:
bar.sync 0;
- setp.lt.s32 %p8, %r51, 4;
- @%p8 bra $L__BB0_13;
+ or.pred %p12, %p8, %p1;
+ @%p12 bra $L__BB0_13;
-$L__BB0_10:
- shr.u32 %r11, %r51, 1;
- setp.ge.u32 %p9, %r7, %r11;
- @%p9 bra $L__BB0_12;
-
- add.s32 %r49, %r11, %r8;
- mul.wide.s32 %rd13, %r49, 4;
- add.s64 %rd15, %rd9, %rd13;
- ld.shared.f32 %f36, [%rd2];
- ld.shared.f32 %f37, [%rd15];
- add.f32 %f38, %f37, %f36;
- st.shared.f32 [%rd2], %f38;
-
-$L__BB0_12:
- bar.sync 0;
- setp.gt.u32 %p10, %r51, 7;
- mov.u32 %r51, %r11;
- @%p10 bra $L__BB0_10;
+ cvta.to.global.u64 %rd17, %rd3;
+ mul.wide.s32 %rd18, %r4, 4;
+ add.s64 %rd19, %rd17, %rd18;
+ st.global.f32 [%rd19], %f24;
$L__BB0_13:
- setp.ne.s32 %p11, %r7, 0;
- mov.f32 %f51, 0f00000000;
- @%p11 bra $L__BB0_16;
-
- ld.shared.f32 %f40, [%rd2];
- add.f32 %f51, %f40, 0f00000000;
- setp.lt.u32 %p12, %r1, 2;
- @%p12 bra $L__BB0_16;
-
- add.s32 %r50, %r8, 1;
- mul.wide.u32 %rd16, %r50, 4;
- add.s64 %rd18, %rd9, %rd16;
- ld.shared.f32 %f41, [%rd18];
- add.f32 %f51, %f51, %f41;
-
-$L__BB0_16:
- bar.sync 0;
- or.pred %p15, %p11, %p1;
- @%p15 bra $L__BB0_18;
-
- cvta.to.global.u64 %rd19, %rd3;
- mul.wide.s32 %rd20, %r4, 4;
- add.s64 %rd21, %rd19, %rd20;
- st.global.f32 [%rd21], %f51;
-
-$L__BB0_18:
ret;
-
-$L__BB0_6:
- add.s32 %r39, %r4, %r5;
- shl.b32 %r40, %r39, 2;
- mul.wide.s32 %rd7, %r40, 4;
- add.s64 %rd6, %rd1, %rd7;
-
- ld.global.cs.v4.u32 {%r35,%r36,%r37,%r38}, [%rd6];
-
- mov.b32 %f26, %r35;
- add.f32 %f27, %f26, 0f00000000;
- mov.b32 %f28, %r36;
- add.f32 %f29, %f27, %f28;
- mov.b32 %f30, %r37;
- add.f32 %f31, %f29, %f30;
- mov.b32 %f32, %r38;
- add.f32 %f50, %f31, %f32;
- bra.uni $L__BB0_7;
}
15: ReshapeReduction.FusionReshapeReduction/45
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 0
gmem: 0
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 4, 4> T5) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0)) < 4))) {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0)) < 4))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
}
}
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T7[0], T8[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
T5[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T7[0];
}
}
__global__ void nvfuser_N(Tensor<float, 5, 5> T4, Tensor<float, 4, 4> T5) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
}
}
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T7[0], T8[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242))) {
T5[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T7[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -3,25 +3,25 @@
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
- if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0)) < 4))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
- loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0))]);
+ loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 4, 4> T6;
T6.set(float(0.000000000e+00f));
- if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242) && (((3 + (4 * ((nvfuser_index_t)threadIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0)) < 4))) {
- loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((4 * ((nvfuser_index_t)threadIdx.x)) + (4 * ((nvfuser_index_t)threadIdx.y))) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((4 * ((nvfuser_index_t)blockDim.x)) * i0))]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 242)) {
+ loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((4 * ((nvfuser_index_t)threadIdx.y)) + ((4 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T8[0]
= T8[0]
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_1911011nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_1911011nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_1911011nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1[40]
)
{
.reg .pred %p<16>;
.reg .f32 %f<52>;
.reg .b32 %r<52>;
.reg .b64 %rd<22>;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_1911011nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_1911011nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0];
mov.u32 %r1, %ntid.x;
mov.u32 %r28, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r28, %r3;
setp.gt.s32 %p1, %r4, 241;
@%p1 bra $L__BB0_2;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p2, %r5, 1;
@%p2 bra $L__BB0_6;
$L__BB0_2:
setp.lt.s32 %p3, %r4, 242;
mov.f32 %f46, 0f00000000;
mov.f32 %f47, 0f00000000;
mov.f32 %f48, 0f00000000;
mov.f32 %f49, 0f00000000;
@%p3 bra $L__BB0_3;
bra.uni $L__BB0_5;
$L__BB0_3:
mov.u32 %r6, %tid.x;
setp.gt.s32 %p4, %r6, 0;
@%p4 bra $L__BB0_5;
add.s32 %r33, %r4, %r6;
shl.b32 %r34, %r33, 2;
mul.wide.s32 %rd5, %r34, 4;
add.s64 %rd4, %rd1, %rd5;
// begin inline asm
ld.global.cs.v4.u32 {%r29,%r30,%r31,%r32}, [%rd4];
// end inline asm
mov.b32 %f23, %r29;
add.f32 %f49, %f23, 0f00000000;
mov.b32 %f48, %r30;
mov.b32 %f47, %r31;
mov.b32 %f46, %r32;
$L__BB0_5:
add.f32 %f24, %f49, %f48;
add.f32 %f25, %f24, %f47;
add.f32 %f50, %f25, %f46;
$L__BB0_7:
mov.u32 %r41, %tid.z;
mad.lo.s32 %r42, %r2, %r41, %r3;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r8, %r42, %r1, %r7;
mul.wide.u32 %rd8, %r8, 4;
mov.u64 %rd9, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_465d07a7_191105arrayE;
add.s64 %rd2, %rd9, %rd8;
st.shared.f32 [%rd2], %f50;
bar.sync 0;
clz.b32 %r43, %r1;
mov.u32 %r44, 31;
sub.s32 %r45, %r44, %r43;
mov.u32 %r46, 1;
shl.b32 %r51, %r46, %r45;
setp.ge.u32 %p5, %r7, %r51;
add.s32 %r47, %r51, %r7;
setp.ge.u32 %p6, %r47, %r1;
or.pred %p7, %p5, %p6;
@%p7 bra $L__BB0_9;
add.s32 %r48, %r8, %r51;
mul.wide.s32 %rd10, %r48, 4;
add.s64 %rd12, %rd9, %rd10;
ld.shared.f32 %f33, [%rd2];
ld.shared.f32 %f34, [%rd12];
add.f32 %f35, %f34, %f33;
st.shared.f32 [%rd2], %f35;
$L__BB0_9:
bar.sync 0;
setp.lt.s32 %p8, %r51, 4;
@%p8 bra $L__BB0_13;
$L__BB0_10:
shr.u32 %r11, %r51, 1;
setp.ge.u32 %p9, %r7, %r11;
@%p9 bra $L__BB0_12;
add.s32 %r49, %r11, %r8;
mul.wide.s32 %rd13, %r49, 4;
add.s64 %rd15, %rd9, %rd13;
ld.shared.f32 %f36, [%rd2];
ld.shared.f32 %f37, [%rd15];
add.f32 %f38, %f37, %f36;
st.shared.f32 [%rd2], %f38;
$L__BB0_12:
bar.sync 0;
setp.gt.u32 %p10, %r51, 7;
mov.u32 %r51, %r11;
@%p10 bra $L__BB0_10;
$L__BB0_13:
setp.ne.s32 %p11, %r7, 0;
mov.f32 %f51, 0f00000000;
@%p11 bra $L__BB0_16;
ld.shared.f32 %f40, [%rd2];
add.f32 %f51, %f40, 0f00000000;
setp.lt.u32 %p12, %r1, 2;
@%p12 bra $L__BB0_16;
add.s32 %r50, %r8, 1;
mul.wide.u32 %rd16, %r50, 4;
add.s64 %rd18, %rd9, %rd16;
ld.shared.f32 %f41, [%rd18];
add.f32 %f51, %f51, %f41;
$L__BB0_16:
bar.sync 0;
or.pred %p15, %p11, %p1;
@%p15 bra $L__BB0_18;
cvta.to.global.u64 %rd19, %rd3;
mul.wide.s32 %rd20, %r4, 4;
add.s64 %rd21, %rd19, %rd20;
st.global.f32 [%rd21], %f51;
$L__BB0_18:
ret;
$L__BB0_6:
add.s32 %r39, %r4, %r5;
shl.b32 %r40, %r39, 2;
mul.wide.s32 %rd7, %r40, 4;
add.s64 %rd6, %rd1, %rd7;
// begin inline asm
ld.global.cs.v4.u32 {%r35,%r36,%r37,%r38}, [%rd6];
// end inline asm
mov.b32 %f26, %r35;
add.f32 %f27, %f26, 0f00000000;
mov.b32 %f28, %r36;
add.f32 %f29, %f27, %f28;
mov.b32 %f30, %r37;
add.f32 %f31, %f29, %f30;
mov.b32 %f32, %r38;
add.f32 %f50, %f31, %f32;
bra.uni $L__BB0_7;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_1601111nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_1601111nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0[48],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_1601111nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1[40]
)
{
.reg .pred %p<13>;
.reg .f32 %f<25>;
.reg .b32 %r<63>;
.reg .b64 %rd<20>;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_1601111nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_1601111nvfuser_452ENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0];
mov.u32 %r1, %ntid.x;
mov.u32 %r46, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r46, %r3;
setp.gt.s32 %p1, %r4, 241;
mov.f32 %f23, 0f00000000;
@%p1 bra $L__BB0_2;
shl.b32 %r51, %r4, 2;
mul.wide.s32 %rd5, %r51, 4;
add.s64 %rd4, %rd2, %rd5;
// begin inline asm
ld.global.cs.v4.u32 {%r47,%r48,%r49,%r50}, [%rd4];
// end inline asm
mov.b32 %f7, %r47;
add.f32 %f8, %f7, 0f00000000;
mov.b32 %f9, %r48;
add.f32 %f10, %f8, %f9;
mov.b32 %f11, %r49;
add.f32 %f12, %f10, %f11;
mov.b32 %f13, %r50;
add.f32 %f23, %f12, %f13;
$L__BB0_2:
mov.u32 %r52, %tid.z;
mad.lo.s32 %r53, %r2, %r52, %r3;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r6, %r53, %r1, %r5;
mul.wide.u32 %rd6, %r6, 4;
mov.u64 %rd7, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_452_cu_3024a27b_160115arrayE;
add.s64 %rd1, %rd7, %rd6;
st.shared.f32 [%rd1], %f23;
bar.sync 0;
clz.b32 %r54, %r1;
mov.u32 %r55, 31;
sub.s32 %r56, %r55, %r54;
mov.u32 %r57, 1;
shl.b32 %r62, %r57, %r56;
setp.ge.u32 %p2, %r5, %r62;
add.s32 %r58, %r62, %r5;
setp.ge.u32 %p3, %r58, %r1;
or.pred %p4, %p2, %p3;
@%p4 bra $L__BB0_4;
add.s32 %r59, %r6, %r62;
mul.wide.s32 %rd8, %r59, 4;
add.s64 %rd10, %rd7, %rd8;
ld.shared.f32 %f14, [%rd1];
ld.shared.f32 %f15, [%rd10];
add.f32 %f16, %f15, %f14;
st.shared.f32 [%rd1], %f16;
$L__BB0_4:
bar.sync 0;
setp.lt.s32 %p5, %r62, 4;
@%p5 bra $L__BB0_8;
$L__BB0_5:
shr.u32 %r9, %r62, 1;
setp.ge.u32 %p6, %r5, %r9;
@%p6 bra $L__BB0_7;
add.s32 %r60, %r9, %r6;
mul.wide.s32 %rd11, %r60, 4;
add.s64 %rd13, %rd7, %rd11;
ld.shared.f32 %f17, [%rd1];
ld.shared.f32 %f18, [%rd13];
add.f32 %f19, %f18, %f17;
st.shared.f32 [%rd1], %f19;
$L__BB0_7:
bar.sync 0;
setp.gt.u32 %p7, %r62, 7;
mov.u32 %r62, %r9;
@%p7 bra $L__BB0_5;
$L__BB0_8:
setp.ne.s32 %p8, %r5, 0;
mov.f32 %f24, 0f00000000;
@%p8 bra $L__BB0_11;
ld.shared.f32 %f21, [%rd1];
add.f32 %f24, %f21, 0f00000000;
setp.lt.u32 %p9, %r1, 2;
@%p9 bra $L__BB0_11;
add.s32 %r61, %r6, 1;
mul.wide.u32 %rd14, %r61, 4;
add.s64 %rd16, %rd7, %rd14;
ld.shared.f32 %f22, [%rd16];
add.f32 %f24, %f24, %f22;
$L__BB0_11:
bar.sync 0;
or.pred %p12, %p8, %p1;
@%p12 bra $L__BB0_13;
cvta.to.global.u64 %rd17, %rd3;
mul.wide.s32 %rd18, %r4, 4;
add.s64 %rd19, %rd17, %rd18;
st.global.f32 [%rd19], %f24;
$L__BB0_13:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -18,160 +18,122 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0[48],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1[40]
)
{
- .reg .pred %p<16>;
- .reg .f32 %f<52>;
- .reg .b32 %r<52>;
- .reg .b64 %rd<22>;
+ .reg .pred %p<13>;
+ .reg .f32 %f<25>;
+ .reg .b32 %r<63>;
+ .reg .b64 %rd<20>;
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_1];
- ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0];
+ ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi5ELi5EEENS0_IfLi4ELi4EEE_param_0];
mov.u32 %r1, %ntid.x;
- mov.u32 %r28, %ctaid.x;
+ mov.u32 %r46, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
- mad.lo.s32 %r4, %r2, %r28, %r3;
+ mad.lo.s32 %r4, %r2, %r46, %r3;
setp.gt.s32 %p1, %r4, 241;
+ mov.f32 %f23, 0f00000000;
@%p1 bra $L__BB0_2;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p2, %r5, 1;
- @%p2 bra $L__BB0_6;
+ shl.b32 %r51, %r4, 2;
+ mul.wide.s32 %rd5, %r51, 4;
+ add.s64 %rd4, %rd2, %rd5;
+
+ ld.global.cs.v4.u32 {%r47,%r48,%r49,%r50}, [%rd4];
+
+ mov.b32 %f7, %r47;
+ add.f32 %f8, %f7, 0f00000000;
+ mov.b32 %f9, %r48;
+ add.f32 %f10, %f8, %f9;
+ mov.b32 %f11, %r49;
+ add.f32 %f12, %f10, %f11;
+ mov.b32 %f13, %r50;
+ add.f32 %f23, %f12, %f13;
$L__BB0_2:
- setp.lt.s32 %p3, %r4, 242;
- mov.f32 %f46, 0f00000000;
- mov.f32 %f47, 0f00000000;
- mov.f32 %f48, 0f00000000;
- mov.f32 %f49, 0f00000000;
- @%p3 bra $L__BB0_3;
- bra.uni $L__BB0_5;
+ mov.u32 %r52, %tid.z;
+ mad.lo.s32 %r53, %r2, %r52, %r3;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r6, %r53, %r1, %r5;
+ mul.wide.u32 %rd6, %r6, 4;
+ mov.u64 %rd7, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd7, %rd6;
+ st.shared.f32 [%rd1], %f23;
+ bar.sync 0;
+ clz.b32 %r54, %r1;
+ mov.u32 %r55, 31;
+ sub.s32 %r56, %r55, %r54;
+ mov.u32 %r57, 1;
+ shl.b32 %r62, %r57, %r56;
+ setp.ge.u32 %p2, %r5, %r62;
+ add.s32 %r58, %r62, %r5;
+ setp.ge.u32 %p3, %r58, %r1;
+ or.pred %p4, %p2, %p3;
+ @%p4 bra $L__BB0_4;
-$L__BB0_3:
- mov.u32 %r6, %tid.x;
- setp.gt.s32 %p4, %r6, 0;
- @%p4 bra $L__BB0_5;
+ add.s32 %r59, %r6, %r62;
+ mul.wide.s32 %rd8, %r59, 4;
+ add.s64 %rd10, %rd7, %rd8;
+ ld.shared.f32 %f14, [%rd1];
+ ld.shared.f32 %f15, [%rd10];
+ add.f32 %f16, %f15, %f14;
+ st.shared.f32 [%rd1], %f16;
- add.s32 %r33, %r4, %r6;
- shl.b32 %r34, %r33, 2;
- mul.wide.s32 %rd5, %r34, 4;
- add.s64 %rd4, %rd1, %rd5;
-
- ld.global.cs.v4.u32 {%r29,%r30,%r31,%r32}, [%rd4];
-
- mov.b32 %f23, %r29;
- add.f32 %f49, %f23, 0f00000000;
- mov.b32 %f48, %r30;
- mov.b32 %f47, %r31;
- mov.b32 %f46, %r32;
+$L__BB0_4:
+ bar.sync 0;
+ setp.lt.s32 %p5, %r62, 4;
+ @%p5 bra $L__BB0_8;
$L__BB0_5:
- add.f32 %f24, %f49, %f48;
- add.f32 %f25, %f24, %f47;
- add.f32 %f50, %f25, %f46;
+ shr.u32 %r9, %r62, 1;
+ setp.ge.u32 %p6, %r5, %r9;
+ @%p6 bra $L__BB0_7;
+
+ add.s32 %r60, %r9, %r6;
+ mul.wide.s32 %rd11, %r60, 4;
+ add.s64 %rd13, %rd7, %rd11;
+ ld.shared.f32 %f17, [%rd1];
+ ld.shared.f32 %f18, [%rd13];
+ add.f32 %f19, %f18, %f17;
+ st.shared.f32 [%rd1], %f19;
$L__BB0_7:
- mov.u32 %r41, %tid.z;
- mad.lo.s32 %r42, %r2, %r41, %r3;
- mov.u32 %r7, %tid.x;
- mad.lo.s32 %r8, %r42, %r1, %r7;
- mul.wide.u32 %rd8, %r8, 4;
- mov.u64 %rd9, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd9, %rd8;
- st.shared.f32 [%rd2], %f50;
bar.sync 0;
- clz.b32 %r43, %r1;
- mov.u32 %r44, 31;
- sub.s32 %r45, %r44, %r43;
- mov.u32 %r46, 1;
- shl.b32 %r51, %r46, %r45;
- setp.ge.u32 %p5, %r7, %r51;
- add.s32 %r47, %r51, %r7;
- setp.ge.u32 %p6, %r47, %r1;
- or.pred %p7, %p5, %p6;
- @%p7 bra $L__BB0_9;
+ setp.gt.u32 %p7, %r62, 7;
+ mov.u32 %r62, %r9;
+ @%p7 bra $L__BB0_5;
- add.s32 %r48, %r8, %r51;
- mul.wide.s32 %rd10, %r48, 4;
- add.s64 %rd12, %rd9, %rd10;
- ld.shared.f32 %f33, [%rd2];
- ld.shared.f32 %f34, [%rd12];
- add.f32 %f35, %f34, %f33;
- st.shared.f32 [%rd2], %f35;
+$L__BB0_8:
+ setp.ne.s32 %p8, %r5, 0;
+ mov.f32 %f24, 0f00000000;
+ @%p8 bra $L__BB0_11;
-$L__BB0_9:
+ ld.shared.f32 %f21, [%rd1];
+ add.f32 %f24, %f21, 0f00000000;
+ setp.lt.u32 %p9, %r1, 2;
+ @%p9 bra $L__BB0_11;
+
+ add.s32 %r61, %r6, 1;
+ mul.wide.u32 %rd14, %r61, 4;
+ add.s64 %rd16, %rd7, %rd14;
+ ld.shared.f32 %f22, [%rd16];
+ add.f32 %f24, %f24, %f22;
+
+$L__BB0_11:
bar.sync 0;
- setp.lt.s32 %p8, %r51, 4;
- @%p8 bra $L__BB0_13;
+ or.pred %p12, %p8, %p1;
+ @%p12 bra $L__BB0_13;
-$L__BB0_10:
- shr.u32 %r11, %r51, 1;
- setp.ge.u32 %p9, %r7, %r11;
- @%p9 bra $L__BB0_12;
-
- add.s32 %r49, %r11, %r8;
- mul.wide.s32 %rd13, %r49, 4;
- add.s64 %rd15, %rd9, %rd13;
- ld.shared.f32 %f36, [%rd2];
- ld.shared.f32 %f37, [%rd15];
- add.f32 %f38, %f37, %f36;
- st.shared.f32 [%rd2], %f38;
-
-$L__BB0_12:
- bar.sync 0;
- setp.gt.u32 %p10, %r51, 7;
- mov.u32 %r51, %r11;
- @%p10 bra $L__BB0_10;
+ cvta.to.global.u64 %rd17, %rd3;
+ mul.wide.s32 %rd18, %r4, 4;
+ add.s64 %rd19, %rd17, %rd18;
+ st.global.f32 [%rd19], %f24;
$L__BB0_13:
- setp.ne.s32 %p11, %r7, 0;
- mov.f32 %f51, 0f00000000;
- @%p11 bra $L__BB0_16;
-
- ld.shared.f32 %f40, [%rd2];
- add.f32 %f51, %f40, 0f00000000;
- setp.lt.u32 %p12, %r1, 2;
- @%p12 bra $L__BB0_16;
-
- add.s32 %r50, %r8, 1;
- mul.wide.u32 %rd16, %r50, 4;
- add.s64 %rd18, %rd9, %rd16;
- ld.shared.f32 %f41, [%rd18];
- add.f32 %f51, %f51, %f41;
-
-$L__BB0_16:
- bar.sync 0;
- or.pred %p15, %p11, %p1;
- @%p15 bra $L__BB0_18;
-
- cvta.to.global.u64 %rd19, %rd3;
- mul.wide.s32 %rd20, %r4, 4;
- add.s64 %rd21, %rd19, %rd20;
- st.global.f32 [%rd21], %f51;
-
-$L__BB0_18:
ret;
-
-$L__BB0_6:
- add.s32 %r39, %r4, %r5;
- shl.b32 %r40, %r39, 2;
- mul.wide.s32 %rd7, %r40, 4;
- add.s64 %rd6, %rd1, %rd7;
-
- ld.global.cs.v4.u32 {%r35,%r36,%r37,%r38}, [%rd6];
-
- mov.b32 %f26, %r35;
- add.f32 %f27, %f26, 0f00000000;
- mov.b32 %f28, %r36;
- add.f32 %f29, %f27, %f28;
- mov.b32 %f30, %r37;
- add.f32 %f31, %f29, %f30;
- mov.b32 %f32, %r38;
- add.f32 %f50, %f31, %f32;
- bra.uni $L__BB0_7;
}
16: ReshapeReduction.FusionReshapeReduction/56
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 0
gmem: 0
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 2, 2> T5) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454) && (((1 + (2 * ((nvfuser_index_t)threadIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0)) < 2))) {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454) && (((1 + (2 * ((nvfuser_index_t)threadIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0)) < 2))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
}
}
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T7[0], T8[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T5[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T7[0];
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 2, 2> T5) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
}
}
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T7[0], T8[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T5[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T7[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -3,25 +3,25 @@
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
- if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454) && (((1 + (2 * ((nvfuser_index_t)threadIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0)) < 2))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
- loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0))]);
+ loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
- if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454) && (((1 + (2 * ((nvfuser_index_t)threadIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0)) < 2))) {
- loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0))]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
+ loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_1911011nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_1911011nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_1911011nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1[24]
)
{
.reg .pred %p<16>;
.reg .f32 %f<34>;
.reg .b32 %r<40>;
.reg .b64 %rd<22>;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_1911011nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_1911011nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0];
mov.u32 %r1, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r20, %r3;
setp.gt.s32 %p1, %r4, 27453;
@%p1 bra $L__BB0_2;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p2, %r5, 1;
@%p2 bra $L__BB0_6;
$L__BB0_2:
setp.lt.s32 %p3, %r4, 27454;
mov.f32 %f30, 0f00000000;
mov.f32 %f31, 0f00000000;
@%p3 bra $L__BB0_3;
bra.uni $L__BB0_5;
$L__BB0_3:
mov.u32 %r6, %tid.x;
setp.gt.s32 %p4, %r6, 0;
@%p4 bra $L__BB0_5;
add.s32 %r23, %r4, %r6;
shl.b32 %r24, %r23, 1;
mul.wide.s32 %rd5, %r24, 4;
add.s64 %rd4, %rd1, %rd5;
// begin inline asm
ld.global.cs.v2.u32 {%r21,%r22}, [%rd4];
// end inline asm
mov.b32 %f15, %r21;
add.f32 %f31, %f15, 0f00000000;
mov.b32 %f30, %r22;
$L__BB0_5:
add.f32 %f32, %f31, %f30;
$L__BB0_7:
mov.u32 %r29, %tid.z;
mad.lo.s32 %r30, %r2, %r29, %r3;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r8, %r30, %r1, %r7;
mul.wide.u32 %rd8, %r8, 4;
mov.u64 %rd9, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_638b7348_191105arrayE;
add.s64 %rd2, %rd9, %rd8;
st.shared.f32 [%rd2], %f32;
bar.sync 0;
clz.b32 %r31, %r1;
mov.u32 %r32, 31;
sub.s32 %r33, %r32, %r31;
mov.u32 %r34, 1;
shl.b32 %r39, %r34, %r33;
setp.ge.u32 %p5, %r7, %r39;
add.s32 %r35, %r39, %r7;
setp.ge.u32 %p6, %r35, %r1;
or.pred %p7, %p5, %p6;
@%p7 bra $L__BB0_9;
add.s32 %r36, %r8, %r39;
mul.wide.s32 %rd10, %r36, 4;
add.s64 %rd12, %rd9, %rd10;
ld.shared.f32 %f19, [%rd2];
ld.shared.f32 %f20, [%rd12];
add.f32 %f21, %f20, %f19;
st.shared.f32 [%rd2], %f21;
$L__BB0_9:
bar.sync 0;
setp.lt.s32 %p8, %r39, 4;
@%p8 bra $L__BB0_13;
$L__BB0_10:
shr.u32 %r11, %r39, 1;
setp.ge.u32 %p9, %r7, %r11;
@%p9 bra $L__BB0_12;
add.s32 %r37, %r11, %r8;
mul.wide.s32 %rd13, %r37, 4;
add.s64 %rd15, %rd9, %rd13;
ld.shared.f32 %f22, [%rd2];
ld.shared.f32 %f23, [%rd15];
add.f32 %f24, %f23, %f22;
st.shared.f32 [%rd2], %f24;
$L__BB0_12:
bar.sync 0;
setp.gt.u32 %p10, %r39, 7;
mov.u32 %r39, %r11;
@%p10 bra $L__BB0_10;
$L__BB0_13:
setp.ne.s32 %p11, %r7, 0;
mov.f32 %f33, 0f00000000;
@%p11 bra $L__BB0_16;
ld.shared.f32 %f26, [%rd2];
add.f32 %f33, %f26, 0f00000000;
setp.lt.u32 %p12, %r1, 2;
@%p12 bra $L__BB0_16;
add.s32 %r38, %r8, 1;
mul.wide.u32 %rd16, %r38, 4;
add.s64 %rd18, %rd9, %rd16;
ld.shared.f32 %f27, [%rd18];
add.f32 %f33, %f33, %f27;
$L__BB0_16:
bar.sync 0;
or.pred %p15, %p11, %p1;
@%p15 bra $L__BB0_18;
cvta.to.global.u64 %rd19, %rd3;
mul.wide.s32 %rd20, %r4, 4;
add.s64 %rd21, %rd19, %rd20;
st.global.f32 [%rd21], %f33;
$L__BB0_18:
ret;
$L__BB0_6:
add.s32 %r27, %r4, %r5;
shl.b32 %r28, %r27, 1;
mul.wide.s32 %rd7, %r28, 4;
add.s64 %rd6, %rd1, %rd7;
// begin inline asm
ld.global.cs.v2.u32 {%r25,%r26}, [%rd6];
// end inline asm
mov.b32 %f16, %r25;
add.f32 %f17, %f16, 0f00000000;
mov.b32 %f18, %r26;
add.f32 %f32, %f17, %f18;
bra.uni $L__BB0_7;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_1601111nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_1601111nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_1601111nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1[24]
)
{
.reg .pred %p<13>;
.reg .f32 %f<21>;
.reg .b32 %r<45>;
.reg .b64 %rd<20>;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_1601111nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_1601111nvfuser_467ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0];
mov.u32 %r1, %ntid.x;
mov.u32 %r30, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r30, %r3;
setp.gt.s32 %p1, %r4, 27453;
mov.f32 %f19, 0f00000000;
@%p1 bra $L__BB0_2;
shl.b32 %r33, %r4, 1;
mul.wide.s32 %rd5, %r33, 4;
add.s64 %rd4, %rd2, %rd5;
// begin inline asm
ld.global.cs.v2.u32 {%r31,%r32}, [%rd4];
// end inline asm
mov.b32 %f7, %r31;
add.f32 %f8, %f7, 0f00000000;
mov.b32 %f9, %r32;
add.f32 %f19, %f8, %f9;
$L__BB0_2:
mov.u32 %r34, %tid.z;
mad.lo.s32 %r35, %r2, %r34, %r3;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r6, %r35, %r1, %r5;
mul.wide.u32 %rd6, %r6, 4;
mov.u64 %rd7, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_467_cu_04f1218d_160115arrayE;
add.s64 %rd1, %rd7, %rd6;
st.shared.f32 [%rd1], %f19;
bar.sync 0;
clz.b32 %r36, %r1;
mov.u32 %r37, 31;
sub.s32 %r38, %r37, %r36;
mov.u32 %r39, 1;
shl.b32 %r44, %r39, %r38;
setp.ge.u32 %p2, %r5, %r44;
add.s32 %r40, %r44, %r5;
setp.ge.u32 %p3, %r40, %r1;
or.pred %p4, %p2, %p3;
@%p4 bra $L__BB0_4;
add.s32 %r41, %r6, %r44;
mul.wide.s32 %rd8, %r41, 4;
add.s64 %rd10, %rd7, %rd8;
ld.shared.f32 %f10, [%rd1];
ld.shared.f32 %f11, [%rd10];
add.f32 %f12, %f11, %f10;
st.shared.f32 [%rd1], %f12;
$L__BB0_4:
bar.sync 0;
setp.lt.s32 %p5, %r44, 4;
@%p5 bra $L__BB0_8;
$L__BB0_5:
shr.u32 %r9, %r44, 1;
setp.ge.u32 %p6, %r5, %r9;
@%p6 bra $L__BB0_7;
add.s32 %r42, %r9, %r6;
mul.wide.s32 %rd11, %r42, 4;
add.s64 %rd13, %rd7, %rd11;
ld.shared.f32 %f13, [%rd1];
ld.shared.f32 %f14, [%rd13];
add.f32 %f15, %f14, %f13;
st.shared.f32 [%rd1], %f15;
$L__BB0_7:
bar.sync 0;
setp.gt.u32 %p7, %r44, 7;
mov.u32 %r44, %r9;
@%p7 bra $L__BB0_5;
$L__BB0_8:
setp.ne.s32 %p8, %r5, 0;
mov.f32 %f20, 0f00000000;
@%p8 bra $L__BB0_11;
ld.shared.f32 %f17, [%rd1];
add.f32 %f20, %f17, 0f00000000;
setp.lt.u32 %p9, %r1, 2;
@%p9 bra $L__BB0_11;
add.s32 %r43, %r6, 1;
mul.wide.u32 %rd14, %r43, 4;
add.s64 %rd16, %rd7, %rd14;
ld.shared.f32 %f18, [%rd16];
add.f32 %f20, %f20, %f18;
$L__BB0_11:
bar.sync 0;
or.pred %p12, %p8, %p1;
@%p12 bra $L__BB0_13;
cvta.to.global.u64 %rd17, %rd3;
mul.wide.s32 %rd18, %r4, 4;
add.s64 %rd19, %rd17, %rd18;
st.global.f32 [%rd19], %f20;
$L__BB0_13:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -18,150 +18,118 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1[24]
)
{
- .reg .pred %p<16>;
- .reg .f32 %f<34>;
- .reg .b32 %r<40>;
- .reg .b64 %rd<22>;
+ .reg .pred %p<13>;
+ .reg .f32 %f<21>;
+ .reg .b32 %r<45>;
+ .reg .b64 %rd<20>;
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1];
- ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0];
+ ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0];
mov.u32 %r1, %ntid.x;
- mov.u32 %r20, %ctaid.x;
+ mov.u32 %r30, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
- mad.lo.s32 %r4, %r2, %r20, %r3;
+ mad.lo.s32 %r4, %r2, %r30, %r3;
setp.gt.s32 %p1, %r4, 27453;
+ mov.f32 %f19, 0f00000000;
@%p1 bra $L__BB0_2;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p2, %r5, 1;
- @%p2 bra $L__BB0_6;
+ shl.b32 %r33, %r4, 1;
+ mul.wide.s32 %rd5, %r33, 4;
+ add.s64 %rd4, %rd2, %rd5;
+
+ ld.global.cs.v2.u32 {%r31,%r32}, [%rd4];
+
+ mov.b32 %f7, %r31;
+ add.f32 %f8, %f7, 0f00000000;
+ mov.b32 %f9, %r32;
+ add.f32 %f19, %f8, %f9;
$L__BB0_2:
- setp.lt.s32 %p3, %r4, 27454;
- mov.f32 %f30, 0f00000000;
- mov.f32 %f31, 0f00000000;
- @%p3 bra $L__BB0_3;
- bra.uni $L__BB0_5;
+ mov.u32 %r34, %tid.z;
+ mad.lo.s32 %r35, %r2, %r34, %r3;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r6, %r35, %r1, %r5;
+ mul.wide.u32 %rd6, %r6, 4;
+ mov.u64 %rd7, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd7, %rd6;
+ st.shared.f32 [%rd1], %f19;
+ bar.sync 0;
+ clz.b32 %r36, %r1;
+ mov.u32 %r37, 31;
+ sub.s32 %r38, %r37, %r36;
+ mov.u32 %r39, 1;
+ shl.b32 %r44, %r39, %r38;
+ setp.ge.u32 %p2, %r5, %r44;
+ add.s32 %r40, %r44, %r5;
+ setp.ge.u32 %p3, %r40, %r1;
+ or.pred %p4, %p2, %p3;
+ @%p4 bra $L__BB0_4;
-$L__BB0_3:
- mov.u32 %r6, %tid.x;
- setp.gt.s32 %p4, %r6, 0;
- @%p4 bra $L__BB0_5;
+ add.s32 %r41, %r6, %r44;
+ mul.wide.s32 %rd8, %r41, 4;
+ add.s64 %rd10, %rd7, %rd8;
+ ld.shared.f32 %f10, [%rd1];
+ ld.shared.f32 %f11, [%rd10];
+ add.f32 %f12, %f11, %f10;
+ st.shared.f32 [%rd1], %f12;
- add.s32 %r23, %r4, %r6;
- shl.b32 %r24, %r23, 1;
- mul.wide.s32 %rd5, %r24, 4;
- add.s64 %rd4, %rd1, %rd5;
-
- ld.global.cs.v2.u32 {%r21,%r22}, [%rd4];
-
- mov.b32 %f15, %r21;
- add.f32 %f31, %f15, 0f00000000;
- mov.b32 %f30, %r22;
+$L__BB0_4:
+ bar.sync 0;
+ setp.lt.s32 %p5, %r44, 4;
+ @%p5 bra $L__BB0_8;
$L__BB0_5:
- add.f32 %f32, %f31, %f30;
+ shr.u32 %r9, %r44, 1;
+ setp.ge.u32 %p6, %r5, %r9;
+ @%p6 bra $L__BB0_7;
+
+ add.s32 %r42, %r9, %r6;
+ mul.wide.s32 %rd11, %r42, 4;
+ add.s64 %rd13, %rd7, %rd11;
+ ld.shared.f32 %f13, [%rd1];
+ ld.shared.f32 %f14, [%rd13];
+ add.f32 %f15, %f14, %f13;
+ st.shared.f32 [%rd1], %f15;
$L__BB0_7:
- mov.u32 %r29, %tid.z;
- mad.lo.s32 %r30, %r2, %r29, %r3;
- mov.u32 %r7, %tid.x;
- mad.lo.s32 %r8, %r30, %r1, %r7;
- mul.wide.u32 %rd8, %r8, 4;
- mov.u64 %rd9, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd9, %rd8;
- st.shared.f32 [%rd2], %f32;
bar.sync 0;
- clz.b32 %r31, %r1;
- mov.u32 %r32, 31;
- sub.s32 %r33, %r32, %r31;
- mov.u32 %r34, 1;
- shl.b32 %r39, %r34, %r33;
- setp.ge.u32 %p5, %r7, %r39;
- add.s32 %r35, %r39, %r7;
- setp.ge.u32 %p6, %r35, %r1;
- or.pred %p7, %p5, %p6;
- @%p7 bra $L__BB0_9;
+ setp.gt.u32 %p7, %r44, 7;
+ mov.u32 %r44, %r9;
+ @%p7 bra $L__BB0_5;
- add.s32 %r36, %r8, %r39;
- mul.wide.s32 %rd10, %r36, 4;
- add.s64 %rd12, %rd9, %rd10;
- ld.shared.f32 %f19, [%rd2];
- ld.shared.f32 %f20, [%rd12];
- add.f32 %f21, %f20, %f19;
- st.shared.f32 [%rd2], %f21;
+$L__BB0_8:
+ setp.ne.s32 %p8, %r5, 0;
+ mov.f32 %f20, 0f00000000;
+ @%p8 bra $L__BB0_11;
-$L__BB0_9:
+ ld.shared.f32 %f17, [%rd1];
+ add.f32 %f20, %f17, 0f00000000;
+ setp.lt.u32 %p9, %r1, 2;
+ @%p9 bra $L__BB0_11;
+
+ add.s32 %r43, %r6, 1;
+ mul.wide.u32 %rd14, %r43, 4;
+ add.s64 %rd16, %rd7, %rd14;
+ ld.shared.f32 %f18, [%rd16];
+ add.f32 %f20, %f20, %f18;
+
+$L__BB0_11:
bar.sync 0;
- setp.lt.s32 %p8, %r39, 4;
- @%p8 bra $L__BB0_13;
+ or.pred %p12, %p8, %p1;
+ @%p12 bra $L__BB0_13;
-$L__BB0_10:
- shr.u32 %r11, %r39, 1;
- setp.ge.u32 %p9, %r7, %r11;
- @%p9 bra $L__BB0_12;
-
- add.s32 %r37, %r11, %r8;
- mul.wide.s32 %rd13, %r37, 4;
- add.s64 %rd15, %rd9, %rd13;
- ld.shared.f32 %f22, [%rd2];
- ld.shared.f32 %f23, [%rd15];
- add.f32 %f24, %f23, %f22;
- st.shared.f32 [%rd2], %f24;
-
-$L__BB0_12:
- bar.sync 0;
- setp.gt.u32 %p10, %r39, 7;
- mov.u32 %r39, %r11;
- @%p10 bra $L__BB0_10;
+ cvta.to.global.u64 %rd17, %rd3;
+ mul.wide.s32 %rd18, %r4, 4;
+ add.s64 %rd19, %rd17, %rd18;
+ st.global.f32 [%rd19], %f20;
$L__BB0_13:
- setp.ne.s32 %p11, %r7, 0;
- mov.f32 %f33, 0f00000000;
- @%p11 bra $L__BB0_16;
-
- ld.shared.f32 %f26, [%rd2];
- add.f32 %f33, %f26, 0f00000000;
- setp.lt.u32 %p12, %r1, 2;
- @%p12 bra $L__BB0_16;
-
- add.s32 %r38, %r8, 1;
- mul.wide.u32 %rd16, %r38, 4;
- add.s64 %rd18, %rd9, %rd16;
- ld.shared.f32 %f27, [%rd18];
- add.f32 %f33, %f33, %f27;
-
-$L__BB0_16:
- bar.sync 0;
- or.pred %p15, %p11, %p1;
- @%p15 bra $L__BB0_18;
-
- cvta.to.global.u64 %rd19, %rd3;
- mul.wide.s32 %rd20, %r4, 4;
- add.s64 %rd21, %rd19, %rd20;
- st.global.f32 [%rd21], %f33;
-
-$L__BB0_18:
ret;
-
-$L__BB0_6:
- add.s32 %r27, %r4, %r5;
- shl.b32 %r28, %r27, 1;
- mul.wide.s32 %rd7, %r28, 4;
- add.s64 %rd6, %rd1, %rd7;
-
- ld.global.cs.v2.u32 {%r25,%r26}, [%rd6];
-
- mov.b32 %f16, %r25;
- add.f32 %f17, %f16, 0f00000000;
- mov.b32 %f18, %r26;
- add.f32 %f32, %f17, %f18;
- bra.uni $L__BB0_7;
}
17: ReshapeReduction.FusionReshapeReduction/58
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 0
gmem: 0
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 2, 2> T5) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454) && (((1 + (2 * ((nvfuser_index_t)threadIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0)) < 2))) {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454) && (((1 + (2 * ((nvfuser_index_t)threadIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0)) < 2))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
}
}
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T7[0], T8[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T5[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T7[0];
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T4, Tensor<float, 2, 2> T5) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
}
}
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T7[0], T8[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454))) {
T5[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T7[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -3,25 +3,25 @@
void* shared_mem = array;
Array<float, 1, 1> T8;
T8[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(1, ((nvfuser_index_t)blockDim.x))); ++i0) {
- if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454) && (((1 + (2 * ((nvfuser_index_t)threadIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0)) < 2))) {
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
- loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0))]);
+ loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
+ T6[i1];
}
} else {
Array<float, 2, 2> T6;
T6.set(float(0.000000000e+00f));
- if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454) && (((1 + (2 * ((nvfuser_index_t)threadIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0)) < 2))) {
- loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((((2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x))) + ((2 * ((nvfuser_index_t)blockDim.x)) * i0))]);
+ if (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 27454)) {
+ loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T6[0], &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.x)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 2; ++i1) {
T8[0]
= T8[0]
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_1911011nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_1911011nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_1911011nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1[24]
)
{
.reg .pred %p<16>;
.reg .f32 %f<34>;
.reg .b32 %r<40>;
.reg .b64 %rd<22>;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_1911011nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_1911011nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0];
mov.u32 %r1, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r20, %r3;
setp.gt.s32 %p1, %r4, 27453;
@%p1 bra $L__BB0_2;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p2, %r5, 1;
@%p2 bra $L__BB0_6;
$L__BB0_2:
setp.lt.s32 %p3, %r4, 27454;
mov.f32 %f30, 0f00000000;
mov.f32 %f31, 0f00000000;
@%p3 bra $L__BB0_3;
bra.uni $L__BB0_5;
$L__BB0_3:
mov.u32 %r6, %tid.x;
setp.gt.s32 %p4, %r6, 0;
@%p4 bra $L__BB0_5;
add.s32 %r23, %r4, %r6;
shl.b32 %r24, %r23, 1;
mul.wide.s32 %rd5, %r24, 4;
add.s64 %rd4, %rd1, %rd5;
// begin inline asm
ld.global.cs.v2.u32 {%r21,%r22}, [%rd4];
// end inline asm
mov.b32 %f15, %r21;
add.f32 %f31, %f15, 0f00000000;
mov.b32 %f30, %r22;
$L__BB0_5:
add.f32 %f32, %f31, %f30;
$L__BB0_7:
mov.u32 %r29, %tid.z;
mad.lo.s32 %r30, %r2, %r29, %r3;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r8, %r30, %r1, %r7;
mul.wide.u32 %rd8, %r8, 4;
mov.u64 %rd9, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_638b7348_191105arrayE;
add.s64 %rd2, %rd9, %rd8;
st.shared.f32 [%rd2], %f32;
bar.sync 0;
clz.b32 %r31, %r1;
mov.u32 %r32, 31;
sub.s32 %r33, %r32, %r31;
mov.u32 %r34, 1;
shl.b32 %r39, %r34, %r33;
setp.ge.u32 %p5, %r7, %r39;
add.s32 %r35, %r39, %r7;
setp.ge.u32 %p6, %r35, %r1;
or.pred %p7, %p5, %p6;
@%p7 bra $L__BB0_9;
add.s32 %r36, %r8, %r39;
mul.wide.s32 %rd10, %r36, 4;
add.s64 %rd12, %rd9, %rd10;
ld.shared.f32 %f19, [%rd2];
ld.shared.f32 %f20, [%rd12];
add.f32 %f21, %f20, %f19;
st.shared.f32 [%rd2], %f21;
$L__BB0_9:
bar.sync 0;
setp.lt.s32 %p8, %r39, 4;
@%p8 bra $L__BB0_13;
$L__BB0_10:
shr.u32 %r11, %r39, 1;
setp.ge.u32 %p9, %r7, %r11;
@%p9 bra $L__BB0_12;
add.s32 %r37, %r11, %r8;
mul.wide.s32 %rd13, %r37, 4;
add.s64 %rd15, %rd9, %rd13;
ld.shared.f32 %f22, [%rd2];
ld.shared.f32 %f23, [%rd15];
add.f32 %f24, %f23, %f22;
st.shared.f32 [%rd2], %f24;
$L__BB0_12:
bar.sync 0;
setp.gt.u32 %p10, %r39, 7;
mov.u32 %r39, %r11;
@%p10 bra $L__BB0_10;
$L__BB0_13:
setp.ne.s32 %p11, %r7, 0;
mov.f32 %f33, 0f00000000;
@%p11 bra $L__BB0_16;
ld.shared.f32 %f26, [%rd2];
add.f32 %f33, %f26, 0f00000000;
setp.lt.u32 %p12, %r1, 2;
@%p12 bra $L__BB0_16;
add.s32 %r38, %r8, 1;
mul.wide.u32 %rd16, %r38, 4;
add.s64 %rd18, %rd9, %rd16;
ld.shared.f32 %f27, [%rd18];
add.f32 %f33, %f33, %f27;
$L__BB0_16:
bar.sync 0;
or.pred %p15, %p11, %p1;
@%p15 bra $L__BB0_18;
cvta.to.global.u64 %rd19, %rd3;
mul.wide.s32 %rd20, %r4, 4;
add.s64 %rd21, %rd19, %rd20;
st.global.f32 [%rd21], %f33;
$L__BB0_18:
ret;
$L__BB0_6:
add.s32 %r27, %r4, %r5;
shl.b32 %r28, %r27, 1;
mul.wide.s32 %rd7, %r28, 4;
add.s64 %rd6, %rd1, %rd7;
// begin inline asm
ld.global.cs.v2.u32 {%r25,%r26}, [%rd6];
// end inline asm
mov.b32 %f16, %r25;
add.f32 %f17, %f16, 0f00000000;
mov.b32 %f18, %r26;
add.f32 %f32, %f17, %f18;
bra.uni $L__BB0_7;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_1601111nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_1601111nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_1601111nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1[24]
)
{
.reg .pred %p<13>;
.reg .f32 %f<21>;
.reg .b32 %r<45>;
.reg .b64 %rd<20>;
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_1601111nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_1601111nvfuser_471ENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0];
mov.u32 %r1, %ntid.x;
mov.u32 %r30, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
mad.lo.s32 %r4, %r2, %r30, %r3;
setp.gt.s32 %p1, %r4, 27453;
mov.f32 %f19, 0f00000000;
@%p1 bra $L__BB0_2;
shl.b32 %r33, %r4, 1;
mul.wide.s32 %rd5, %r33, 4;
add.s64 %rd4, %rd2, %rd5;
// begin inline asm
ld.global.cs.v2.u32 {%r31,%r32}, [%rd4];
// end inline asm
mov.b32 %f7, %r31;
add.f32 %f8, %f7, 0f00000000;
mov.b32 %f9, %r32;
add.f32 %f19, %f8, %f9;
$L__BB0_2:
mov.u32 %r34, %tid.z;
mad.lo.s32 %r35, %r2, %r34, %r3;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r6, %r35, %r1, %r5;
mul.wide.u32 %rd6, %r6, 4;
mov.u64 %rd7, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_471_cu_04f1218d_160115arrayE;
add.s64 %rd1, %rd7, %rd6;
st.shared.f32 [%rd1], %f19;
bar.sync 0;
clz.b32 %r36, %r1;
mov.u32 %r37, 31;
sub.s32 %r38, %r37, %r36;
mov.u32 %r39, 1;
shl.b32 %r44, %r39, %r38;
setp.ge.u32 %p2, %r5, %r44;
add.s32 %r40, %r44, %r5;
setp.ge.u32 %p3, %r40, %r1;
or.pred %p4, %p2, %p3;
@%p4 bra $L__BB0_4;
add.s32 %r41, %r6, %r44;
mul.wide.s32 %rd8, %r41, 4;
add.s64 %rd10, %rd7, %rd8;
ld.shared.f32 %f10, [%rd1];
ld.shared.f32 %f11, [%rd10];
add.f32 %f12, %f11, %f10;
st.shared.f32 [%rd1], %f12;
$L__BB0_4:
bar.sync 0;
setp.lt.s32 %p5, %r44, 4;
@%p5 bra $L__BB0_8;
$L__BB0_5:
shr.u32 %r9, %r44, 1;
setp.ge.u32 %p6, %r5, %r9;
@%p6 bra $L__BB0_7;
add.s32 %r42, %r9, %r6;
mul.wide.s32 %rd11, %r42, 4;
add.s64 %rd13, %rd7, %rd11;
ld.shared.f32 %f13, [%rd1];
ld.shared.f32 %f14, [%rd13];
add.f32 %f15, %f14, %f13;
st.shared.f32 [%rd1], %f15;
$L__BB0_7:
bar.sync 0;
setp.gt.u32 %p7, %r44, 7;
mov.u32 %r44, %r9;
@%p7 bra $L__BB0_5;
$L__BB0_8:
setp.ne.s32 %p8, %r5, 0;
mov.f32 %f20, 0f00000000;
@%p8 bra $L__BB0_11;
ld.shared.f32 %f17, [%rd1];
add.f32 %f20, %f17, 0f00000000;
setp.lt.u32 %p9, %r1, 2;
@%p9 bra $L__BB0_11;
add.s32 %r43, %r6, 1;
mul.wide.u32 %rd14, %r43, 4;
add.s64 %rd16, %rd7, %rd14;
ld.shared.f32 %f18, [%rd16];
add.f32 %f20, %f20, %f18;
$L__BB0_11:
bar.sync 0;
or.pred %p12, %p8, %p1;
@%p12 bra $L__BB0_13;
cvta.to.global.u64 %rd17, %rd3;
mul.wide.s32 %rd18, %r4, 4;
add.s64 %rd19, %rd17, %rd18;
st.global.f32 [%rd19], %f20;
$L__BB0_13:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -18,150 +18,118 @@
.entry _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE(
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0[32],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1[24]
)
{
- .reg .pred %p<16>;
- .reg .f32 %f<34>;
- .reg .b32 %r<40>;
- .reg .b64 %rd<22>;
+ .reg .pred %p<13>;
+ .reg .f32 %f<21>;
+ .reg .b32 %r<45>;
+ .reg .b64 %rd<20>;
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_1];
- ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0];
+ ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi3ELi3EEENS0_IfLi2ELi2EEE_param_0];
mov.u32 %r1, %ntid.x;
- mov.u32 %r20, %ctaid.x;
+ mov.u32 %r30, %ctaid.x;
mov.u32 %r2, %ntid.y;
mov.u32 %r3, %tid.y;
- mad.lo.s32 %r4, %r2, %r20, %r3;
+ mad.lo.s32 %r4, %r2, %r30, %r3;
setp.gt.s32 %p1, %r4, 27453;
+ mov.f32 %f19, 0f00000000;
@%p1 bra $L__BB0_2;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p2, %r5, 1;
- @%p2 bra $L__BB0_6;
+ shl.b32 %r33, %r4, 1;
+ mul.wide.s32 %rd5, %r33, 4;
+ add.s64 %rd4, %rd2, %rd5;
+
+ ld.global.cs.v2.u32 {%r31,%r32}, [%rd4];
+
+ mov.b32 %f7, %r31;
+ add.f32 %f8, %f7, 0f00000000;
+ mov.b32 %f9, %r32;
+ add.f32 %f19, %f8, %f9;
$L__BB0_2:
- setp.lt.s32 %p3, %r4, 27454;
- mov.f32 %f30, 0f00000000;
- mov.f32 %f31, 0f00000000;
- @%p3 bra $L__BB0_3;
- bra.uni $L__BB0_5;
+ mov.u32 %r34, %tid.z;
+ mad.lo.s32 %r35, %r2, %r34, %r3;
+ mov.u32 %r5, %tid.x;
+ mad.lo.s32 %r6, %r35, %r1, %r5;
+ mul.wide.u32 %rd6, %r6, 4;
+ mov.u64 %rd7, _ZN11kernelscope6kernelE;
+ add.s64 %rd1, %rd7, %rd6;
+ st.shared.f32 [%rd1], %f19;
+ bar.sync 0;
+ clz.b32 %r36, %r1;
+ mov.u32 %r37, 31;
+ sub.s32 %r38, %r37, %r36;
+ mov.u32 %r39, 1;
+ shl.b32 %r44, %r39, %r38;
+ setp.ge.u32 %p2, %r5, %r44;
+ add.s32 %r40, %r44, %r5;
+ setp.ge.u32 %p3, %r40, %r1;
+ or.pred %p4, %p2, %p3;
+ @%p4 bra $L__BB0_4;
-$L__BB0_3:
- mov.u32 %r6, %tid.x;
- setp.gt.s32 %p4, %r6, 0;
- @%p4 bra $L__BB0_5;
+ add.s32 %r41, %r6, %r44;
+ mul.wide.s32 %rd8, %r41, 4;
+ add.s64 %rd10, %rd7, %rd8;
+ ld.shared.f32 %f10, [%rd1];
+ ld.shared.f32 %f11, [%rd10];
+ add.f32 %f12, %f11, %f10;
+ st.shared.f32 [%rd1], %f12;
- add.s32 %r23, %r4, %r6;
- shl.b32 %r24, %r23, 1;
- mul.wide.s32 %rd5, %r24, 4;
- add.s64 %rd4, %rd1, %rd5;
-
- ld.global.cs.v2.u32 {%r21,%r22}, [%rd4];
-
- mov.b32 %f15, %r21;
- add.f32 %f31, %f15, 0f00000000;
- mov.b32 %f30, %r22;
+$L__BB0_4:
+ bar.sync 0;
+ setp.lt.s32 %p5, %r44, 4;
+ @%p5 bra $L__BB0_8;
$L__BB0_5:
- add.f32 %f32, %f31, %f30;
+ shr.u32 %r9, %r44, 1;
+ setp.ge.u32 %p6, %r5, %r9;
+ @%p6 bra $L__BB0_7;
+
+ add.s32 %r42, %r9, %r6;
+ mul.wide.s32 %rd11, %r42, 4;
+ add.s64 %rd13, %rd7, %rd11;
+ ld.shared.f32 %f13, [%rd1];
+ ld.shared.f32 %f14, [%rd13];
+ add.f32 %f15, %f14, %f13;
+ st.shared.f32 [%rd1], %f15;
$L__BB0_7:
- mov.u32 %r29, %tid.z;
- mad.lo.s32 %r30, %r2, %r29, %r3;
- mov.u32 %r7, %tid.x;
- mad.lo.s32 %r8, %r30, %r1, %r7;
- mul.wide.u32 %rd8, %r8, 4;
- mov.u64 %rd9, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd9, %rd8;
- st.shared.f32 [%rd2], %f32;
bar.sync 0;
- clz.b32 %r31, %r1;
- mov.u32 %r32, 31;
- sub.s32 %r33, %r32, %r31;
- mov.u32 %r34, 1;
- shl.b32 %r39, %r34, %r33;
- setp.ge.u32 %p5, %r7, %r39;
- add.s32 %r35, %r39, %r7;
- setp.ge.u32 %p6, %r35, %r1;
- or.pred %p7, %p5, %p6;
- @%p7 bra $L__BB0_9;
+ setp.gt.u32 %p7, %r44, 7;
+ mov.u32 %r44, %r9;
+ @%p7 bra $L__BB0_5;
- add.s32 %r36, %r8, %r39;
- mul.wide.s32 %rd10, %r36, 4;
- add.s64 %rd12, %rd9, %rd10;
- ld.shared.f32 %f19, [%rd2];
- ld.shared.f32 %f20, [%rd12];
- add.f32 %f21, %f20, %f19;
- st.shared.f32 [%rd2], %f21;
+$L__BB0_8:
+ setp.ne.s32 %p8, %r5, 0;
+ mov.f32 %f20, 0f00000000;
+ @%p8 bra $L__BB0_11;
-$L__BB0_9:
+ ld.shared.f32 %f17, [%rd1];
+ add.f32 %f20, %f17, 0f00000000;
+ setp.lt.u32 %p9, %r1, 2;
+ @%p9 bra $L__BB0_11;
+
+ add.s32 %r43, %r6, 1;
+ mul.wide.u32 %rd14, %r43, 4;
+ add.s64 %rd16, %rd7, %rd14;
+ ld.shared.f32 %f18, [%rd16];
+ add.f32 %f20, %f20, %f18;
+
+$L__BB0_11:
bar.sync 0;
- setp.lt.s32 %p8, %r39, 4;
- @%p8 bra $L__BB0_13;
+ or.pred %p12, %p8, %p1;
+ @%p12 bra $L__BB0_13;
-$L__BB0_10:
- shr.u32 %r11, %r39, 1;
- setp.ge.u32 %p9, %r7, %r11;
- @%p9 bra $L__BB0_12;
-
- add.s32 %r37, %r11, %r8;
- mul.wide.s32 %rd13, %r37, 4;
- add.s64 %rd15, %rd9, %rd13;
- ld.shared.f32 %f22, [%rd2];
- ld.shared.f32 %f23, [%rd15];
- add.f32 %f24, %f23, %f22;
- st.shared.f32 [%rd2], %f24;
-
-$L__BB0_12:
- bar.sync 0;
- setp.gt.u32 %p10, %r39, 7;
- mov.u32 %r39, %r11;
- @%p10 bra $L__BB0_10;
+ cvta.to.global.u64 %rd17, %rd3;
+ mul.wide.s32 %rd18, %r4, 4;
+ add.s64 %rd19, %rd17, %rd18;
+ st.global.f32 [%rd19], %f20;
$L__BB0_13:
- setp.ne.s32 %p11, %r7, 0;
- mov.f32 %f33, 0f00000000;
- @%p11 bra $L__BB0_16;
-
- ld.shared.f32 %f26, [%rd2];
- add.f32 %f33, %f26, 0f00000000;
- setp.lt.u32 %p12, %r1, 2;
- @%p12 bra $L__BB0_16;
-
- add.s32 %r38, %r8, 1;
- mul.wide.u32 %rd16, %r38, 4;
- add.s64 %rd18, %rd9, %rd16;
- ld.shared.f32 %f27, [%rd18];
- add.f32 %f33, %f33, %f27;
-
-$L__BB0_16:
- bar.sync 0;
- or.pred %p15, %p11, %p1;
- @%p15 bra $L__BB0_18;
-
- cvta.to.global.u64 %rd19, %rd3;
- mul.wide.s32 %rd20, %r4, 4;
- add.s64 %rd21, %rd19, %rd20;
- st.global.f32 [%rd21], %f33;
-
-$L__BB0_18:
ret;
-
-$L__BB0_6:
- add.s32 %r27, %r4, %r5;
- shl.b32 %r28, %r27, 1;
- mul.wide.s32 %rd7, %r28, 4;
- add.s64 %rd6, %rd1, %rd7;
-
- ld.global.cs.v2.u32 {%r25,%r26}, [%rd6];
-
- mov.b32 %f16, %r25;
- add.f32 %f17, %f16, 0f00000000;
- mov.b32 %f18, %r26;
- add.f32 %f32, %f17, %f18;
- bra.uni $L__BB0_7;
}
18: ReshapeReduction.FusionReshapeReduction/62
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 16
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T6) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) / 20))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) % 5)))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
= T0[((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) / 20))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T8[0]
+ T9[0];
Array<float, 1, 1> T10;
T10[0]
= T2[0];
T6[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T10[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T6) {
if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
= T1[((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 60) / 20))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) % 5)))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
= T0[((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 60) / 20))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T8[0]
+ T9[0];
Array<float, 1, 1> T10;
T10[0]
= T2[0];
T6[((nvfuser_index_t)threadIdx.x)]
= T10[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,21 +1,21 @@
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 4, 4> T1, Tensor<float, 4, 4> T6) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 120)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 120)) {
Array<float, 1, 1> T9;
T9[0] = 0;
T9[0]
- = T1[((((T1.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T1.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) / 20))) + (T1.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) / 5))) + (T1.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) % 5)))];
+ = T1[((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 60) / 20))) + (T1.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) / 5))) + (T1.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) % 5)))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
- = T0[((((T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) / 60)) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) / 20))) + (T0.alloc_stride[2LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) / 5))) + (T0.alloc_stride[3LL] * ((((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) % 60) % 20) % 5)))];
+ = T0[((((T0.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 60)) + (T0.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 60) / 20))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) / 5))) + (T0.alloc_stride[3LL] * (((((nvfuser_index_t)threadIdx.x) % 60) % 20) % 5)))];
Array<float, 1, 1> T2;
T2[0]
= T8[0]
+ T9[0];
Array<float, 1, 1> T10;
T10[0]
= T2[0];
- T6[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T6[((nvfuser_index_t)threadIdx.x)]
= T10[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_2[40]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
.reg .b32 %r<80>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r30, %r31}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_0+24];
ld.param.v2.u32 {%r32, %r33}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_0+32];
ld.param.v2.u32 {%r38, %r39}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_1+24];
ld.param.v2.u32 {%r40, %r41}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_638b7348_1911011nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_0];
mov.u32 %r50, %ctaid.x;
shl.b32 %r51, %r50, 7;
mov.u32 %r52, %tid.x;
add.s32 %r1, %r51, %r52;
setp.gt.s32 %p1, %r1, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
mul.hi.s32 %r53, %r1, -2004318071;
add.s32 %r54, %r53, %r1;
shr.u32 %r55, %r54, 31;
shr.s32 %r56, %r54, 5;
add.s32 %r57, %r56, %r55;
mul.lo.s32 %r58, %r57, 60;
sub.s32 %r59, %r1, %r58;
mul.hi.s32 %r60, %r59, 1717986919;
shr.u32 %r61, %r60, 31;
shr.s32 %r62, %r60, 3;
add.s32 %r63, %r62, %r61;
mul.lo.s32 %r64, %r39, %r63;
mad.lo.s32 %r65, %r38, %r57, %r64;
mul.lo.s32 %r66, %r63, 20;
sub.s32 %r67, %r59, %r66;
mul.hi.s32 %r68, %r67, 1717986919;
shr.u32 %r69, %r68, 31;
shr.s32 %r70, %r68, 1;
add.s32 %r71, %r70, %r69;
mad.lo.s32 %r72, %r40, %r71, %r65;
mul.lo.s32 %r73, %r71, 5;
sub.s32 %r74, %r67, %r73;
mad.lo.s32 %r75, %r41, %r74, %r72;
mul.wide.s32 %rd7, %r75, 4;
add.s64 %rd8, %rd4, %rd7;
mul.lo.s32 %r76, %r31, %r63;
mad.lo.s32 %r77, %r30, %r57, %r76;
mad.lo.s32 %r78, %r71, %r32, %r77;
mad.lo.s32 %r79, %r74, %r33, %r78;
mul.wide.s32 %rd9, %r79, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd6, %rd11;
st.global.f32 [%rd12], %f3;
$L__BB0_2:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_1[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_2[40]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
.reg .b32 %r<77>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r30, %r31}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_0+24];
ld.param.v2.u32 {%r32, %r33}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_0+32];
ld.param.v2.u32 {%r38, %r39}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_1+24];
ld.param.v2.u32 {%r40, %r41}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_1+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_477_cu_8ba90e9e_1601111nvfuser_477ENS_6TensorIfLi4ELi4EEES1_S1__param_0];
mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
mul.hi.s32 %r50, %r1, -2004318071;
add.s32 %r51, %r50, %r1;
shr.u32 %r52, %r51, 31;
shr.s32 %r53, %r51, 5;
add.s32 %r54, %r53, %r52;
mul.lo.s32 %r55, %r54, 60;
sub.s32 %r56, %r1, %r55;
mul.hi.s32 %r57, %r56, 1717986919;
shr.u32 %r58, %r57, 31;
shr.s32 %r59, %r57, 3;
add.s32 %r60, %r59, %r58;
mul.lo.s32 %r61, %r39, %r60;
mad.lo.s32 %r62, %r38, %r54, %r61;
mul.lo.s32 %r63, %r60, 20;
sub.s32 %r64, %r56, %r63;
mul.hi.s32 %r65, %r64, 1717986919;
shr.u32 %r66, %r65, 31;
shr.s32 %r67, %r65, 1;
add.s32 %r68, %r67, %r66;
mad.lo.s32 %r69, %r40, %r68, %r62;
mul.lo.s32 %r70, %r68, 5;
sub.s32 %r71, %r64, %r70;
mad.lo.s32 %r72, %r41, %r71, %r69;
mul.wide.s32 %rd7, %r72, 4;
add.s64 %rd8, %rd4, %rd7;
mul.lo.s32 %r73, %r31, %r60;
mad.lo.s32 %r74, %r30, %r54, %r73;
mad.lo.s32 %r75, %r32, %r68, %r74;
mad.lo.s32 %r76, %r33, %r71, %r75;
mul.wide.s32 %rd9, %r76, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
mul.wide.s32 %rd11, %r1, 4;
add.s64 %rd12, %rd6, %rd11;
st.global.f32 [%rd12], %f3;
$L__BB0_2:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,61 +20,58 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_2[40]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
- .reg .b32 %r<80>;
+ .reg .b32 %r<77>;
.reg .b64 %rd<13>;
ld.param.v2.u32 {%r30, %r31}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_0+24];
ld.param.v2.u32 {%r32, %r33}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_0+32];
ld.param.v2.u32 {%r38, %r39}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_1+24];
ld.param.v2.u32 {%r40, %r41}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_1+32];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEES1_S1__param_0];
- mov.u32 %r50, %ctaid.x;
- shl.b32 %r51, %r50, 7;
- mov.u32 %r52, %tid.x;
- add.s32 %r1, %r51, %r52;
+ mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 119;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd2;
cvta.to.global.u64 %rd5, %rd1;
cvta.to.global.u64 %rd6, %rd3;
- mul.hi.s32 %r53, %r1, -2004318071;
- add.s32 %r54, %r53, %r1;
- shr.u32 %r55, %r54, 31;
- shr.s32 %r56, %r54, 5;
- add.s32 %r57, %r56, %r55;
- mul.lo.s32 %r58, %r57, 60;
- sub.s32 %r59, %r1, %r58;
- mul.hi.s32 %r60, %r59, 1717986919;
- shr.u32 %r61, %r60, 31;
- shr.s32 %r62, %r60, 3;
- add.s32 %r63, %r62, %r61;
- mul.lo.s32 %r64, %r39, %r63;
- mad.lo.s32 %r65, %r38, %r57, %r64;
- mul.lo.s32 %r66, %r63, 20;
- sub.s32 %r67, %r59, %r66;
- mul.hi.s32 %r68, %r67, 1717986919;
- shr.u32 %r69, %r68, 31;
- shr.s32 %r70, %r68, 1;
- add.s32 %r71, %r70, %r69;
- mad.lo.s32 %r72, %r40, %r71, %r65;
- mul.lo.s32 %r73, %r71, 5;
- sub.s32 %r74, %r67, %r73;
- mad.lo.s32 %r75, %r41, %r74, %r72;
- mul.wide.s32 %rd7, %r75, 4;
+ mul.hi.s32 %r50, %r1, -2004318071;
+ add.s32 %r51, %r50, %r1;
+ shr.u32 %r52, %r51, 31;
+ shr.s32 %r53, %r51, 5;
+ add.s32 %r54, %r53, %r52;
+ mul.lo.s32 %r55, %r54, 60;
+ sub.s32 %r56, %r1, %r55;
+ mul.hi.s32 %r57, %r56, 1717986919;
+ shr.u32 %r58, %r57, 31;
+ shr.s32 %r59, %r57, 3;
+ add.s32 %r60, %r59, %r58;
+ mul.lo.s32 %r61, %r39, %r60;
+ mad.lo.s32 %r62, %r38, %r54, %r61;
+ mul.lo.s32 %r63, %r60, 20;
+ sub.s32 %r64, %r56, %r63;
+ mul.hi.s32 %r65, %r64, 1717986919;
+ shr.u32 %r66, %r65, 31;
+ shr.s32 %r67, %r65, 1;
+ add.s32 %r68, %r67, %r66;
+ mad.lo.s32 %r69, %r40, %r68, %r62;
+ mul.lo.s32 %r70, %r68, 5;
+ sub.s32 %r71, %r64, %r70;
+ mad.lo.s32 %r72, %r41, %r71, %r69;
+ mul.wide.s32 %rd7, %r72, 4;
add.s64 %rd8, %rd4, %rd7;
- mul.lo.s32 %r76, %r31, %r63;
- mad.lo.s32 %r77, %r30, %r57, %r76;
- mad.lo.s32 %r78, %r71, %r32, %r77;
- mad.lo.s32 %r79, %r74, %r33, %r78;
- mul.wide.s32 %rd9, %r79, 4;
+ mul.lo.s32 %r73, %r31, %r60;
+ mad.lo.s32 %r74, %r30, %r54, %r73;
+ mad.lo.s32 %r75, %r32, %r68, %r74;
+ mad.lo.s32 %r76, %r33, %r71, %r75;
+ mul.wide.s32 %rd9, %r76, 4;
add.s64 %rd10, %rd5, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
mul.wide.s32 %rd11, %r1, 4;
19: ReshapeReduction.FusionReshapeReduction/68
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-1
+1 index type: int
registers: 0
gmem: 0
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 5, 5> T4) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(T0.logical_size[3LL], ((nvfuser_index_t)blockDim.x))); ++i0) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < (((8 * T0.logical_size[0LL]) * T0.logical_size[1LL]) * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i0)) < T0.logical_size[3LL]))) {
T6[0]
= T6[0]
+ T0[((((((T0.alloc_stride[3LL] * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) / T0.logical_size[1LL]))) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) % T0.logical_size[1LL]))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) % (4 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))))) + ((T0.alloc_stride[2LL] * (ceilDiv(T0.logical_size[2LL], 2))) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) / (4 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))))) + ((((nvfuser_index_t)blockDim.x) * T0.alloc_stride[3LL]) * i0))];
}
}
Array<float, 1, 1> T2;
T2[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T2[0], T6[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T3;
T3[0]
= T2[0];
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < (((8 * T0.logical_size[0LL]) * T0.logical_size[1LL]) * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))))) {
T4[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T3[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T0, Tensor<float, 5, 5> T4) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T6;
T6[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(T0.logical_size[3LL], ((nvfuser_index_t)blockDim.x))); ++i0) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < (((8 * T0.logical_size[0LL]) * T0.logical_size[1LL]) * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i0)) < T0.logical_size[3LL]))) {
T6[0]
= T6[0]
+ T0[(((((T0.alloc_stride[3LL] * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) / T0.logical_size[1LL]))) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) % T0.logical_size[1LL]))) + (T0.alloc_stride[2LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))))) + ((((nvfuser_index_t)blockDim.x) * T0.alloc_stride[3LL]) * i0))];
}
}
Array<float, 1, 1> T2;
T2[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T2[0], T6[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T3;
T3[0]
= T2[0];
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < (((8 * T0.logical_size[0LL]) * T0.logical_size[1LL]) * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))))) {
T4[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T3[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -6,11 +6,11 @@
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(T0.logical_size[3LL], ((nvfuser_index_t)blockDim.x))); ++i0) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < (((8 * T0.logical_size[0LL]) * T0.logical_size[1LL]) * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i0)) < T0.logical_size[3LL]))) {
T6[0]
= T6[0]
- + T0[((((((T0.alloc_stride[3LL] * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) / T0.logical_size[1LL]))) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) % T0.logical_size[1LL]))) + (T0.alloc_stride[2LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) % (4 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))))) + ((T0.alloc_stride[2LL] * (ceilDiv(T0.logical_size[2LL], 2))) * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) / (4 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))))) + ((((nvfuser_index_t)blockDim.x) * T0.alloc_stride[3LL]) * i0))];
+ + T0[(((((T0.alloc_stride[3LL] * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) / T0.logical_size[1LL]))) + (T0.alloc_stride[1LL] * (((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))) % T0.logical_size[1LL]))) + (T0.alloc_stride[2LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % (8 * (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), 4)))))) + ((((nvfuser_index_t)blockDim.x) * T0.alloc_stride[3LL]) * i0))];
}
}
Array<float, 1, 1> T2;
T2[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T2[0], T6[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_1911011nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_1911011nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_1911011nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_1[48]
)
{
.reg .pred %p<15>;
.reg .f32 %f<24>;
.reg .b32 %r<125>;
.reg .b64 %rd<21>;
ld.param.v2.u32 {%r50, %r51}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_1911011nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+8];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_1911011nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+16];
ld.param.v2.u32 {%r54, %r55}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_1911011nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+24];
ld.param.v2.u32 {%r56, %r57}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_1911011nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+32];
ld.param.u64 %rd4, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_1911011nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_1];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_1911011nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0];
mov.u32 %r4, %ntid.x;
add.s32 %r68, %r4, %r53;
add.s32 %r69, %r68, -1;
div.s32 %r6, %r69, %r4;
setp.gt.s32 %p1, %r6, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
mov.u32 %r71, %tid.y;
mov.u32 %r72, %ctaid.x;
mov.u32 %r73, %ntid.y;
mad.lo.s32 %r7, %r73, %r72, %r71;
mul.lo.s32 %r74, %r50, %r51;
add.s32 %r75, %r52, 1;
shr.u32 %r76, %r75, 31;
add.s32 %r77, %r75, %r76;
shr.s32 %r8, %r77, 1;
add.s32 %r78, %r8, 3;
shr.s32 %r79, %r78, 31;
shr.u32 %r80, %r79, 30;
add.s32 %r81, %r78, %r80;
shr.s32 %r82, %r81, 2;
mul.lo.s32 %r83, %r74, %r82;
shl.b32 %r9, %r83, 3;
shl.b32 %r10, %r82, 3;
and.b32 %r11, %r81, -4;
mov.u32 %r121, %tid.x;
mul.lo.s32 %r122, %r57, %r121;
mul.lo.s32 %r14, %r57, %r4;
cvta.to.global.u64 %rd1, %rd3;
mov.f32 %f21, 0f00000000;
mov.u32 %r123, 0;
$L__BB0_3:
.pragma "nounroll";
setp.ge.s32 %p2, %r7, %r9;
@%p2 bra $L__BB0_6;
setp.ge.s32 %p3, %r121, %r53;
@%p3 bra $L__BB0_6;
div.s32 %r84, %r7, %r10;
div.s32 %r85, %r84, %r51;
mul.lo.s32 %r86, %r85, %r51;
sub.s32 %r87, %r84, %r86;
mul.lo.s32 %r88, %r84, %r10;
sub.s32 %r89, %r7, %r88;
div.s32 %r90, %r89, %r11;
mul.lo.s32 %r91, %r90, %r11;
sub.s32 %r92, %r89, %r91;
mad.lo.s32 %r93, %r90, %r8, %r92;
mul.lo.s32 %r94, %r54, %r85;
mad.lo.s32 %r95, %r55, %r87, %r94;
mad.lo.s32 %r96, %r93, %r56, %r95;
add.s32 %r97, %r122, %r96;
mul.wide.s32 %rd5, %r97, 4;
add.s64 %rd6, %rd1, %rd5;
ld.global.f32 %f10, [%rd6];
add.f32 %f21, %f21, %f10;
$L__BB0_6:
add.s32 %r122, %r122, %r14;
add.s32 %r121, %r121, %r4;
add.s32 %r123, %r123, 1;
setp.lt.s32 %p4, %r123, %r6;
@%p4 bra $L__BB0_3;
bra.uni $L__BB0_7;
$L__BB0_1:
mov.f32 %f21, 0f00000000;
$L__BB0_7:
mov.u32 %r98, %tid.z;
mov.u32 %r24, %ntid.y;
mov.u32 %r25, %tid.y;
mad.lo.s32 %r99, %r24, %r98, %r25;
mov.u32 %r26, %tid.x;
mad.lo.s32 %r27, %r99, %r4, %r26;
mul.wide.u32 %rd7, %r27, 4;
mov.u64 %rd8, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_ecd35c5b_191105arrayE;
add.s64 %rd2, %rd8, %rd7;
st.shared.f32 [%rd2], %f21;
bar.sync 0;
clz.b32 %r100, %r4;
mov.u32 %r101, 31;
sub.s32 %r102, %r101, %r100;
mov.u32 %r103, 1;
shl.b32 %r124, %r103, %r102;
setp.ge.u32 %p5, %r26, %r124;
add.s32 %r104, %r124, %r26;
setp.ge.u32 %p6, %r104, %r4;
or.pred %p7, %p5, %p6;
@%p7 bra $L__BB0_9;
add.s32 %r105, %r27, %r124;
mul.wide.s32 %rd9, %r105, 4;
add.s64 %rd11, %rd8, %rd9;
ld.shared.f32 %f11, [%rd2];
ld.shared.f32 %f12, [%rd11];
add.f32 %f13, %f12, %f11;
st.shared.f32 [%rd2], %f13;
$L__BB0_9:
bar.sync 0;
setp.lt.s32 %p8, %r124, 4;
@%p8 bra $L__BB0_13;
$L__BB0_10:
shr.u32 %r30, %r124, 1;
setp.ge.u32 %p9, %r26, %r30;
@%p9 bra $L__BB0_12;
add.s32 %r106, %r30, %r27;
mul.wide.s32 %rd12, %r106, 4;
add.s64 %rd14, %rd8, %rd12;
ld.shared.f32 %f14, [%rd2];
ld.shared.f32 %f15, [%rd14];
add.f32 %f16, %f15, %f14;
st.shared.f32 [%rd2], %f16;
$L__BB0_12:
bar.sync 0;
setp.gt.u32 %p10, %r124, 7;
mov.u32 %r124, %r30;
@%p10 bra $L__BB0_10;
$L__BB0_13:
setp.ne.s32 %p11, %r26, 0;
mov.f32 %f23, 0f00000000;
@%p11 bra $L__BB0_16;
ld.shared.f32 %f18, [%rd2];
add.f32 %f23, %f18, 0f00000000;
setp.lt.u32 %p12, %r4, 2;
@%p12 bra $L__BB0_16;
add.s32 %r107, %r27, 1;
mul.wide.u32 %rd15, %r107, 4;
add.s64 %rd17, %rd8, %rd15;
ld.shared.f32 %f19, [%rd17];
add.f32 %f23, %f23, %f19;
$L__BB0_16:
bar.sync 0;
@%p11 bra $L__BB0_19;
mov.u32 %r108, %ctaid.x;
mad.lo.s32 %r31, %r24, %r108, %r25;
mul.lo.s32 %r109, %r50, %r51;
add.s32 %r110, %r52, 1;
shr.u32 %r111, %r110, 31;
add.s32 %r112, %r110, %r111;
shr.s32 %r113, %r112, 1;
add.s32 %r114, %r113, 3;
shr.s32 %r115, %r114, 31;
shr.u32 %r116, %r115, 30;
add.s32 %r117, %r114, %r116;
shr.u32 %r118, %r117, 2;
mul.lo.s32 %r119, %r109, %r118;
shl.b32 %r120, %r119, 3;
setp.ge.s32 %p14, %r31, %r120;
@%p14 bra $L__BB0_19;
cvta.to.global.u64 %rd18, %rd4;
mul.wide.s32 %rd19, %r31, 4;
add.s64 %rd20, %rd18, %rd19;
st.global.f32 [%rd20], %f23;
$L__BB0_19:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_1601111nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_1601111nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_1601111nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_1[48]
)
{
.reg .pred %p<15>;
.reg .f32 %f<24>;
.reg .b32 %r<120>;
.reg .b64 %rd<21>;
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_1601111nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+8];
ld.param.v2.u32 {%r50, %r51}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_1601111nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+16];
ld.param.v2.u32 {%r52, %r53}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_1601111nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+24];
ld.param.v2.u32 {%r54, %r55}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_1601111nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+32];
ld.param.u64 %rd4, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_1601111nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_1];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_1601111nvfuser_485ENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0];
mov.u32 %r4, %ntid.x;
add.s32 %r66, %r4, %r51;
add.s32 %r67, %r66, -1;
div.s32 %r6, %r67, %r4;
setp.gt.s32 %p1, %r6, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
mov.u32 %r69, %tid.y;
mov.u32 %r70, %ctaid.x;
mov.u32 %r71, %ntid.y;
mad.lo.s32 %r7, %r71, %r70, %r69;
mul.lo.s32 %r72, %r48, %r49;
add.s32 %r73, %r50, 1;
shr.u32 %r74, %r73, 31;
add.s32 %r75, %r73, %r74;
shr.s32 %r76, %r75, 1;
add.s32 %r77, %r76, 3;
shr.s32 %r78, %r77, 31;
shr.u32 %r79, %r78, 30;
add.s32 %r80, %r77, %r79;
shr.s32 %r81, %r80, 2;
mul.lo.s32 %r82, %r72, %r81;
shl.b32 %r8, %r82, 3;
shl.b32 %r9, %r81, 3;
mov.u32 %r116, %tid.x;
mul.lo.s32 %r117, %r55, %r116;
mul.lo.s32 %r12, %r55, %r4;
cvta.to.global.u64 %rd1, %rd3;
mov.f32 %f21, 0f00000000;
mov.u32 %r118, 0;
$L__BB0_3:
.pragma "nounroll";
setp.ge.s32 %p2, %r7, %r8;
@%p2 bra $L__BB0_6;
setp.ge.s32 %p3, %r116, %r51;
@%p3 bra $L__BB0_6;
div.s32 %r83, %r7, %r9;
div.s32 %r84, %r83, %r49;
mul.lo.s32 %r85, %r84, %r49;
sub.s32 %r86, %r83, %r85;
mul.lo.s32 %r87, %r83, %r9;
sub.s32 %r88, %r7, %r87;
mul.lo.s32 %r89, %r54, %r88;
mad.lo.s32 %r90, %r53, %r86, %r89;
mad.lo.s32 %r91, %r52, %r84, %r90;
add.s32 %r92, %r117, %r91;
mul.wide.s32 %rd5, %r92, 4;
add.s64 %rd6, %rd1, %rd5;
ld.global.f32 %f10, [%rd6];
add.f32 %f21, %f21, %f10;
$L__BB0_6:
add.s32 %r117, %r117, %r12;
add.s32 %r116, %r116, %r4;
add.s32 %r118, %r118, 1;
setp.lt.s32 %p4, %r118, %r6;
@%p4 bra $L__BB0_3;
bra.uni $L__BB0_7;
$L__BB0_1:
mov.f32 %f21, 0f00000000;
$L__BB0_7:
mov.u32 %r93, %tid.z;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %tid.y;
mad.lo.s32 %r94, %r22, %r93, %r23;
mov.u32 %r24, %tid.x;
mad.lo.s32 %r25, %r94, %r4, %r24;
mul.wide.u32 %rd7, %r25, 4;
mov.u64 %rd8, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_485_cu_8ba90e9e_160115arrayE;
add.s64 %rd2, %rd8, %rd7;
st.shared.f32 [%rd2], %f21;
bar.sync 0;
clz.b32 %r95, %r4;
mov.u32 %r96, 31;
sub.s32 %r97, %r96, %r95;
mov.u32 %r98, 1;
shl.b32 %r119, %r98, %r97;
setp.ge.u32 %p5, %r24, %r119;
add.s32 %r99, %r119, %r24;
setp.ge.u32 %p6, %r99, %r4;
or.pred %p7, %p5, %p6;
@%p7 bra $L__BB0_9;
add.s32 %r100, %r25, %r119;
mul.wide.s32 %rd9, %r100, 4;
add.s64 %rd11, %rd8, %rd9;
ld.shared.f32 %f11, [%rd2];
ld.shared.f32 %f12, [%rd11];
add.f32 %f13, %f12, %f11;
st.shared.f32 [%rd2], %f13;
$L__BB0_9:
bar.sync 0;
setp.lt.s32 %p8, %r119, 4;
@%p8 bra $L__BB0_13;
$L__BB0_10:
shr.u32 %r28, %r119, 1;
setp.ge.u32 %p9, %r24, %r28;
@%p9 bra $L__BB0_12;
add.s32 %r101, %r28, %r25;
mul.wide.s32 %rd12, %r101, 4;
add.s64 %rd14, %rd8, %rd12;
ld.shared.f32 %f14, [%rd2];
ld.shared.f32 %f15, [%rd14];
add.f32 %f16, %f15, %f14;
st.shared.f32 [%rd2], %f16;
$L__BB0_12:
bar.sync 0;
setp.gt.u32 %p10, %r119, 7;
mov.u32 %r119, %r28;
@%p10 bra $L__BB0_10;
$L__BB0_13:
setp.ne.s32 %p11, %r24, 0;
mov.f32 %f23, 0f00000000;
@%p11 bra $L__BB0_16;
ld.shared.f32 %f18, [%rd2];
add.f32 %f23, %f18, 0f00000000;
setp.lt.u32 %p12, %r4, 2;
@%p12 bra $L__BB0_16;
add.s32 %r102, %r25, 1;
mul.wide.u32 %rd15, %r102, 4;
add.s64 %rd17, %rd8, %rd15;
ld.shared.f32 %f19, [%rd17];
add.f32 %f23, %f23, %f19;
$L__BB0_16:
bar.sync 0;
@%p11 bra $L__BB0_19;
mov.u32 %r103, %ctaid.x;
mad.lo.s32 %r29, %r22, %r103, %r23;
mul.lo.s32 %r104, %r48, %r49;
add.s32 %r105, %r50, 1;
shr.u32 %r106, %r105, 31;
add.s32 %r107, %r105, %r106;
shr.s32 %r108, %r107, 1;
add.s32 %r109, %r108, 3;
shr.s32 %r110, %r109, 31;
shr.u32 %r111, %r110, 30;
add.s32 %r112, %r109, %r111;
shr.u32 %r113, %r112, 2;
mul.lo.s32 %r114, %r104, %r113;
shl.b32 %r115, %r114, 3;
setp.ge.s32 %p14, %r29, %r115;
@%p14 bra $L__BB0_19;
cvta.to.global.u64 %rd18, %rd4;
mul.wide.s32 %rd19, %r29, 4;
add.s64 %rd20, %rd18, %rd19;
st.global.f32 [%rd20], %f23;
$L__BB0_19:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,186 +20,181 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_1[48]
)
{
.reg .pred %p<15>;
.reg .f32 %f<24>;
- .reg .b32 %r<125>;
+ .reg .b32 %r<120>;
.reg .b64 %rd<21>;
- ld.param.v2.u32 {%r50, %r51}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+8];
- ld.param.v2.u32 {%r52, %r53}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+16];
- ld.param.v2.u32 {%r54, %r55}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+24];
- ld.param.v2.u32 {%r56, %r57}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+32];
+ ld.param.v2.u32 {%r48, %r49}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+8];
+ ld.param.v2.u32 {%r50, %r51}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+16];
+ ld.param.v2.u32 {%r52, %r53}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+24];
+ ld.param.v2.u32 {%r54, %r55}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0+32];
ld.param.u64 %rd4, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_1];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi5ELi5EEE_param_0];
mov.u32 %r4, %ntid.x;
- add.s32 %r68, %r4, %r53;
- add.s32 %r69, %r68, -1;
- div.s32 %r6, %r69, %r4;
+ add.s32 %r66, %r4, %r51;
+ add.s32 %r67, %r66, -1;
+ div.s32 %r6, %r67, %r4;
setp.gt.s32 %p1, %r6, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
- mov.u32 %r71, %tid.y;
- mov.u32 %r72, %ctaid.x;
- mov.u32 %r73, %ntid.y;
- mad.lo.s32 %r7, %r73, %r72, %r71;
- mul.lo.s32 %r74, %r50, %r51;
- add.s32 %r75, %r52, 1;
- shr.u32 %r76, %r75, 31;
- add.s32 %r77, %r75, %r76;
- shr.s32 %r8, %r77, 1;
- add.s32 %r78, %r8, 3;
- shr.s32 %r79, %r78, 31;
- shr.u32 %r80, %r79, 30;
- add.s32 %r81, %r78, %r80;
- shr.s32 %r82, %r81, 2;
- mul.lo.s32 %r83, %r74, %r82;
- shl.b32 %r9, %r83, 3;
- shl.b32 %r10, %r82, 3;
- and.b32 %r11, %r81, -4;
- mov.u32 %r121, %tid.x;
- mul.lo.s32 %r122, %r57, %r121;
- mul.lo.s32 %r14, %r57, %r4;
+ mov.u32 %r69, %tid.y;
+ mov.u32 %r70, %ctaid.x;
+ mov.u32 %r71, %ntid.y;
+ mad.lo.s32 %r7, %r71, %r70, %r69;
+ mul.lo.s32 %r72, %r48, %r49;
+ add.s32 %r73, %r50, 1;
+ shr.u32 %r74, %r73, 31;
+ add.s32 %r75, %r73, %r74;
+ shr.s32 %r76, %r75, 1;
+ add.s32 %r77, %r76, 3;
+ shr.s32 %r78, %r77, 31;
+ shr.u32 %r79, %r78, 30;
+ add.s32 %r80, %r77, %r79;
+ shr.s32 %r81, %r80, 2;
+ mul.lo.s32 %r82, %r72, %r81;
+ shl.b32 %r8, %r82, 3;
+ shl.b32 %r9, %r81, 3;
+ mov.u32 %r116, %tid.x;
+ mul.lo.s32 %r117, %r55, %r116;
+ mul.lo.s32 %r12, %r55, %r4;
cvta.to.global.u64 %rd1, %rd3;
mov.f32 %f21, 0f00000000;
- mov.u32 %r123, 0;
+ mov.u32 %r118, 0;
$L__BB0_3:
.pragma "nounroll";
- setp.ge.s32 %p2, %r7, %r9;
+ setp.ge.s32 %p2, %r7, %r8;
@%p2 bra $L__BB0_6;
- setp.ge.s32 %p3, %r121, %r53;
+ setp.ge.s32 %p3, %r116, %r51;
@%p3 bra $L__BB0_6;
- div.s32 %r84, %r7, %r10;
- div.s32 %r85, %r84, %r51;
- mul.lo.s32 %r86, %r85, %r51;
- sub.s32 %r87, %r84, %r86;
- mul.lo.s32 %r88, %r84, %r10;
- sub.s32 %r89, %r7, %r88;
- div.s32 %r90, %r89, %r11;
- mul.lo.s32 %r91, %r90, %r11;
- sub.s32 %r92, %r89, %r91;
- mad.lo.s32 %r93, %r90, %r8, %r92;
- mul.lo.s32 %r94, %r54, %r85;
- mad.lo.s32 %r95, %r55, %r87, %r94;
- mad.lo.s32 %r96, %r93, %r56, %r95;
- add.s32 %r97, %r122, %r96;
- mul.wide.s32 %rd5, %r97, 4;
+ div.s32 %r83, %r7, %r9;
+ div.s32 %r84, %r83, %r49;
+ mul.lo.s32 %r85, %r84, %r49;
+ sub.s32 %r86, %r83, %r85;
+ mul.lo.s32 %r87, %r83, %r9;
+ sub.s32 %r88, %r7, %r87;
+ mul.lo.s32 %r89, %r54, %r88;
+ mad.lo.s32 %r90, %r53, %r86, %r89;
+ mad.lo.s32 %r91, %r52, %r84, %r90;
+ add.s32 %r92, %r117, %r91;
+ mul.wide.s32 %rd5, %r92, 4;
add.s64 %rd6, %rd1, %rd5;
ld.global.f32 %f10, [%rd6];
add.f32 %f21, %f21, %f10;
$L__BB0_6:
- add.s32 %r122, %r122, %r14;
- add.s32 %r121, %r121, %r4;
- add.s32 %r123, %r123, 1;
- setp.lt.s32 %p4, %r123, %r6;
+ add.s32 %r117, %r117, %r12;
+ add.s32 %r116, %r116, %r4;
+ add.s32 %r118, %r118, 1;
+ setp.lt.s32 %p4, %r118, %r6;
@%p4 bra $L__BB0_3;
bra.uni $L__BB0_7;
$L__BB0_1:
mov.f32 %f21, 0f00000000;
$L__BB0_7:
- mov.u32 %r98, %tid.z;
- mov.u32 %r24, %ntid.y;
- mov.u32 %r25, %tid.y;
- mad.lo.s32 %r99, %r24, %r98, %r25;
- mov.u32 %r26, %tid.x;
- mad.lo.s32 %r27, %r99, %r4, %r26;
- mul.wide.u32 %rd7, %r27, 4;
+ mov.u32 %r93, %tid.z;
+ mov.u32 %r22, %ntid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r94, %r22, %r93, %r23;
+ mov.u32 %r24, %tid.x;
+ mad.lo.s32 %r25, %r94, %r4, %r24;
+ mul.wide.u32 %rd7, %r25, 4;
mov.u64 %rd8, _ZN11kernelscope6kernelE;
add.s64 %rd2, %rd8, %rd7;
st.shared.f32 [%rd2], %f21;
bar.sync 0;
- clz.b32 %r100, %r4;
- mov.u32 %r101, 31;
- sub.s32 %r102, %r101, %r100;
- mov.u32 %r103, 1;
- shl.b32 %r124, %r103, %r102;
- setp.ge.u32 %p5, %r26, %r124;
- add.s32 %r104, %r124, %r26;
- setp.ge.u32 %p6, %r104, %r4;
+ clz.b32 %r95, %r4;
+ mov.u32 %r96, 31;
+ sub.s32 %r97, %r96, %r95;
+ mov.u32 %r98, 1;
+ shl.b32 %r119, %r98, %r97;
+ setp.ge.u32 %p5, %r24, %r119;
+ add.s32 %r99, %r119, %r24;
+ setp.ge.u32 %p6, %r99, %r4;
or.pred %p7, %p5, %p6;
@%p7 bra $L__BB0_9;
- add.s32 %r105, %r27, %r124;
- mul.wide.s32 %rd9, %r105, 4;
+ add.s32 %r100, %r25, %r119;
+ mul.wide.s32 %rd9, %r100, 4;
add.s64 %rd11, %rd8, %rd9;
ld.shared.f32 %f11, [%rd2];
ld.shared.f32 %f12, [%rd11];
add.f32 %f13, %f12, %f11;
st.shared.f32 [%rd2], %f13;
$L__BB0_9:
bar.sync 0;
- setp.lt.s32 %p8, %r124, 4;
+ setp.lt.s32 %p8, %r119, 4;
@%p8 bra $L__BB0_13;
$L__BB0_10:
- shr.u32 %r30, %r124, 1;
- setp.ge.u32 %p9, %r26, %r30;
+ shr.u32 %r28, %r119, 1;
+ setp.ge.u32 %p9, %r24, %r28;
@%p9 bra $L__BB0_12;
- add.s32 %r106, %r30, %r27;
- mul.wide.s32 %rd12, %r106, 4;
+ add.s32 %r101, %r28, %r25;
+ mul.wide.s32 %rd12, %r101, 4;
add.s64 %rd14, %rd8, %rd12;
ld.shared.f32 %f14, [%rd2];
ld.shared.f32 %f15, [%rd14];
add.f32 %f16, %f15, %f14;
st.shared.f32 [%rd2], %f16;
$L__BB0_12:
bar.sync 0;
- setp.gt.u32 %p10, %r124, 7;
- mov.u32 %r124, %r30;
+ setp.gt.u32 %p10, %r119, 7;
+ mov.u32 %r119, %r28;
@%p10 bra $L__BB0_10;
$L__BB0_13:
- setp.ne.s32 %p11, %r26, 0;
+ setp.ne.s32 %p11, %r24, 0;
mov.f32 %f23, 0f00000000;
@%p11 bra $L__BB0_16;
ld.shared.f32 %f18, [%rd2];
add.f32 %f23, %f18, 0f00000000;
setp.lt.u32 %p12, %r4, 2;
@%p12 bra $L__BB0_16;
- add.s32 %r107, %r27, 1;
- mul.wide.u32 %rd15, %r107, 4;
+ add.s32 %r102, %r25, 1;
+ mul.wide.u32 %rd15, %r102, 4;
add.s64 %rd17, %rd8, %rd15;
ld.shared.f32 %f19, [%rd17];
add.f32 %f23, %f23, %f19;
$L__BB0_16:
bar.sync 0;
@%p11 bra $L__BB0_19;
- mov.u32 %r108, %ctaid.x;
- mad.lo.s32 %r31, %r24, %r108, %r25;
- mul.lo.s32 %r109, %r50, %r51;
- add.s32 %r110, %r52, 1;
- shr.u32 %r111, %r110, 31;
- add.s32 %r112, %r110, %r111;
- shr.s32 %r113, %r112, 1;
- add.s32 %r114, %r113, 3;
- shr.s32 %r115, %r114, 31;
- shr.u32 %r116, %r115, 30;
- add.s32 %r117, %r114, %r116;
- shr.u32 %r118, %r117, 2;
- mul.lo.s32 %r119, %r109, %r118;
- shl.b32 %r120, %r119, 3;
- setp.ge.s32 %p14, %r31, %r120;
+ mov.u32 %r103, %ctaid.x;
+ mad.lo.s32 %r29, %r22, %r103, %r23;
+ mul.lo.s32 %r104, %r48, %r49;
+ add.s32 %r105, %r50, 1;
+ shr.u32 %r106, %r105, 31;
+ add.s32 %r107, %r105, %r106;
+ shr.s32 %r108, %r107, 1;
+ add.s32 %r109, %r108, 3;
+ shr.s32 %r110, %r109, 31;
+ shr.u32 %r111, %r110, 30;
+ add.s32 %r112, %r109, %r111;
+ shr.u32 %r113, %r112, 2;
+ mul.lo.s32 %r114, %r104, %r113;
+ shl.b32 %r115, %r114, 3;
+ setp.ge.s32 %p14, %r29, %r115;
@%p14 bra $L__BB0_19;
cvta.to.global.u64 %rd18, %rd4;
- mul.wide.s32 %rd19, %r31, 4;
+ mul.wide.s32 %rd19, %r29, 4;
add.s64 %rd20, %rd18, %rd19;
st.global.f32 [%rd20], %f23;
$L__BB0_19:
ret;
20: ReshapeReduction.FusionReshapeReduction/84
Kernel 2
CUDA
PTX
53997da5d
Diff
03a1b695e
-4
+4 index type: int
registers: 16
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 6, 6> T1, Tensor<float, 6, 6> T4, Tensor<float, 6, 6> T5) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 24)) {
Array<float, 1, 1> T6;
T6[0] = 0;
T6[0]
= T1[((((T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 4)) + (T1.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) % 4) / 2))) + (T1.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) % 4) % 2))) + ((32 * T1.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T7;
T7[0] = 0;
T7[0]
= T4[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T8;
T8[0]
= T7[0]
+ T6[0];
T5[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T8[0];
}
}
__global__ void nvfuser_N(Tensor<float, 6, 6> T1, Tensor<float, 6, 6> T4, Tensor<float, 6, 6> T5) {
if ((((nvfuser_index_t)threadIdx.x) < 24)) {
Array<float, 1, 1> T6;
T6[0] = 0;
T6[0]
= T1[(((T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 4)) + (T1.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) % 4) / 2))) + (T1.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) % 4) % 2)))];
Array<float, 1, 1> T7;
T7[0] = 0;
T7[0]
= T4[((nvfuser_index_t)threadIdx.x)];
Array<float, 1, 1> T8;
T8[0]
= T7[0]
+ T6[0];
T5[((nvfuser_index_t)threadIdx.x)]
= T8[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,18 +1,18 @@
__global__ void nvfuser_N(Tensor<float, 6, 6> T1, Tensor<float, 6, 6> T4, Tensor<float, 6, 6> T5) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 24)) {
+ if ((((nvfuser_index_t)threadIdx.x) < 24)) {
Array<float, 1, 1> T6;
T6[0] = 0;
T6[0]
- = T1[((((T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 4)) + (T1.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) % 4) / 2))) + (T1.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) % 4) % 2))) + ((32 * T1.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x)))];
+ = T1[(((T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 4)) + (T1.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.x) % 4) / 2))) + (T1.alloc_stride[4LL] * ((((nvfuser_index_t)threadIdx.x) % 4) % 2)))];
Array<float, 1, 1> T7;
T7[0] = 0;
T7[0]
- = T4[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))];
+ = T4[((nvfuser_index_t)threadIdx.x)];
Array<float, 1, 1> T8;
T8[0]
= T7[0]
+ T6[0];
- T5[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
+ T5[((nvfuser_index_t)threadIdx.x)]
= T8[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0[56],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_1[56],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_2[56]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
.reg .b32 %r<93>;
.reg .b64 %rd<12>;
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0+32];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0+40];
ld.param.v2.u32 {%r50, %r51}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0+48];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_2912043c_1911011nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0];
mov.u32 %r1, %ctaid.x;
shl.b32 %r76, %r1, 7;
mov.u32 %r2, %tid.x;
add.s32 %r3, %r76, %r2;
setp.gt.s32 %p1, %r3, 23;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
shr.s32 %r77, %r2, 31;
shr.u32 %r78, %r77, 30;
add.s32 %r79, %r2, %r78;
and.b32 %r80, %r79, -4;
sub.s32 %r81, %r2, %r80;
shr.u32 %r82, %r81, 31;
add.s32 %r83, %r81, %r82;
shr.s32 %r84, %r83, 1;
mul.lo.s32 %r85, %r49, %r84;
and.b32 %r86, %r83, -2;
sub.s32 %r87, %r81, %r86;
shl.b32 %r88, %r1, 5;
shr.s32 %r89, %r79, 2;
add.s32 %r90, %r88, %r89;
mad.lo.s32 %r91, %r50, %r87, %r85;
mad.lo.s32 %r92, %r90, %r47, %r91;
mul.wide.s32 %rd7, %r92, 4;
add.s64 %rd8, %rd4, %rd7;
mul.wide.s32 %rd9, %r3, 4;
add.s64 %rd10, %rd6, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
add.s64 %rd11, %rd5, %rd9;
st.global.f32 [%rd11], %f3;
$L__BB0_2:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0[56],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_1[56],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_2[56]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
.reg .b32 %r<88>;
.reg .b64 %rd<12>;
ld.param.v2.u32 {%r44, %r45}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0+32];
ld.param.v2.u32 {%r46, %r47}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0+40];
ld.param.v2.u32 {%r48, %r49}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0+48];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_517_cu_37f538a6_1601111nvfuser_517ENS_6TensorIfLi6ELi6EEES1_S1__param_0];
mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 23;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
shr.s32 %r74, %r1, 31;
shr.u32 %r75, %r74, 30;
add.s32 %r76, %r1, %r75;
shr.s32 %r77, %r76, 2;
and.b32 %r78, %r76, -4;
sub.s32 %r79, %r1, %r78;
shr.u32 %r80, %r79, 31;
add.s32 %r81, %r79, %r80;
shr.s32 %r82, %r81, 1;
mul.lo.s32 %r83, %r47, %r82;
mad.lo.s32 %r84, %r45, %r77, %r83;
and.b32 %r85, %r81, -2;
sub.s32 %r86, %r79, %r85;
mad.lo.s32 %r87, %r48, %r86, %r84;
mul.wide.s32 %rd7, %r87, 4;
add.s64 %rd8, %rd4, %rd7;
mul.wide.s32 %rd9, %r1, 4;
add.s64 %rd10, %rd6, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
add.s64 %rd11, %rd5, %rd9;
st.global.f32 [%rd11], %f3;
$L__BB0_2:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -20,49 +20,44 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_2[56]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
- .reg .b32 %r<93>;
+ .reg .b32 %r<88>;
.reg .b64 %rd<12>;
- ld.param.v2.u32 {%r46, %r47}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_0+32];
- ld.param.v2.u32 {%r48, %r49}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_0+40];
- ld.param.v2.u32 {%r50, %r51}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_0+48];
+ ld.param.v2.u32 {%r44, %r45}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_0+32];
+ ld.param.v2.u32 {%r46, %r47}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_0+40];
+ ld.param.v2.u32 {%r48, %r49}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_0+48];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_2];
ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_1];
ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEES1_S1__param_0];
- mov.u32 %r1, %ctaid.x;
- shl.b32 %r76, %r1, 7;
- mov.u32 %r2, %tid.x;
- add.s32 %r3, %r76, %r2;
- setp.gt.s32 %p1, %r3, 23;
+ mov.u32 %r1, %tid.x;
+ setp.gt.s32 %p1, %r1, 23;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
- shr.s32 %r77, %r2, 31;
- shr.u32 %r78, %r77, 30;
- add.s32 %r79, %r2, %r78;
- and.b32 %r80, %r79, -4;
- sub.s32 %r81, %r2, %r80;
- shr.u32 %r82, %r81, 31;
- add.s32 %r83, %r81, %r82;
- shr.s32 %r84, %r83, 1;
- mul.lo.s32 %r85, %r49, %r84;
- and.b32 %r86, %r83, -2;
- sub.s32 %r87, %r81, %r86;
- shl.b32 %r88, %r1, 5;
- shr.s32 %r89, %r79, 2;
- add.s32 %r90, %r88, %r89;
- mad.lo.s32 %r91, %r50, %r87, %r85;
- mad.lo.s32 %r92, %r90, %r47, %r91;
- mul.wide.s32 %rd7, %r92, 4;
+ shr.s32 %r74, %r1, 31;
+ shr.u32 %r75, %r74, 30;
+ add.s32 %r76, %r1, %r75;
+ shr.s32 %r77, %r76, 2;
+ and.b32 %r78, %r76, -4;
+ sub.s32 %r79, %r1, %r78;
+ shr.u32 %r80, %r79, 31;
+ add.s32 %r81, %r79, %r80;
+ shr.s32 %r82, %r81, 1;
+ mul.lo.s32 %r83, %r47, %r82;
+ mad.lo.s32 %r84, %r45, %r77, %r83;
+ and.b32 %r85, %r81, -2;
+ sub.s32 %r86, %r79, %r85;
+ mad.lo.s32 %r87, %r48, %r86, %r84;
+ mul.wide.s32 %rd7, %r87, 4;
add.s64 %rd8, %rd4, %rd7;
- mul.wide.s32 %rd9, %r3, 4;
+ mul.wide.s32 %rd9, %r1, 4;
add.s64 %rd10, %rd6, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
add.s64 %rd11, %rd5, %rd9;
21: ReshapeReduction.FusionReshapeReduction/88
Kernel 1
CUDA
PTX
53997da5d
Diff
03a1b695e
-21
+18 index type: int
registers: 0
gmem: 0
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 6, 6> T0, Tensor<float, 3, 3> T4) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(8, ((nvfuser_index_t)blockDim.x))); ++i0) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 64) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i0)) < 8))) {
T7[0]
= T7[0]
+ T0[((((T0.alloc_stride[5LL] * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 8))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 8))) + ((((nvfuser_index_t)blockDim.x) * T0.alloc_stride[5LL]) * i0))];
}
}
Array<float, 1, 1> T2;
T2[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T2[0], T7[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T3;
T3[0]
= T2[0];
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 64))) {
T4[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T3[0];
}
}
__global__ void nvfuser_N(Tensor<float, 4, 4> T1, Tensor<float, 3, 3> T4, Tensor<float, 4, 4> T6) {
if ((((nvfuser_index_t)threadIdx.x) < 64)) {
Array<float, 1, 1> T7;
T7[0] = 0;
T7[0]
= T1[(((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 8)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 8) / 4))) + (T1.alloc_stride[2LL] * ((((nvfuser_index_t)threadIdx.x) % 8) % 4)))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
= T4[((nvfuser_index_t)threadIdx.x)];
Array<float, 1, 1> T5;
T5[0]
= T8[0];
Array<float, 1, 1> T9;
T9[0]
= T5[0]
+ T7[0];
T6[((nvfuser_index_t)threadIdx.x)]
= T9[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,24 +1,21 @@
-__global__ void nvfuser_N(Tensor<float, 6, 6> T0, Tensor<float, 3, 3> T4) {
- alignas(16) extern __shared__ char array[];
- void* shared_mem = array;
- Array<float, 1, 1> T7;
- T7[0] = 0.000000000e+00f;
- #pragma unroll 1
- for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(8, ((nvfuser_index_t)blockDim.x))); ++i0) {
- if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 64) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i0)) < 8))) {
- T7[0]
- = T7[0]
- + T0[((((T0.alloc_stride[5LL] * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 8))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 8))) + ((((nvfuser_index_t)blockDim.x) * T0.alloc_stride[5LL]) * i0))];
- }
- }
- Array<float, 1, 1> T2;
- T2[0] = 0.000000000e+00f;
- blockReduce<true, false, false, true>(T2[0], T7[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
- Array<float, 1, 1> T3;
- T3[0]
- = T2[0];
- if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 64))) {
- T4[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
- = T3[0];
+__global__ void nvfuser_N(Tensor<float, 4, 4> T1, Tensor<float, 3, 3> T4, Tensor<float, 4, 4> T6) {
+ if ((((nvfuser_index_t)threadIdx.x) < 64)) {
+ Array<float, 1, 1> T7;
+ T7[0] = 0;
+ T7[0]
+ = T1[(((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 8)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 8) / 4))) + (T1.alloc_stride[2LL] * ((((nvfuser_index_t)threadIdx.x) % 8) % 4)))];
+ Array<float, 1, 1> T8;
+ T8[0] = 0;
+ T8[0]
+ = T4[((nvfuser_index_t)threadIdx.x)];
+ Array<float, 1, 1> T5;
+ T5[0]
+ = T8[0];
+ Array<float, 1, 1> T9;
+ T9[0]
+ = T5[0]
+ + T7[0];
+ T6[((nvfuser_index_t)threadIdx.x)]
+ = T9[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_191103std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_191105arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_1911011nvfuser_523ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_1911011nvfuser_523ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0[56],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_1911011nvfuser_523ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_1[32]
)
{
.reg .pred %p<16>;
.reg .f32 %f<24>;
.reg .b32 %r<85>;
.reg .b64 %rd<21>;
ld.param.v2.u32 {%r45, %r46}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_1911011nvfuser_523ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+32];
ld.param.v2.u32 {%r47, %r48}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_1911011nvfuser_523ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+40];
ld.param.v2.u32 {%r49, %r50}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_1911011nvfuser_523ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+48];
ld.param.u64 %rd4, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_1911011nvfuser_523ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_1911011nvfuser_523ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0];
mov.u32 %r1, %ntid.x;
add.s32 %r57, %r1, 7;
div.s32 %r2, %r57, %r1;
setp.gt.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
mov.u32 %r59, %ctaid.x;
mov.u32 %r60, %ntid.y;
mov.u32 %r61, %tid.y;
mad.lo.s32 %r3, %r60, %r59, %r61;
shr.s32 %r62, %r3, 31;
shr.u32 %r63, %r62, 29;
add.s32 %r64, %r3, %r63;
shr.s32 %r65, %r64, 3;
and.b32 %r66, %r64, -8;
sub.s32 %r67, %r3, %r66;
mov.u32 %r81, %tid.x;
mul.lo.s32 %r68, %r48, %r67;
mad.lo.s32 %r69, %r50, %r81, %r68;
mad.lo.s32 %r82, %r45, %r65, %r69;
mul.lo.s32 %r6, %r50, %r1;
cvta.to.global.u64 %rd1, %rd3;
mov.f32 %f21, 0f00000000;
mov.u32 %r83, 0;
$L__BB0_3:
.pragma "nounroll";
setp.gt.s32 %p2, %r81, 7;
setp.gt.s32 %p3, %r3, 63;
or.pred %p4, %p3, %p2;
@%p4 bra $L__BB0_5;
mul.wide.s32 %rd5, %r82, 4;
add.s64 %rd6, %rd1, %rd5;
ld.global.f32 %f10, [%rd6];
add.f32 %f21, %f21, %f10;
$L__BB0_5:
add.s32 %r82, %r82, %r6;
add.s32 %r81, %r81, %r1;
add.s32 %r83, %r83, 1;
setp.lt.s32 %p5, %r83, %r2;
@%p5 bra $L__BB0_3;
bra.uni $L__BB0_6;
$L__BB0_1:
mov.f32 %f21, 0f00000000;
$L__BB0_6:
mov.u32 %r70, %tid.z;
mov.u32 %r13, %ntid.y;
mov.u32 %r14, %tid.y;
mad.lo.s32 %r71, %r13, %r70, %r14;
mov.u32 %r15, %tid.x;
mad.lo.s32 %r16, %r71, %r1, %r15;
mul.wide.u32 %rd7, %r16, 4;
mov.u64 %rd8, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_3378c5c7_191105arrayE;
add.s64 %rd2, %rd8, %rd7;
st.shared.f32 [%rd2], %f21;
bar.sync 0;
clz.b32 %r72, %r1;
mov.u32 %r73, 31;
sub.s32 %r74, %r73, %r72;
mov.u32 %r75, 1;
shl.b32 %r84, %r75, %r74;
setp.ge.u32 %p6, %r15, %r84;
add.s32 %r76, %r84, %r15;
setp.ge.u32 %p7, %r76, %r1;
or.pred %p8, %p6, %p7;
@%p8 bra $L__BB0_8;
add.s32 %r77, %r16, %r84;
mul.wide.s32 %rd9, %r77, 4;
add.s64 %rd11, %rd8, %rd9;
ld.shared.f32 %f11, [%rd2];
ld.shared.f32 %f12, [%rd11];
add.f32 %f13, %f12, %f11;
st.shared.f32 [%rd2], %f13;
$L__BB0_8:
bar.sync 0;
setp.lt.s32 %p9, %r84, 4;
@%p9 bra $L__BB0_12;
$L__BB0_9:
shr.u32 %r19, %r84, 1;
setp.ge.u32 %p10, %r15, %r19;
@%p10 bra $L__BB0_11;
add.s32 %r78, %r19, %r16;
mul.wide.s32 %rd12, %r78, 4;
add.s64 %rd14, %rd8, %rd12;
ld.shared.f32 %f14, [%rd2];
ld.shared.f32 %f15, [%rd14];
add.f32 %f16, %f15, %f14;
st.shared.f32 [%rd2], %f16;
$L__BB0_11:
bar.sync 0;
setp.gt.u32 %p11, %r84, 7;
mov.u32 %r84, %r19;
@%p11 bra $L__BB0_9;
$L__BB0_12:
setp.ne.s32 %p12, %r15, 0;
mov.f32 %f23, 0f00000000;
@%p12 bra $L__BB0_15;
ld.shared.f32 %f18, [%rd2];
add.f32 %f23, %f18, 0f00000000;
setp.lt.u32 %p13, %r1, 2;
@%p13 bra $L__BB0_15;
add.s32 %r79, %r16, 1;
mul.wide.u32 %rd15, %r79, 4;
add.s64 %rd17, %rd8, %rd15;
ld.shared.f32 %f19, [%rd17];
add.f32 %f23, %f23, %f19;
$L__BB0_15:
bar.sync 0;
@%p12 bra $L__BB0_18;
mov.u32 %r80, %ctaid.x;
mad.lo.s32 %r20, %r13, %r80, %r14;
setp.gt.s32 %p15, %r20, 63;
@%p15 bra $L__BB0_18;
cvta.to.global.u64 %rd18, %rd4;
mul.wide.s32 %rd19, %r20, 4;
add.s64 %rd20, %rd18, %rd19;
st.global.f32 [%rd20], %f23;
$L__BB0_18:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_160113std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_1601111nvfuser_523ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_1601111nvfuser_523ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_1601111nvfuser_523ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_1601111nvfuser_523ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_2[40]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
.reg .b32 %r<61>;
.reg .b64 %rd<12>;
ld.param.v2.u32 {%r28, %r29}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_1601111nvfuser_523ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0+24];
ld.param.v2.u32 {%r30, %r31}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_1601111nvfuser_523ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_1601111nvfuser_523ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_1601111nvfuser_523ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_523_cu_37f538a6_1601111nvfuser_523ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0];
mov.u32 %r1, %tid.x;
setp.gt.s32 %p1, %r1, 63;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
shr.s32 %r46, %r1, 31;
shr.u32 %r47, %r46, 29;
add.s32 %r48, %r1, %r47;
shr.s32 %r49, %r48, 3;
and.b32 %r50, %r48, -8;
sub.s32 %r51, %r1, %r50;
shr.s32 %r52, %r51, 31;
shr.u32 %r53, %r52, 30;
add.s32 %r54, %r51, %r53;
shr.s32 %r55, %r54, 2;
mul.lo.s32 %r56, %r29, %r55;
mad.lo.s32 %r57, %r28, %r49, %r56;
and.b32 %r58, %r54, -4;
sub.s32 %r59, %r51, %r58;
mad.lo.s32 %r60, %r30, %r59, %r57;
mul.wide.s32 %rd7, %r60, 4;
add.s64 %rd8, %rd4, %rd7;
mul.wide.s32 %rd9, %r1, 4;
add.s64 %rd10, %rd6, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
add.s64 %rd11, %rd5, %rd9;
st.global.f32 [%rd11], %f3;
$L__BB0_2:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -11,164 +11,61 @@
.address_size 64
.global .align 1 .u8 _ZN11kernelscope6kernel17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN11kernelscope6kernel17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN11kernelscope6kernel14__numeric_typeIvE5valueE = 1;
-.extern .shared .align 16 .b8 _ZN11kernelscope6kernelE[];
-.entry _ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE(
- .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0[56],
- .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_1[32]
+.entry _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1_(
+ .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0[40],
+ .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_1[32],
+ .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_2[40]
)
{
- .reg .pred %p<16>;
- .reg .f32 %f<24>;
- .reg .b32 %r<85>;
- .reg .b64 %rd<21>;
+ .reg .pred %p<2>;
+ .reg .f32 %f<4>;
+ .reg .b32 %r<61>;
+ .reg .b64 %rd<12>;
- ld.param.v2.u32 {%r45, %r46}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+32];
- ld.param.v2.u32 {%r47, %r48}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+40];
- ld.param.v2.u32 {%r49, %r50}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+48];
- ld.param.u64 %rd4, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_1];
- ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0];
- mov.u32 %r1, %ntid.x;
- add.s32 %r57, %r1, 7;
- div.s32 %r2, %r57, %r1;
- setp.gt.s32 %p1, %r2, 0;
+ ld.param.v2.u32 {%r28, %r29}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0+24];
+ ld.param.v2.u32 {%r30, %r31}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0+32];
+ ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_2];
+ ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_1];
+ ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0];
+ mov.u32 %r1, %tid.x;
+ setp.gt.s32 %p1, %r1, 63;
@%p1 bra $L__BB0_2;
- bra.uni $L__BB0_1;
+
+ cvta.to.global.u64 %rd4, %rd1;
+ cvta.to.global.u64 %rd5, %rd3;
+ cvta.to.global.u64 %rd6, %rd2;
+ shr.s32 %r46, %r1, 31;
+ shr.u32 %r47, %r46, 29;
+ add.s32 %r48, %r1, %r47;
+ shr.s32 %r49, %r48, 3;
+ and.b32 %r50, %r48, -8;
+ sub.s32 %r51, %r1, %r50;
+ shr.s32 %r52, %r51, 31;
+ shr.u32 %r53, %r52, 30;
+ add.s32 %r54, %r51, %r53;
+ shr.s32 %r55, %r54, 2;
+ mul.lo.s32 %r56, %r29, %r55;
+ mad.lo.s32 %r57, %r28, %r49, %r56;
+ and.b32 %r58, %r54, -4;
+ sub.s32 %r59, %r51, %r58;
+ mad.lo.s32 %r60, %r30, %r59, %r57;
+ mul.wide.s32 %rd7, %r60, 4;
+ add.s64 %rd8, %rd4, %rd7;
+ mul.wide.s32 %rd9, %r1, 4;
+ add.s64 %rd10, %rd6, %rd9;
+ ld.global.f32 %f1, [%rd10];
+ ld.global.f32 %f2, [%rd8];
+ add.f32 %f3, %f2, %f1;
+ add.s64 %rd11, %rd5, %rd9;
+ st.global.f32 [%rd11], %f3;
$L__BB0_2:
- mov.u32 %r59, %ctaid.x;
- mov.u32 %r60, %ntid.y;
- mov.u32 %r61, %tid.y;
- mad.lo.s32 %r3, %r60, %r59, %r61;
- shr.s32 %r62, %r3, 31;
- shr.u32 %r63, %r62, 29;
- add.s32 %r64, %r3, %r63;
- shr.s32 %r65, %r64, 3;
- and.b32 %r66, %r64, -8;
- sub.s32 %r67, %r3, %r66;
- mov.u32 %r81, %tid.x;
- mul.lo.s32 %r68, %r48, %r67;
- mad.lo.s32 %r69, %r50, %r81, %r68;
- mad.lo.s32 %r82, %r45, %r65, %r69;
- mul.lo.s32 %r6, %r50, %r1;
- cvta.to.global.u64 %rd1, %rd3;
- mov.f32 %f21, 0f00000000;
- mov.u32 %r83, 0;
-
-$L__BB0_3:
- .pragma "nounroll";
- setp.gt.s32 %p2, %r81, 7;
- setp.gt.s32 %p3, %r3, 63;
- or.pred %p4, %p3, %p2;
- @%p4 bra $L__BB0_5;
-
- mul.wide.s32 %rd5, %r82, 4;
- add.s64 %rd6, %rd1, %rd5;
- ld.global.f32 %f10, [%rd6];
- add.f32 %f21, %f21, %f10;
-
-$L__BB0_5:
- add.s32 %r82, %r82, %r6;
- add.s32 %r81, %r81, %r1;
- add.s32 %r83, %r83, 1;
- setp.lt.s32 %p5, %r83, %r2;
- @%p5 bra $L__BB0_3;
- bra.uni $L__BB0_6;
-
-$L__BB0_1:
- mov.f32 %f21, 0f00000000;
-
-$L__BB0_6:
- mov.u32 %r70, %tid.z;
- mov.u32 %r13, %ntid.y;
- mov.u32 %r14, %tid.y;
- mad.lo.s32 %r71, %r13, %r70, %r14;
- mov.u32 %r15, %tid.x;
- mad.lo.s32 %r16, %r71, %r1, %r15;
- mul.wide.u32 %rd7, %r16, 4;
- mov.u64 %rd8, _ZN11kernelscope6kernelE;
- add.s64 %rd2, %rd8, %rd7;
- st.shared.f32 [%rd2], %f21;
- bar.sync 0;
- clz.b32 %r72, %r1;
- mov.u32 %r73, 31;
- sub.s32 %r74, %r73, %r72;
- mov.u32 %r75, 1;
- shl.b32 %r84, %r75, %r74;
- setp.ge.u32 %p6, %r15, %r84;
- add.s32 %r76, %r84, %r15;
- setp.ge.u32 %p7, %r76, %r1;
- or.pred %p8, %p6, %p7;
- @%p8 bra $L__BB0_8;
-
- add.s32 %r77, %r16, %r84;
- mul.wide.s32 %rd9, %r77, 4;
- add.s64 %rd11, %rd8, %rd9;
- ld.shared.f32 %f11, [%rd2];
- ld.shared.f32 %f12, [%rd11];
- add.f32 %f13, %f12, %f11;
- st.shared.f32 [%rd2], %f13;
-
-$L__BB0_8:
- bar.sync 0;
- setp.lt.s32 %p9, %r84, 4;
- @%p9 bra $L__BB0_12;
-
-$L__BB0_9:
- shr.u32 %r19, %r84, 1;
- setp.ge.u32 %p10, %r15, %r19;
- @%p10 bra $L__BB0_11;
-
- add.s32 %r78, %r19, %r16;
- mul.wide.s32 %rd12, %r78, 4;
- add.s64 %rd14, %rd8, %rd12;
- ld.shared.f32 %f14, [%rd2];
- ld.shared.f32 %f15, [%rd14];
- add.f32 %f16, %f15, %f14;
- st.shared.f32 [%rd2], %f16;
-
-$L__BB0_11:
- bar.sync 0;
- setp.gt.u32 %p11, %r84, 7;
- mov.u32 %r84, %r19;
- @%p11 bra $L__BB0_9;
-
-$L__BB0_12:
- setp.ne.s32 %p12, %r15, 0;
- mov.f32 %f23, 0f00000000;
- @%p12 bra $L__BB0_15;
-
- ld.shared.f32 %f18, [%rd2];
- add.f32 %f23, %f18, 0f00000000;
- setp.lt.u32 %p13, %r1, 2;
- @%p13 bra $L__BB0_15;
-
- add.s32 %r79, %r16, 1;
- mul.wide.u32 %rd15, %r79, 4;
- add.s64 %rd17, %rd8, %rd15;
- ld.shared.f32 %f19, [%rd17];
- add.f32 %f23, %f23, %f19;
-
-$L__BB0_15:
- bar.sync 0;
- @%p12 bra $L__BB0_18;
-
- mov.u32 %r80, %ctaid.x;
- mad.lo.s32 %r20, %r13, %r80, %r14;
- setp.gt.s32 %p15, %r20, 63;
- @%p15 bra $L__BB0_18;
-
- cvta.to.global.u64 %rd18, %rd4;
- mul.wide.s32 %rd19, %r20, 4;
- add.s64 %rd20, %rd18, %rd19;
- st.global.f32 [%rd20], %f23;
-
-$L__BB0_18:
ret;
}
Kernel 2
CUDA
PTX
53997da5d
Diff
03a1b695e
-18
+21 index type: int
registers: 15→ 14
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 4, 4> T1, Tensor<float, 3, 3> T4, Tensor<float, 4, 4> T6) {
if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 64)) {
Array<float, 1, 1> T7;
T7[0] = 0;
T7[0]
= T1[((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 8)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 8) / 4))) + (T1.alloc_stride[2LL] * ((((nvfuser_index_t)threadIdx.x) % 8) % 4))) + ((16 * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T8;
T8[0] = 0;
T8[0]
= T4[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))];
Array<float, 1, 1> T5;
T5[0]
= T8[0];
Array<float, 1, 1> T9;
T9[0]
= T5[0]
+ T7[0];
T6[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
= T9[0];
}
}
__global__ void nvfuser_N(Tensor<float, 6, 6> T0, Tensor<float, 3, 3> T4) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
Array<float, 1, 1> T7;
T7[0] = 0.000000000e+00f;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(8, ((nvfuser_index_t)blockDim.x))); ++i0) {
if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 64) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i0)) < 8))) {
T7[0]
= T7[0]
+ T0[((((T0.alloc_stride[5LL] * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 8))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 8))) + ((((nvfuser_index_t)blockDim.x) * T0.alloc_stride[5LL]) * i0))];
}
}
Array<float, 1, 1> T2;
T2[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T2[0], T7[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T3;
T3[0]
= T2[0];
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 64))) {
T4[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
= T3[0];
}
}
--- 53997da5d
+++ 03a1b695e
@@ -1,21 +1,24 @@
-__global__ void nvfuser_N(Tensor<float, 4, 4> T1, Tensor<float, 3, 3> T4, Tensor<float, 4, 4> T6) {
- if (((((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x))) < 64)) {
- Array<float, 1, 1> T7;
- T7[0] = 0;
- T7[0]
- = T1[((((T1.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) / 8)) + (T1.alloc_stride[1LL] * ((((nvfuser_index_t)threadIdx.x) % 8) / 4))) + (T1.alloc_stride[2LL] * ((((nvfuser_index_t)threadIdx.x) % 8) % 4))) + ((16 * T1.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x)))];
- Array<float, 1, 1> T8;
- T8[0] = 0;
- T8[0]
- = T4[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))];
- Array<float, 1, 1> T5;
- T5[0]
- = T8[0];
- Array<float, 1, 1> T9;
- T9[0]
- = T5[0]
- + T7[0];
- T6[(((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)))]
- = T9[0];
+__global__ void nvfuser_N(Tensor<float, 6, 6> T0, Tensor<float, 3, 3> T4) {
+ alignas(16) extern __shared__ char array[];
+ void* shared_mem = array;
+ Array<float, 1, 1> T7;
+ T7[0] = 0.000000000e+00f;
+ #pragma unroll 1
+ for(nvfuser_index_t i0 = 0; i0 < (ceilDiv(8, ((nvfuser_index_t)blockDim.x))); ++i0) {
+ if ((((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 64) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i0)) < 8))) {
+ T7[0]
+ = T7[0]
+ + T0[((((T0.alloc_stride[5LL] * ((nvfuser_index_t)threadIdx.x)) + (T0.alloc_stride[0LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) / 8))) + (T0.alloc_stride[3LL] * ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) % 8))) + ((((nvfuser_index_t)blockDim.x) * T0.alloc_stride[5LL]) * i0))];
+ }
+ }
+ Array<float, 1, 1> T2;
+ T2[0] = 0.000000000e+00f;
+ blockReduce<true, false, false, true>(T2[0], T7[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
+ Array<float, 1, 1> T3;
+ T3[0]
+ = T2[0];
+ if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x))) < 64))) {
+ T4[(((nvfuser_index_t)threadIdx.y) + (((nvfuser_index_t)blockDim.y) * ((nvfuser_index_t)blockIdx.x)))]
+ = T3[0];
}
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_191103std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_191103std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_191103std14__numeric_typeIvE5valueE = 1;
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_1911011nvfuser_524ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1_(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_1911011nvfuser_524ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0[40],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_1911011nvfuser_524ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_1[32],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_1911011nvfuser_524ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_2[40]
)
{
.reg .pred %p<2>;
.reg .f32 %f<4>;
.reg .b32 %r<66>;
.reg .b64 %rd<12>;
ld.param.v2.u32 {%r30, %r31}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_1911011nvfuser_524ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0+24];
ld.param.v2.u32 {%r32, %r33}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_1911011nvfuser_524ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0+32];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_1911011nvfuser_524ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_2];
ld.param.u64 %rd2, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_1911011nvfuser_524ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_1];
ld.param.u64 %rd1, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_3378c5c7_1911011nvfuser_524ENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0];
mov.u32 %r1, %ctaid.x;
shl.b32 %r48, %r1, 7;
mov.u32 %r2, %tid.x;
add.s32 %r3, %r48, %r2;
setp.gt.s32 %p1, %r3, 63;
@%p1 bra $L__BB0_2;
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
shr.s32 %r49, %r2, 31;
shr.u32 %r50, %r49, 29;
add.s32 %r51, %r2, %r50;
and.b32 %r52, %r51, -8;
sub.s32 %r53, %r2, %r52;
shr.s32 %r54, %r53, 31;
shr.u32 %r55, %r54, 30;
add.s32 %r56, %r53, %r55;
shr.s32 %r57, %r56, 2;
mul.lo.s32 %r58, %r31, %r57;
and.b32 %r59, %r56, -4;
sub.s32 %r60, %r53, %r59;
shl.b32 %r61, %r1, 4;
shr.s32 %r62, %r51, 3;
add.s32 %r63, %r61, %r62;
mad.lo.s32 %r64, %r32, %r60, %r58;
mad.lo.s32 %r65, %r63, %r30, %r64;
mul.wide.s32 %rd7, %r65, 4;
add.s64 %rd8, %rd4, %rd7;
mul.wide.s32 %rd9, %r3, 4;
add.s64 %rd10, %rd6, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd8];
add.f32 %f3, %f2, %f1;
add.s64 %rd11, %rd5, %rd9;
st.global.f32 [%rd11], %f3;
$L__BB0_2:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_160113std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_160113std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_160113std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_160115arrayE[];
.entry _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_1601111nvfuser_524ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE(
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_1601111nvfuser_524ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0[56],
.param .align 8 .b8 _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_1601111nvfuser_524ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_1[32]
)
{
.reg .pred %p<16>;
.reg .f32 %f<24>;
.reg .b32 %r<85>;
.reg .b64 %rd<21>;
ld.param.v2.u32 {%r45, %r46}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_1601111nvfuser_524ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+32];
ld.param.v2.u32 {%r47, %r48}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_1601111nvfuser_524ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+40];
ld.param.v2.u32 {%r49, %r50}, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_1601111nvfuser_524ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+48];
ld.param.u64 %rd4, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_1601111nvfuser_524ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_1];
ld.param.u64 %rd3, [_ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_1601111nvfuser_524ENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0];
mov.u32 %r1, %ntid.x;
add.s32 %r57, %r1, 7;
div.s32 %r2, %r57, %r1;
setp.gt.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
mov.u32 %r59, %ctaid.x;
mov.u32 %r60, %ntid.y;
mov.u32 %r61, %tid.y;
mad.lo.s32 %r3, %r60, %r59, %r61;
shr.s32 %r62, %r3, 31;
shr.u32 %r63, %r62, 29;
add.s32 %r64, %r3, %r63;
shr.s32 %r65, %r64, 3;
and.b32 %r66, %r64, -8;
sub.s32 %r67, %r3, %r66;
mov.u32 %r81, %tid.x;
mul.lo.s32 %r68, %r48, %r67;
mad.lo.s32 %r69, %r50, %r81, %r68;
mad.lo.s32 %r82, %r45, %r65, %r69;
mul.lo.s32 %r6, %r50, %r1;
cvta.to.global.u64 %rd1, %rd3;
mov.f32 %f21, 0f00000000;
mov.u32 %r83, 0;
$L__BB0_3:
.pragma "nounroll";
setp.gt.s32 %p2, %r81, 7;
setp.gt.s32 %p3, %r3, 63;
or.pred %p4, %p3, %p2;
@%p4 bra $L__BB0_5;
mul.wide.s32 %rd5, %r82, 4;
add.s64 %rd6, %rd1, %rd5;
ld.global.f32 %f10, [%rd6];
add.f32 %f21, %f21, %f10;
$L__BB0_5:
add.s32 %r82, %r82, %r6;
add.s32 %r81, %r81, %r1;
add.s32 %r83, %r83, 1;
setp.lt.s32 %p5, %r83, %r2;
@%p5 bra $L__BB0_3;
bra.uni $L__BB0_6;
$L__BB0_1:
mov.f32 %f21, 0f00000000;
$L__BB0_6:
mov.u32 %r70, %tid.z;
mov.u32 %r13, %ntid.y;
mov.u32 %r14, %tid.y;
mad.lo.s32 %r71, %r13, %r70, %r14;
mov.u32 %r15, %tid.x;
mad.lo.s32 %r16, %r71, %r1, %r15;
mul.wide.u32 %rd7, %r16, 4;
mov.u64 %rd8, _ZN59_GLOBAL__N__00000000_20___tmp_nvfuser_524_cu_37f538a6_160115arrayE;
add.s64 %rd2, %rd8, %rd7;
st.shared.f32 [%rd2], %f21;
bar.sync 0;
clz.b32 %r72, %r1;
mov.u32 %r73, 31;
sub.s32 %r74, %r73, %r72;
mov.u32 %r75, 1;
shl.b32 %r84, %r75, %r74;
setp.ge.u32 %p6, %r15, %r84;
add.s32 %r76, %r84, %r15;
setp.ge.u32 %p7, %r76, %r1;
or.pred %p8, %p6, %p7;
@%p8 bra $L__BB0_8;
add.s32 %r77, %r16, %r84;
mul.wide.s32 %rd9, %r77, 4;
add.s64 %rd11, %rd8, %rd9;
ld.shared.f32 %f11, [%rd2];
ld.shared.f32 %f12, [%rd11];
add.f32 %f13, %f12, %f11;
st.shared.f32 [%rd2], %f13;
$L__BB0_8:
bar.sync 0;
setp.lt.s32 %p9, %r84, 4;
@%p9 bra $L__BB0_12;
$L__BB0_9:
shr.u32 %r19, %r84, 1;
setp.ge.u32 %p10, %r15, %r19;
@%p10 bra $L__BB0_11;
add.s32 %r78, %r19, %r16;
mul.wide.s32 %rd12, %r78, 4;
add.s64 %rd14, %rd8, %rd12;
ld.shared.f32 %f14, [%rd2];
ld.shared.f32 %f15, [%rd14];
add.f32 %f16, %f15, %f14;
st.shared.f32 [%rd2], %f16;
$L__BB0_11:
bar.sync 0;
setp.gt.u32 %p11, %r84, 7;
mov.u32 %r84, %r19;
@%p11 bra $L__BB0_9;
$L__BB0_12:
setp.ne.s32 %p12, %r15, 0;
mov.f32 %f23, 0f00000000;
@%p12 bra $L__BB0_15;
ld.shared.f32 %f18, [%rd2];
add.f32 %f23, %f18, 0f00000000;
setp.lt.u32 %p13, %r1, 2;
@%p13 bra $L__BB0_15;
add.s32 %r79, %r16, 1;
mul.wide.u32 %rd15, %r79, 4;
add.s64 %rd17, %rd8, %rd15;
ld.shared.f32 %f19, [%rd17];
add.f32 %f23, %f23, %f19;
$L__BB0_15:
bar.sync 0;
@%p12 bra $L__BB0_18;
mov.u32 %r80, %ctaid.x;
mad.lo.s32 %r20, %r13, %r80, %r14;
setp.gt.s32 %p15, %r20, 63;
@%p15 bra $L__BB0_18;
cvta.to.global.u64 %rd18, %rd4;
mul.wide.s32 %rd19, %r20, 4;
add.s64 %rd20, %rd18, %rd19;
st.global.f32 [%rd20], %f23;
$L__BB0_18:
ret;
}
--- 53997da5d
+++ 03a1b695e
@@ -11,66 +11,164 @@
.address_size 64
.global .align 1 .u8 _ZN11kernelscope6kernel17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN11kernelscope6kernel17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN11kernelscope6kernel14__numeric_typeIvE5valueE = 1;
+.extern .shared .align 16 .b8 _ZN11kernelscope6kernelE[];
-.entry _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1_(
- .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0[40],
- .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_1[32],
- .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_2[40]
+.entry _ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE(
+ .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0[56],
+ .param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_1[32]
)
{
- .reg .pred %p<2>;
- .reg .f32 %f<4>;
- .reg .b32 %r<66>;
- .reg .b64 %rd<12>;
+ .reg .pred %p<16>;
+ .reg .f32 %f<24>;
+ .reg .b32 %r<85>;
+ .reg .b64 %rd<21>;
- ld.param.v2.u32 {%r30, %r31}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0+24];
- ld.param.v2.u32 {%r32, %r33}, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0+32];
- ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_2];
- ld.param.u64 %rd2, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_1];
- ld.param.u64 %rd1, [_ZN11kernelscope6kernelENS_6TensorIfLi4ELi4EEENS0_IfLi3ELi3EEES1__param_0];
- mov.u32 %r1, %ctaid.x;
- shl.b32 %r48, %r1, 7;
- mov.u32 %r2, %tid.x;
- add.s32 %r3, %r48, %r2;
- setp.gt.s32 %p1, %r3, 63;
+ ld.param.v2.u32 {%r45, %r46}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+32];
+ ld.param.v2.u32 {%r47, %r48}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+40];
+ ld.param.v2.u32 {%r49, %r50}, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0+48];
+ ld.param.u64 %rd4, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_1];
+ ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi6ELi6EEENS0_IfLi3ELi3EEE_param_0];
+ mov.u32 %r1, %ntid.x;
+ add.s32 %r57, %r1, 7;
+ div.s32 %r2, %r57, %r1;
+ setp.gt.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
-
- cvta.to.global.u64 %rd4, %rd1;
- cvta.to.global.u64 %rd5, %rd3;
- cvta.to.global.u64 %rd6, %rd2;
- shr.s32 %r49, %r2, 31;
- shr.u32 %r50, %r49, 29;
- add.s32 %r51, %r2, %r50;
- and.b32 %r52, %r51, -8;
- sub.s32 %r53, %r2, %r52;
- shr.s32 %r54, %r53, 31;
- shr.u32 %r55, %r54, 30;
- add.s32 %r56, %r53, %r55;
- shr.s32 %r57, %r56, 2;
- mul.lo.s32 %r58, %r31, %r57;
- and.b32 %r59, %r56, -4;
- sub.s32 %r60, %r53, %r59;
- shl.b32 %r61, %r1, 4;
- shr.s32 %r62, %r51, 3;
- add.s32 %r63, %r61, %r62;
- mad.lo.s32 %r64, %r32, %r60, %r58;
- mad.lo.s32 %r65, %r63, %r30, %r64;
- mul.wide.s32 %rd7, %r65, 4;
- add.s64 %rd8, %rd4, %rd7;
- mul.wide.s32 %rd9, %r3, 4;
- add.s64 %rd10, %rd6, %rd9;
- ld.global.f32 %f1, [%rd10];
- ld.global.f32 %f2, [%rd8];
- add.f32 %f3, %f2, %f1;
- add.s64 %rd11, %rd5, %rd9;
- st.global.f32 [%rd11], %f3;
+ bra.uni $L__BB0_1;
$L__BB0_2:
+ mov.u32 %r59, %ctaid.x;
+ mov.u32 %r60, %ntid.y;
+ mov.u32 %r61, %tid.y;
+ mad.lo.s32 %r3, %r60, %r59, %r61;
+ shr.s32 %r62, %r3, 31;
+ shr.u32 %r63, %r62, 29;
+ add.s32 %r64, %r3, %r63;
+ shr.s32 %r65, %r64, 3;
+ and.b32 %r66, %r64, -8;
+ sub.s32 %r67, %r3, %r66;
+ mov.u32 %r81, %tid.x;
+ mul.lo.s32 %r68, %r48, %r67;
+ mad.lo.s32 %r69, %r50, %r81, %r68;
+ mad.lo.s32 %r82, %r45, %r65, %r69;
+ mul.lo.s32 %r6, %r50, %r1;
+ cvta.to.global.u64 %rd1, %rd3;
+ mov.f32 %f21, 0f00000000;
+ mov.u32 %r83, 0;
+
+$L__BB0_3:
+ .pragma "nounroll";
+ setp.gt.s32 %p2, %r81, 7;
+ setp.gt.s32 %p3, %r3, 63;
+ or.pred %p4, %p3, %p2;
+ @%p4 bra $L__BB0_5;
+
+ mul.wide.s32 %rd5, %r82, 4;
+ add.s64 %rd6, %rd1, %rd5;
+ ld.global.f32 %f10, [%rd6];
+ add.f32 %f21, %f21, %f10;
+
+$L__BB0_5:
+ add.s32 %r82, %r82, %r6;
+ add.s32 %r81, %r81, %r1;
+ add.s32 %r83, %r83, 1;
+ setp.lt.s32 %p5, %r83, %r2;
+ @%p5 bra $L__BB0_3;
+ bra.uni $L__BB0_6;
+
+$L__BB0_1:
+ mov.f32 %f21, 0f00000000;
+
+$L__BB0_6:
+ mov.u32 %r70, %tid.z;
+ mov.u32 %r13, %ntid.y;
+ mov.u32 %r14, %tid.y;
+ mad.lo.s32 %r71, %r13, %r70, %r14;
+ mov.u32 %r15, %tid.x;
+ mad.lo.s32 %r16, %r71, %r1, %r15;
+ mul.wide.u32 %rd7, %r16, 4;
+ mov.u64 %rd8, _ZN11kernelscope6kernelE;
+ add.s64 %rd2, %rd8, %rd7;
+ st.shared.f32 [%rd2], %f21;
+ bar.sync 0;
+ clz.b32 %r72, %r1;
+ mov.u32 %r73, 31;
+ sub.s32 %r74, %r73, %r72;
+ mov.u32 %r75, 1;
+ shl.b32 %r84, %r75, %r74;
+ setp.ge.u32 %p6, %r15, %r84;
+ add.s32 %r76, %r84, %r15;
+ setp.ge.u32 %p7, %r76, %r1;
+ or.pred %p8, %p6, %p7;
+ @%p8 bra $L__BB0_8;
+
+ add.s32 %r77, %r16, %r84;
+ mul.wide.s32 %rd9, %r77, 4;
+ add.s64 %rd11, %rd8, %rd9;
+ ld.shared.f32 %f11, [%rd2];
+ ld.shared.f32 %f12, [%rd11];
+ add.f32 %f13, %f12, %f11;
+ st.shared.f32 [%rd2], %f13;
+
+$L__BB0_8:
+ bar.sync 0;
+ setp.lt.s32 %p9, %r84, 4;
+ @%p9 bra $L__BB0_12;
+
+$L__BB0_9:
+ shr.u32 %r19, %r84, 1;
+ setp.ge.u32 %p10, %r15, %r19;
+ @%p10 bra $L__BB0_11;
+
+ add.s32 %r78, %r19, %r16;
+ mul.wide.s32 %rd12, %r78, 4;
+ add.s64 %rd14, %rd8, %rd12;
+ ld.shared.f32 %f14, [%rd2];
+ ld.shared.f32 %f15, [%rd14];
+ add.f32 %f16, %f15, %f14;
+ st.shared.f32 [%rd2], %f16;
+
+$L__BB0_11:
+ bar.sync 0;
+ setp.gt.u32 %p11, %r84, 7;
+ mov.u32 %r84, %r19;
+ @%p11 bra $L__BB0_9;
+
+$L__BB0_12:
+ setp.ne.s32 %p12, %r15, 0;
+ mov.f32 %f23, 0f00000000;
+ @%p12 bra $L__BB0_15;
+
+ ld.shared.f32 %f18, [%rd2];
+ add.f32 %f23, %f18, 0f00000000;
+ setp.lt.u32 %p13, %r1, 2;
+ @%p13 bra $L__BB0_15;
+
+ add.s32 %r79, %r16, 1;
+ mul.wide.u32 %rd15, %r79, 4;
+ add.s64 %rd17, %rd8, %rd15;
+ ld.shared.f32 %f19, [%rd17];
+ add.f32 %f23, %f23, %f19;
+
+$L__BB0_15:
+ bar.sync 0;
+ @%p12 bra $L__BB0_18;
+
+ mov.u32 %r80, %ctaid.x;
+ mad.lo.s32 %r20, %r13, %r80, %r14;
+ setp.gt.s32 %p15, %r20, 63;
+ @%p15 bra $L__BB0_18;
+
+ cvta.to.global.u64 %rd18, %rd4;
+ mul.wide.s32 %rd19, %r20, 4;
+ add.s64 %rd20, %rd18, %rd19;
+ st.global.f32 [%rd20], %f23;
+
+$L__BB0_18:
ret;
}